{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Conversión de formatos usando Pandas --- 11:28 min\n", "=======\n", "\n", "* 11:28 min | Última modificación: Octubre 13, 2021 | [YouTube](https://youtu.be/qKoJMhzz2Co)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Una tarea común que se da en la práctica es convertir una tabla de datos de un formato a otro. Pandas posee herramientas que permiten realizar esta conversión de forma directa." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Al finalizar el documento, usted estará en capacidad de leer y escribir tablas en los siguientes formatos:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* CSV.\n", "\n", "* JSON.\n", "\n", "* Pickle.\n", "\n", "* Ancho fijo.\n", "\n", "* HTML.\n", "\n", "* Microsoft Excel.\n", "\n", "* HDF5.\n", "\n", "* Stata.\n", "\n", "* Matlab y Octave." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparación" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.notebook_repr_html\", False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creación de una tabla de ejemplo" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"index\": list(range(1, 6)),\n", " \"name\": [\"A\", \"B\", \"C\", \"D\", \"E\"],\n", " \"value\": [3.03, 5.14, 0.40, 1.13, 8.25],\n", " }\n", ")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Almacenamiento del archivo en disco" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index,name,value\n", "1,A,3.03\n", "2,B,5.14\n", "3,C,0.4\n", "4,D,1.13\n", "5,E,8.25\n" ] } ], "source": [ "df.to_csv(\n", " \"/tmp/data.csv\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Conversión de CSV a JSON" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"index\":{\n", " \"0\":1,\n", " \"1\":2,\n", " \"2\":3,\n", " \"3\":4,\n", " \"4\":5\n", " },\n", " \"name\":{\n", " \"0\":\"A\",\n", " \"1\":\"B\",\n", " \"2\":\"C\",\n", " \"3\":\"D\",\n", " \"4\":\"E\"\n", " },\n", " \"value\":{\n", " \"0\":3.03,\n", " \"1\":5.14,\n", " \"2\":0.4,\n", " \"3\":1.13,\n", " \"4\":8.25\n", " }\n", "}" ] } ], "source": [ "#\n", "# Conversión a JSON con la orientación por\n", "# defecto ('columns')\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"0\":{\n", " \"index\":1,\n", " \"name\":\"A\",\n", " \"value\":3.03\n", " },\n", " \"1\":{\n", " \"index\":2,\n", " \"name\":\"B\",\n", " \"value\":5.14\n", " },\n", " \"2\":{\n", " \"index\":3,\n", " \"name\":\"C\",\n", " \"value\":0.4\n", " },\n", " \"3\":{\n", " \"index\":4,\n", " \"name\":\"D\",\n", " \"value\":1.13\n", " },\n", " \"4\":{\n", " \"index\":5,\n", " \"name\":\"E\",\n", " \"value\":8.25\n", " }\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'index'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='index',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"columns\":[\n", " \"index\",\n", " \"name\",\n", " \"value\"\n", " ],\n", " \"index\":[\n", " 0,\n", " 1,\n", " 2,\n", " 3,\n", " 4\n", " ],\n", " \"data\":[\n", " [\n", " 1,\n", " \"A\",\n", " 3.03\n", " ],\n", " [\n", " 2,\n", " \"B\",\n", " 5.14\n", " ],\n", " [\n", " 3,\n", " \"C\",\n", " 0.4\n", " ],\n", " [\n", " 4,\n", " \"D\",\n", " 1.13\n", " ],\n", " [\n", " 5,\n", " \"E\",\n", " 8.25\n", " ]\n", " ]\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'split'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='split',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\n", " {\n", " \"index\":1,\n", " \"name\":\"A\",\n", " \"value\":3.03\n", " },\n", " {\n", " \"index\":2,\n", " \"name\":\"B\",\n", " \"value\":5.14\n", " },\n", " {\n", " \"index\":3,\n", " \"name\":\"C\",\n", " \"value\":0.4\n", " },\n", " {\n", " \"index\":4,\n", " \"name\":\"D\",\n", " \"value\":1.13\n", " },\n", " {\n", " \"index\":5,\n", " \"name\":\"E\",\n", " \"value\":8.25\n", " }\n", "]" ] } ], "source": [ "#\n", "# Orientación: 'records'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='records',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"index\":{\n", " \"0\":1,\n", " \"1\":2,\n", " \"2\":3,\n", " \"3\":4,\n", " \"4\":5\n", " },\n", " \"name\":{\n", " \"0\":\"A\",\n", " \"1\":\"B\",\n", " \"2\":\"C\",\n", " \"3\":\"D\",\n", " \"4\":\"E\"\n", " },\n", " \"value\":{\n", " \"0\":3.03,\n", " \"1\":5.14,\n", " \"2\":0.4,\n", " \"3\":1.13,\n", " \"4\":8.25\n", " }\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'columns'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='columns',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\n", " [\n", " 1,\n", " \"A\",\n", " 3.03\n", " ],\n", " [\n", " 2,\n", " \"B\",\n", " 5.14\n", " ],\n", " [\n", " 3,\n", " \"C\",\n", " 0.4\n", " ],\n", " [\n", " 4,\n", " \"D\",\n", " 1.13\n", " ],\n", " [\n", " 5,\n", " \"E\",\n", " 8.25\n", " ]\n", "]" ] } ], "source": [ "#\n", "# Orientación: 'values'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='values',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"schema\":{\n", " \"fields\":[\n", " {\n", " \"name\":\"index\",\n", " \"type\":\"integer\"\n", " },\n", " {\n", " \"name\":\"index\",\n", " \"type\":\"integer\"\n", " },\n", " {\n", " \"name\":\"name\",\n", " \"type\":\"string\"\n", " },\n", " {\n", " \"name\":\"value\",\n", " \"type\":\"number\"\n", " }\n", " ],\n", " \"primaryKey\":[\n", " \"index\"\n", " ],\n", " \"pandas_version\":\"0.20.0\"\n", " },\n", " \"data\":[\n", " {\n", " \"level_0\":0,\n", " \"index\":1,\n", " \"name\":\"A\",\n", " \"value\":3.03\n", " },\n", " {\n", " \"level_0\":1,\n", " \"index\":2,\n", " \"name\":\"B\",\n", " \"value\":5.14\n", " },\n", " {\n", " \"level_0\":2,\n", " \"index\":3,\n", " \"name\":\"C\",\n", " \"value\":0.4\n", " },\n", " {\n", " \"level_0\":3,\n", " \"index\":4,\n", " \"name\":\"D\",\n", " \"value\":1.13\n", " },\n", " {\n", " \"level_0\":4,\n", " \"index\":5,\n", " \"name\":\"E\",\n", " \"value\":8.25\n", " }\n", " ]\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'table'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='table',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Formato nativo de Python" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Escribe el dataset en formato binario\n", "#\n", "df.to_pickle('/tmp/data.pickle')\n", "\n", "#\n", "# Lee el archivo en formato binario\n", "#\n", "pd.read_pickle('/tmp/data.pickle')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Archivos delimitados por caracteres con Pandas" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index,name,value\n", "1,A,3.03\n", "2,B,5.14\n", "3,C,0.4\n", "4,D,1.13\n", "5,E,8.25\n" ] } ], "source": [ "#\n", "# Formato por defecto generado por pandas\n", "#\n", "df.to_csv(\n", " \"/tmp/data.csv\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.csv" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index;name;value\n", "1;A;3,03\n", "2;B;5,14\n", "3;C;0,4\n", "4;D;1,13\n", "5;E;8,25\n" ] } ], "source": [ "#\n", "# Escritura en formato de Español que es usado\n", "# por defecto en Microsoft Excel\n", "#\n", "df.to_csv(\n", " \"/tmp/data.csv2\",\n", " sep=\";\",\n", " decimal=\",\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.csv2" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Lectura del archivo\n", "#\n", "pd.read_csv(\n", " \"/tmp/data.csv2\",\n", " sep=\";\",\n", " thousands=None,\n", " decimal=\",\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Formato de ancho fijo con Pandas" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/data.txt\n" ] } ], "source": [ "%%writefile /tmp/data.txt\n", "indexnames valuescodes\n", " 1john wick 2.13 10\n", " 2mark twin 3.14 11\n", " 3louis ng 4.34 12\n", " 4dan brown 2.31 13\n", " 5ann marie 4.98 14" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index names values codes\n", "0 1 john wick 2.13 10\n", "1 2 mark twin 3.14 11\n", "2 3 louis ng 4.34 12\n", "3 4 dan brown 2.31 13\n", "4 5 ann marie 4.98 14" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Se debe especificar el ancho de cada columna\n", "#\n", "pd.read_fwf(\n", " \"/tmp/data.txt\",\n", " colspecs=\"infer\",\n", " widths=[5, 9, 8, 5],\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## HTML con Pandas" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "
index | \n", "name | \n", "value | \n", "
---|---|---|
1 | \n", "A | \n", "3.03 | \n", "
2 | \n", "B | \n", "5.14 | \n", "
3 | \n", "C | \n", "0.40 | \n", "
4 | \n", "D | \n", "1.13 | \n", "
5 | \n", "E | \n", "8.25 | \n", "