{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Conversión de formatos usando Pandas --- 11:28 min\n", "=======\n", "\n", "* 11:28 min | Última modificación: Octubre 13, 2021 | [YouTube](https://youtu.be/qKoJMhzz2Co)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Una tarea común que se da en la práctica es convertir una tabla de datos de un formato a otro. Pandas posee herramientas que permiten realizar esta conversión de forma directa." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Al finalizar el documento, usted estará en capacidad de leer y escribir tablas en los siguientes formatos:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* CSV.\n", "\n", "* JSON.\n", "\n", "* Pickle.\n", "\n", "* Ancho fijo.\n", "\n", "* HTML.\n", "\n", "* Microsoft Excel.\n", "\n", "* HDF5.\n", "\n", "* Stata.\n", "\n", "* Matlab y Octave." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparación" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.notebook_repr_html\", False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creación de una tabla de ejemplo" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"index\": list(range(1, 6)),\n", " \"name\": [\"A\", \"B\", \"C\", \"D\", \"E\"],\n", " \"value\": [3.03, 5.14, 0.40, 1.13, 8.25],\n", " }\n", ")\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Almacenamiento del archivo en disco" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index,name,value\n", "1,A,3.03\n", "2,B,5.14\n", "3,C,0.4\n", "4,D,1.13\n", "5,E,8.25\n" ] } ], "source": [ "df.to_csv(\n", " \"/tmp/data.csv\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Conversión de CSV a JSON" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"index\":{\n", " \"0\":1,\n", " \"1\":2,\n", " \"2\":3,\n", " \"3\":4,\n", " \"4\":5\n", " },\n", " \"name\":{\n", " \"0\":\"A\",\n", " \"1\":\"B\",\n", " \"2\":\"C\",\n", " \"3\":\"D\",\n", " \"4\":\"E\"\n", " },\n", " \"value\":{\n", " \"0\":3.03,\n", " \"1\":5.14,\n", " \"2\":0.4,\n", " \"3\":1.13,\n", " \"4\":8.25\n", " }\n", "}" ] } ], "source": [ "#\n", "# Conversión a JSON con la orientación por\n", "# defecto ('columns')\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"0\":{\n", " \"index\":1,\n", " \"name\":\"A\",\n", " \"value\":3.03\n", " },\n", " \"1\":{\n", " \"index\":2,\n", " \"name\":\"B\",\n", " \"value\":5.14\n", " },\n", " \"2\":{\n", " \"index\":3,\n", " \"name\":\"C\",\n", " \"value\":0.4\n", " },\n", " \"3\":{\n", " \"index\":4,\n", " \"name\":\"D\",\n", " \"value\":1.13\n", " },\n", " \"4\":{\n", " \"index\":5,\n", " \"name\":\"E\",\n", " \"value\":8.25\n", " }\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'index'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='index',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"columns\":[\n", " \"index\",\n", " \"name\",\n", " \"value\"\n", " ],\n", " \"index\":[\n", " 0,\n", " 1,\n", " 2,\n", " 3,\n", " 4\n", " ],\n", " \"data\":[\n", " [\n", " 1,\n", " \"A\",\n", " 3.03\n", " ],\n", " [\n", " 2,\n", " \"B\",\n", " 5.14\n", " ],\n", " [\n", " 3,\n", " \"C\",\n", " 0.4\n", " ],\n", " [\n", " 4,\n", " \"D\",\n", " 1.13\n", " ],\n", " [\n", " 5,\n", " \"E\",\n", " 8.25\n", " ]\n", " ]\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'split'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='split',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\n", " {\n", " \"index\":1,\n", " \"name\":\"A\",\n", " \"value\":3.03\n", " },\n", " {\n", " \"index\":2,\n", " \"name\":\"B\",\n", " \"value\":5.14\n", " },\n", " {\n", " \"index\":3,\n", " \"name\":\"C\",\n", " \"value\":0.4\n", " },\n", " {\n", " \"index\":4,\n", " \"name\":\"D\",\n", " \"value\":1.13\n", " },\n", " {\n", " \"index\":5,\n", " \"name\":\"E\",\n", " \"value\":8.25\n", " }\n", "]" ] } ], "source": [ "#\n", "# Orientación: 'records'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='records',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"index\":{\n", " \"0\":1,\n", " \"1\":2,\n", " \"2\":3,\n", " \"3\":4,\n", " \"4\":5\n", " },\n", " \"name\":{\n", " \"0\":\"A\",\n", " \"1\":\"B\",\n", " \"2\":\"C\",\n", " \"3\":\"D\",\n", " \"4\":\"E\"\n", " },\n", " \"value\":{\n", " \"0\":3.03,\n", " \"1\":5.14,\n", " \"2\":0.4,\n", " \"3\":1.13,\n", " \"4\":8.25\n", " }\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'columns'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='columns',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[\n", " [\n", " 1,\n", " \"A\",\n", " 3.03\n", " ],\n", " [\n", " 2,\n", " \"B\",\n", " 5.14\n", " ],\n", " [\n", " 3,\n", " \"C\",\n", " 0.4\n", " ],\n", " [\n", " 4,\n", " \"D\",\n", " 1.13\n", " ],\n", " [\n", " 5,\n", " \"E\",\n", " 8.25\n", " ]\n", "]" ] } ], "source": [ "#\n", "# Orientación: 'values'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='values',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " \"schema\":{\n", " \"fields\":[\n", " {\n", " \"name\":\"index\",\n", " \"type\":\"integer\"\n", " },\n", " {\n", " \"name\":\"index\",\n", " \"type\":\"integer\"\n", " },\n", " {\n", " \"name\":\"name\",\n", " \"type\":\"string\"\n", " },\n", " {\n", " \"name\":\"value\",\n", " \"type\":\"number\"\n", " }\n", " ],\n", " \"primaryKey\":[\n", " \"index\"\n", " ],\n", " \"pandas_version\":\"0.20.0\"\n", " },\n", " \"data\":[\n", " {\n", " \"level_0\":0,\n", " \"index\":1,\n", " \"name\":\"A\",\n", " \"value\":3.03\n", " },\n", " {\n", " \"level_0\":1,\n", " \"index\":2,\n", " \"name\":\"B\",\n", " \"value\":5.14\n", " },\n", " {\n", " \"level_0\":2,\n", " \"index\":3,\n", " \"name\":\"C\",\n", " \"value\":0.4\n", " },\n", " {\n", " \"level_0\":3,\n", " \"index\":4,\n", " \"name\":\"D\",\n", " \"value\":1.13\n", " },\n", " {\n", " \"level_0\":4,\n", " \"index\":5,\n", " \"name\":\"E\",\n", " \"value\":8.25\n", " }\n", " ]\n", "}" ] } ], "source": [ "#\n", "# Orientación: 'table'\n", "#\n", "pd.read_csv(\"/tmp/data.csv\").to_json(\n", " \"/tmp/data.json\",\n", " indent=True,\n", " orient='table',\n", ")\n", "\n", "!cat /tmp/data.json" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Formato nativo de Python" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Escribe el dataset en formato binario\n", "#\n", "df.to_pickle('/tmp/data.pickle')\n", "\n", "#\n", "# Lee el archivo en formato binario\n", "#\n", "pd.read_pickle('/tmp/data.pickle')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Archivos delimitados por caracteres con Pandas" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index,name,value\n", "1,A,3.03\n", "2,B,5.14\n", "3,C,0.4\n", "4,D,1.13\n", "5,E,8.25\n" ] } ], "source": [ "#\n", "# Formato por defecto generado por pandas\n", "#\n", "df.to_csv(\n", " \"/tmp/data.csv\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.csv" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "index;name;value\n", "1;A;3,03\n", "2;B;5,14\n", "3;C;0,4\n", "4;D;1,13\n", "5;E;8,25\n" ] } ], "source": [ "#\n", "# Escritura en formato de Español que es usado\n", "# por defecto en Microsoft Excel\n", "#\n", "df.to_csv(\n", " \"/tmp/data.csv2\",\n", " sep=\";\",\n", " decimal=\",\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.csv2" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Lectura del archivo\n", "#\n", "pd.read_csv(\n", " \"/tmp/data.csv2\",\n", " sep=\";\",\n", " thousands=None,\n", " decimal=\",\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Formato de ancho fijo con Pandas" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/data.txt\n" ] } ], "source": [ "%%writefile /tmp/data.txt\n", "indexnames valuescodes\n", " 1john wick 2.13 10\n", " 2mark twin 3.14 11\n", " 3louis ng 4.34 12\n", " 4dan brown 2.31 13\n", " 5ann marie 4.98 14" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index names values codes\n", "0 1 john wick 2.13 10\n", "1 2 mark twin 3.14 11\n", "2 3 louis ng 4.34 12\n", "3 4 dan brown 2.31 13\n", "4 5 ann marie 4.98 14" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Se debe especificar el ancho de cada columna\n", "#\n", "pd.read_fwf(\n", " \"/tmp/data.txt\",\n", " colspecs=\"infer\",\n", " widths=[5, 9, 8, 5],\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## HTML con Pandas" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexnamevalue
1A3.03
2B5.14
3C0.40
4D1.13
5E8.25
" ] } ], "source": [ "#\n", "# Escritura en HTML. Este formato es util para\n", "# insertar el dataframe en una pagina web\n", "#\n", "df.to_html(\n", " \"/tmp/data.html\",\n", " index=False,\n", ")\n", "\n", "!cat /tmp/data.html" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Microsoft Excel" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n" ] } ], "source": [ "#\n", "# Requiere esta librería para realizar la \n", "# lectura y escritura de archivos de Excel\n", "#\n", "!pip3 install --quiet openpyxl" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Exportación del df a formato de Excel\n", "#\n", "df.to_excel(\n", " \"/tmp/data.xlsx\",\n", " index=False,\n", ")\n", "\n", "#\n", "# Lectura del archivo de Excel\n", "#\n", "pd.read_excel(\n", " '/tmp/data.xlsx',\n", " engine='openpyxl',\n", ")" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Formato HDF5" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n" ] } ], "source": [ "#\n", "# Se requiere esta librería\n", "#\n", "!pip3 install --quiet tables" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Exportación a formato HDF5\n", "#\n", "df.to_hdf(\n", " \"/tmp/data.h5\",\n", " key=\"G1\",\n", ")\n", "\n", "pd.read_hdf(\n", " \"/tmp/data.h5\",\n", " key=\"G1\",\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " index name value\n", "0 1 A 3.03\n", "1 2 B 5.14\n", "2 3 C 0.40\n", "3 4 D 1.13\n", "4 5 E 8.25" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.read_hdf('/tmp/data.h5')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STATA" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " level_0 index name value\n", "0 0 1 A 3.03\n", "1 1 2 B 5.14\n", "2 2 3 C 0.40\n", "3 3 4 D 1.13\n", "4 4 5 E 8.25" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Exporta el archivo\n", "#\n", "df.to_stata('/tmp/data.dta')\n", "\n", "#\n", "# Lee el archivo\n", "#\n", "pd.read_stata('/tmp/data.dta')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lectura de Matlab y Octave" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Sun Aug 29 04:14:07 2021',\n", " '__version__': '1.0',\n", " '__globals__': [],\n", " 'df': array([[(array([[None]], dtype=object), array([[None]], dtype=object), array([[None]], dtype=object))]],\n", " dtype=[('index', 'O'), ('name', 'O'), ('value', 'O')])}" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import scipy.io as sio\n", "\n", "sio.savemat(\"/tmp/data\", {\"df\": df})\n", "sio.loadmat(\"/tmp/data\")" ] } ], "metadata": { "anaconda-cloud": {}, "kernel_info": { "name": "python3" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "nteract": { "version": "0.7.1" } }, "nbformat": 4, "nbformat_minor": 4 }