{ "cells": [ { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "Redimensionamiento de tablas --- 10:31 min\n", "=======\n", "\n", "* 10:31 min | Última modificación: Octubre 6, 2021 " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "En la preparación de datos usualmente es necesario realizar transformaciones que modifican la estructura de una tabla, con el fin de transformar los datos a un formato que pueda ser usado para proveer insights." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Al finalizar el documento, usted estará en capacidad de transformar una tabla usando los operadores:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Melt & Pivot.\n", "\n", "* Stack & Unstack.\n", "\n", "* Tablas dinámicas." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparación" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.notebook_repr_html\", False)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Sepal_Length Sepal_Width Petal_Length Petal_Width Species\n", "0 5.1 3.5 1.4 0.2 setosa\n", "1 4.9 3.0 1.4 0.2 setosa\n", "2 4.7 3.2 1.3 0.2 setosa\n", "3 4.6 3.1 1.5 0.2 setosa\n", "4 5.0 3.6 1.4 0.2 setosa" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris = pd.read_csv(\n", " \"https://raw.githubusercontent.com/jdvelasq/datalabs/master/datasets/iris.csv\",\n", " sep=\",\",\n", " thousands=None,\n", " decimal=\".\",\n", ")\n", "\n", "iris.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Melt & Pivot" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Sepal_Length Sepal_Width Petal_Length Petal_Width Species id\n", "0 5.1 3.5 1.4 0.2 setosa 0\n", "1 4.9 3.0 1.4 0.2 setosa 1\n", "2 4.7 3.2 1.3 0.2 setosa 2\n", "3 4.6 3.1 1.5 0.2 setosa 3\n", "4 5.0 3.6 1.4 0.2 setosa 4" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Agrega una clave para identificar cada caso\n", "#\n", "iris[\"id\"] = list(range(150))\n", "iris.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " id Variables Values\n", "0 0 Sepal_Length 5.1\n", "1 1 Sepal_Length 4.9\n", "2 2 Sepal_Length 4.7\n", "3 3 Sepal_Length 4.6\n", "4 4 Sepal_Length 5\n", ".. ... ... ...\n", "745 145 Species virginica\n", "746 146 Species virginica\n", "747 147 Species virginica\n", "748 148 Species virginica\n", "749 149 Species virginica\n", "\n", "[750 rows x 3 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_melt = pd.melt(\n", " iris,\n", " id_vars=\"id\",\n", " var_name=\"Variables\",\n", " value_name=\"Values\",\n", ")\n", "\n", "iris_melt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Variables Petal_Length Petal_Width Sepal_Length Sepal_Width Species\n", "id \n", "0 1.4 0.2 5.1 3.5 setosa\n", "1 1.4 0.2 4.9 3 setosa\n", "2 1.3 0.2 4.7 3.2 setosa\n", "3 1.5 0.2 4.6 3.1 setosa\n", "4 1.4 0.2 5 3.6 setosa" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris_melt.pivot(\n", " index=\"id\",\n", " columns=\"Variables\",\n", " values=\"Values\",\n", ").head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## pivot_table()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Species\n", "setosa 1.462\n", "versicolor 4.260\n", "virginica 5.552\n", "Name: Petal_Length, dtype: float64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Cálculo de la media de la columna 'Petal_Length'\n", "# por cada categoría de la columna 'Species'\n", "#\n", "iris.groupby(\"Species\")[\"Petal_Length\"].mean()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Petal_Length\n", "Species \n", "setosa 1.462\n", "versicolor 4.260\n", "virginica 5.552" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Cálculo equivalente usando pivot_table()\n", "#\n", "iris.pivot_table(values=\"Petal_Length\", index=\"Species\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Petal_Length\n", "Species \n", "setosa 1.50\n", "versicolor 4.35\n", "virginica 5.55" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.pivot_table(\n", " values=\"Petal_Length\",\n", " index=\"Species\",\n", " aggfunc=np.median,\n", ")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " amax amin\n", " Petal_Length Petal_Length\n", "Species \n", "setosa 1.9 1.0\n", "versicolor 5.1 3.0\n", "virginica 6.9 4.5" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.pivot_table(\n", " values=\"Petal_Length\",\n", " index=\"Species\",\n", " aggfunc=[np.max, np.min],\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sepal_Length 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0 5.1 \\\n", "Species \n", "setosa 1.1 1.333333 1.3 1.325 1.45 1.58 1.45 1.45 1.5625 \n", "versicolor NaN NaN NaN NaN NaN NaN 3.30 3.40 3.0000 \n", "virginica NaN NaN NaN NaN NaN NaN 4.50 NaN NaN \n", "\n", "Sepal_Length 5.2 ... 6.8 6.9 7.0 7.1 7.2 7.3 7.4 7.6 7.7 \\\n", "Species ... \n", "setosa 1.466667 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "versicolor 3.900000 ... 4.8 4.9 4.7 NaN NaN NaN NaN NaN NaN \n", "virginica NaN ... 5.7 5.4 NaN 5.9 5.966667 6.3 6.1 6.6 6.6 \n", "\n", "Sepal_Length 7.9 \n", "Species \n", "setosa NaN \n", "versicolor NaN \n", "virginica 6.4 \n", "\n", "[3 rows x 35 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.pivot_table(\n", " values=\"Petal_Length\",\n", " index=\"Species\",\n", " columns=\"Sepal_Length\",\n", ")" ] }, { "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Stack & Unstack" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Sepal_Length 5.1\n", " Sepal_Width 3.5\n", " Petal_Length 1.4\n", " Petal_Width 0.2\n", " Species setosa\n", " id 0\n", "1 Sepal_Length 4.9\n", " Sepal_Width 3\n", " Petal_Length 1.4\n", " Petal_Width 0.2\n", " Species setosa\n", " id 1\n", "2 Sepal_Length 4.7\n", " Sepal_Width 3.2\n", " Petal_Length 1.3\n", " Petal_Width 0.2\n", " Species setosa\n", " id 2\n", "3 Sepal_Length 4.6\n", " Sepal_Width 3.1\n", " Petal_Length 1.5\n", " Petal_Width 0.2\n", " Species setosa\n", " id 3\n", "dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Retorna un vector con un índice compuesto por\n", "# cada registro del dataframe\n", "#\n", "iris.stack().head(24)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " Sepal_Length Sepal_Width Petal_Length Petal_Width Species id\n", "0 5.1 3.5 1.4 0.2 setosa 0\n", "1 4.9 3 1.4 0.2 setosa 1\n", "2 4.7 3.2 1.3 0.2 setosa 2\n", "3 4.6 3.1 1.5 0.2 setosa 3\n", "4 5 3.6 1.4 0.2 setosa 4" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "iris.stack().unstack().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tablas dinámicas" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " key1 key2 values1 values2\n", "0 a A 1 7\n", "1 a B 2 8\n", "2 b A 3 9\n", "3 b B 4 10\n", "4 c A 5 11\n", "5 c B 6 12" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"key1\": [\"a\", \"a\", \"b\", \"b\", \"c\", \"c\"],\n", " \"key2\": [\"A\", \"B\", \"A\", \"B\", \"A\", \"B\"],\n", " \"values1\": [1, 2, 3, 4, 5, 6],\n", " \"values2\": [7, 8, 9, 10, 11, 12],\n", " }\n", ")\n", "df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " values1 values2\n", "key1 key2 \n", "a A 1 7\n", " B 2 8\n", "b A 3 9\n", " B 4 10\n", "c A 5 11\n", " B 6 12" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.pivot_table(\n", " df,\n", " index=[\"key1\", \"key2\"],\n", " values=[\"values1\", \"values2\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ " values1 values2\n", "key2 key1 \n", "A a 1 7\n", " b 3 9\n", " c 5 11\n", "B a 2 8\n", " b 4 10\n", " c 6 12" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.pivot_table(\n", " df,\n", " index=[\"key2\", \"key1\"],\n", " values=[\"values1\", \"values2\"],\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Método pivot()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " foo bar baz zoo\n", "0 one A 1 x\n", "1 one B 2 y\n", "2 one C 3 z\n", "3 two A 4 q\n", "4 two B 5 w\n", "5 two C 6 t" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " {\n", " \"foo\": [\"one\", \"one\", \"one\", \"two\", \"two\", \"two\"],\n", " \"bar\": [\"A\", \"B\", \"C\", \"A\", \"B\", \"C\"],\n", " \"baz\": [1, 2, 3, 4, 5, 6],\n", " \"zoo\": [\"x\", \"y\", \"z\", \"q\", \"w\", \"t\"],\n", " }\n", ")\n", "df" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "bar A B C\n", "foo \n", "one 1 2 3\n", "two 4 5 6" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.pivot(\n", " index=\"foo\",\n", " columns=\"bar\",\n", " values=\"baz\",\n", ")" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " baz zoo \n", "bar A B C A B C\n", "foo \n", "one 1 2 3 x y z\n", "two 4 5 6 q w t" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.pivot(\n", " index=\"foo\",\n", " columns=\"bar\",\n", ")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " baz zoo \n", "bar A B C A B C\n", "foo \n", "one 1 2 3 x y z\n", "two 4 5 6 q w t" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.pivot(\n", " index=\"foo\",\n", " columns=\"bar\",\n", " values=[\"baz\", \"zoo\"],\n", ")" ] } ], "metadata": { "anaconda-cloud": {}, "kernel_info": { "name": "python3" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" }, "nteract": { "version": "0.7.1" } }, "nbformat": 4, "nbformat_minor": 4 }