Redimensionamiento de tablas — 10:31 min
10:31 min | Última modificación: Octubre 6, 2021
En la preparación de datos usualmente es necesario realizar transformaciones que modifican la estructura de una tabla, con el fin de transformar los datos a un formato que pueda ser usado para proveer insights.
Al finalizar el documento, usted estará en capacidad de transformar una tabla usando los operadores:
Melt & Pivot.
Stack & Unstack.
Tablas dinámicas.
Preparación
[1]:
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
[2]:
iris = pd.read_csv(
"https://raw.githubusercontent.com/jdvelasq/datalabs/master/datasets/iris.csv",
sep=",",
thousands=None,
decimal=".",
)
iris.head()
[2]:
Sepal_Length Sepal_Width Petal_Length Petal_Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
Melt & Pivot
[3]:
#
# Agrega una clave para identificar cada caso
#
iris["id"] = list(range(150))
iris.head()
[3]:
Sepal_Length Sepal_Width Petal_Length Petal_Width Species id
0 5.1 3.5 1.4 0.2 setosa 0
1 4.9 3.0 1.4 0.2 setosa 1
2 4.7 3.2 1.3 0.2 setosa 2
3 4.6 3.1 1.5 0.2 setosa 3
4 5.0 3.6 1.4 0.2 setosa 4
[4]:
iris_melt = pd.melt(
iris,
id_vars="id",
var_name="Variables",
value_name="Values",
)
iris_melt
[4]:
id Variables Values
0 0 Sepal_Length 5.1
1 1 Sepal_Length 4.9
2 2 Sepal_Length 4.7
3 3 Sepal_Length 4.6
4 4 Sepal_Length 5
.. ... ... ...
745 145 Species virginica
746 146 Species virginica
747 147 Species virginica
748 148 Species virginica
749 149 Species virginica
[750 rows x 3 columns]
[5]:
iris_melt.pivot(
index="id",
columns="Variables",
values="Values",
).head()
[5]:
Variables Petal_Length Petal_Width Sepal_Length Sepal_Width Species
id
0 1.4 0.2 5.1 3.5 setosa
1 1.4 0.2 4.9 3 setosa
2 1.3 0.2 4.7 3.2 setosa
3 1.5 0.2 4.6 3.1 setosa
4 1.4 0.2 5 3.6 setosa
pivot_table()
[6]:
#
# Cálculo de la media de la columna 'Petal_Length'
# por cada categoría de la columna 'Species'
#
iris.groupby("Species")["Petal_Length"].mean()
[6]:
Species
setosa 1.462
versicolor 4.260
virginica 5.552
Name: Petal_Length, dtype: float64
[7]:
#
# Cálculo equivalente usando pivot_table()
#
iris.pivot_table(values="Petal_Length", index="Species")
[7]:
Petal_Length
Species
setosa 1.462
versicolor 4.260
virginica 5.552
[8]:
iris.pivot_table(
values="Petal_Length",
index="Species",
aggfunc=np.median,
)
[8]:
Petal_Length
Species
setosa 1.50
versicolor 4.35
virginica 5.55
[9]:
iris.pivot_table(
values="Petal_Length",
index="Species",
aggfunc=[np.max, np.min],
)
[9]:
amax amin
Petal_Length Petal_Length
Species
setosa 1.9 1.0
versicolor 5.1 3.0
virginica 6.9 4.5
[10]:
iris.pivot_table(
values="Petal_Length",
index="Species",
columns="Sepal_Length",
)
[10]:
Sepal_Length 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0 5.1 \
Species
setosa 1.1 1.333333 1.3 1.325 1.45 1.58 1.45 1.45 1.5625
versicolor NaN NaN NaN NaN NaN NaN 3.30 3.40 3.0000
virginica NaN NaN NaN NaN NaN NaN 4.50 NaN NaN
Sepal_Length 5.2 ... 6.8 6.9 7.0 7.1 7.2 7.3 7.4 7.6 7.7 \
Species ...
setosa 1.466667 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN
versicolor 3.900000 ... 4.8 4.9 4.7 NaN NaN NaN NaN NaN NaN
virginica NaN ... 5.7 5.4 NaN 5.9 5.966667 6.3 6.1 6.6 6.6
Sepal_Length 7.9
Species
setosa NaN
versicolor NaN
virginica 6.4
[3 rows x 35 columns]
## Stack & Unstack
[11]:
#
# Retorna un vector con un índice compuesto por
# cada registro del dataframe
#
iris.stack().head(24)
[11]:
0 Sepal_Length 5.1
Sepal_Width 3.5
Petal_Length 1.4
Petal_Width 0.2
Species setosa
id 0
1 Sepal_Length 4.9
Sepal_Width 3
Petal_Length 1.4
Petal_Width 0.2
Species setosa
id 1
2 Sepal_Length 4.7
Sepal_Width 3.2
Petal_Length 1.3
Petal_Width 0.2
Species setosa
id 2
3 Sepal_Length 4.6
Sepal_Width 3.1
Petal_Length 1.5
Petal_Width 0.2
Species setosa
id 3
dtype: object
[12]:
iris.stack().unstack().head()
[12]:
Sepal_Length Sepal_Width Petal_Length Petal_Width Species id
0 5.1 3.5 1.4 0.2 setosa 0
1 4.9 3 1.4 0.2 setosa 1
2 4.7 3.2 1.3 0.2 setosa 2
3 4.6 3.1 1.5 0.2 setosa 3
4 5 3.6 1.4 0.2 setosa 4
Tablas dinámicas
[13]:
df = pd.DataFrame(
{
"key1": ["a", "a", "b", "b", "c", "c"],
"key2": ["A", "B", "A", "B", "A", "B"],
"values1": [1, 2, 3, 4, 5, 6],
"values2": [7, 8, 9, 10, 11, 12],
}
)
df
[13]:
key1 key2 values1 values2
0 a A 1 7
1 a B 2 8
2 b A 3 9
3 b B 4 10
4 c A 5 11
5 c B 6 12
[14]:
pd.pivot_table(
df,
index=["key1", "key2"],
values=["values1", "values2"],
)
[14]:
values1 values2
key1 key2
a A 1 7
B 2 8
b A 3 9
B 4 10
c A 5 11
B 6 12
[15]:
pd.pivot_table(
df,
index=["key2", "key1"],
values=["values1", "values2"],
)
[15]:
values1 values2
key2 key1
A a 1 7
b 3 9
c 5 11
B a 2 8
b 4 10
c 6 12
Método pivot()
[16]:
df = pd.DataFrame(
{
"foo": ["one", "one", "one", "two", "two", "two"],
"bar": ["A", "B", "C", "A", "B", "C"],
"baz": [1, 2, 3, 4, 5, 6],
"zoo": ["x", "y", "z", "q", "w", "t"],
}
)
df
[16]:
foo bar baz zoo
0 one A 1 x
1 one B 2 y
2 one C 3 z
3 two A 4 q
4 two B 5 w
5 two C 6 t
[17]:
df.pivot(
index="foo",
columns="bar",
values="baz",
)
[17]:
bar A B C
foo
one 1 2 3
two 4 5 6
[18]:
df.pivot(
index="foo",
columns="bar",
)
[18]:
baz zoo
bar A B C A B C
foo
one 1 2 3 x y z
two 4 5 6 q w t
[19]:
df.pivot(
index="foo",
columns="bar",
values=["baz", "zoo"],
)
[19]:
baz zoo
bar A B C A B C
foo
one 1 2 3 x y z
two 4 5 6 q w t