Comandos equivalentes de SQL en Pandas — 18:15 min
18:15 min | Última modificación: Octubre 6, 2021
El lenguaje SQL es una de las principales herramientas para la gestión de datos, por lo que su uso es ampliamente difundido. En este documento se discute la equivalencia entre los principales comandos de Pandas y SQL.
Al finalizar el documento, usted estará en capacidad ejecutar en Pandas comandos equivalentes a los siguientes operadores de SQL:
SELECT
SEELCT DISTINCT
ORDER BY
WHERE
GROUP BY
INNER JOIN
LEFT OUTER JOIN
RIGHT OUTER JOIN
FULL JOIN
UNION
UPDATE
DELETE
Adaptado de: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html
Preparación
[1]:
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
## Carga de datos
[2]:
tips = pd.read_csv(
"https://raw.githubusercontent.com/jdvelasq/playground/master/datasets/tips.csv",
sep=",",
thousands=None,
decimal=".",
)
SELECT
[3]:
#
# SELECT
# total_bill,
# tip,
# smoker,
# time
# FROM
# tips_df
# LIMIT 5;
#
tips[
[
"total_bill",
"tip",
"smoker",
"time",
]
].head(5)
[3]:
total_bill tip smoker time
0 16.99 1.01 No Dinner
1 10.34 1.66 No Dinner
2 21.01 3.50 No Dinner
3 23.68 3.31 No Dinner
4 24.59 3.61 No Dinner
SELECT DISTINCT
[4]:
#
# SELECT DISTINCT
# day
# FROM
# tips_df
# LIMIT 5;
#
tips["day"].unique()
[4]:
array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)
ORDER BY
[5]:
#
# SELECT
# *
# FROM
# tips
# ORDER BY
# total_bill ASC
# LIMIT 5;
#
tips.sort_values('total_bill').head(5)
[5]:
total_bill tip sex smoker day time size
67 3.07 1.00 Female Yes Sat Dinner 1
92 5.75 1.00 Female Yes Fri Dinner 2
111 7.25 1.00 Female No Sat Dinner 1
172 7.25 5.15 Male Yes Sun Dinner 2
149 7.51 2.00 Male No Thur Lunch 2
[6]:
tips.sort_values(
"total_bill",
ascending=True,
).head(5)
[6]:
total_bill tip sex smoker day time size
67 3.07 1.00 Female Yes Sat Dinner 1
92 5.75 1.00 Female Yes Fri Dinner 2
111 7.25 1.00 Female No Sat Dinner 1
172 7.25 5.15 Male Yes Sun Dinner 2
149 7.51 2.00 Male No Thur Lunch 2
[7]:
#
# SELECT
# *
# FROM
# df
# ORDER BY
# total_bill DESC
# LIMIT 5;
#
tips.sort_values(
"total_bill",
ascending=False,
).head(5)
[7]:
total_bill tip sex smoker day time size
170 50.81 10.00 Male Yes Sat Dinner 3
212 48.33 9.00 Male No Sat Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
156 48.17 5.00 Male No Sun Dinner 6
182 45.35 3.50 Male Yes Sun Dinner 3
WHERE
[8]:
#
# SELECT
# *
# FROM
# df
# WHERE
# time = 'Dinner'
# LIMIT 5;
#
tips[tips['time'] == 'Dinner'].head(5)
[8]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
[9]:
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()
[9]:
True 176
False 68
Name: time, dtype: int64
[10]:
tips[is_dinner].head(5)
[10]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
[11]:
tips.query("time == 'Dinner'").head(5)
[11]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
[12]:
#
# SELECT
# *
# FROM
# df
# WHERE
# time = 'Dinner' AND tip > 5.00;
#
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
[12]:
total_bill tip sex smoker day time size
23 39.42 7.58 Male No Sat Dinner 4
44 30.40 5.60 Male No Sun Dinner 4
47 32.40 6.00 Male No Sun Dinner 4
52 34.81 5.20 Female No Sun Dinner 4
59 48.27 6.73 Male No Sat Dinner 4
116 29.93 5.07 Male No Sun Dinner 4
155 29.85 5.14 Female No Sun Dinner 5
170 50.81 10.00 Male Yes Sat Dinner 3
172 7.25 5.15 Male Yes Sun Dinner 2
181 23.33 5.65 Male Yes Sun Dinner 2
183 23.17 6.50 Male Yes Sun Dinner 4
211 25.89 5.16 Male Yes Sat Dinner 4
212 48.33 9.00 Male No Sat Dinner 4
214 28.17 6.50 Female Yes Sat Dinner 3
239 29.03 5.92 Male No Sat Dinner 3
[13]:
#
# SELECT
# *
# FROM
# df
# WHERE
# size >= 5 OR total_bill > 45;
#
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
[13]:
total_bill tip sex smoker day time size
59 48.27 6.73 Male No Sat Dinner 4
125 29.80 4.20 Female No Thur Lunch 6
141 34.30 6.70 Male No Thur Lunch 6
142 41.19 5.00 Male No Thur Lunch 5
143 27.05 5.00 Female No Thur Lunch 6
155 29.85 5.14 Female No Sun Dinner 5
156 48.17 5.00 Male No Sun Dinner 6
170 50.81 10.00 Male Yes Sat Dinner 3
182 45.35 3.50 Male Yes Sun Dinner 3
185 20.69 5.00 Male No Sun Dinner 5
187 30.46 2.00 Male Yes Sun Dinner 5
212 48.33 9.00 Male No Sat Dinner 4
216 28.15 3.00 Male Yes Sat Dinner 5
GROUP BY
[14]:
#
# SELECT
# sex,
# count(*)
# FROM
# df
# GROUP BY
# sex;
#
tips.groupby('sex').size()
[14]:
sex
Female 87
Male 157
dtype: int64
[15]:
tips.groupby('sex').count()
[15]:
total_bill tip smoker day time size
sex
Female 87 87 87 87 87 87
Male 157 157 157 157 157 157
[16]:
tips.groupby('sex')['total_bill'].count()
[16]:
sex
Female 87
Male 157
Name: total_bill, dtype: int64
[17]:
#
# SELECT
# day,
# AVG(tip),
# COUNT(*)
# FROM
# df
# GROUP BY
# day;
#
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
[17]:
tip day
day
Fri 2.734737 19
Sat 2.993103 87
Sun 3.255132 76
Thur 2.771452 62
[18]:
#
# SELECT
# smoker,
# day,
# COUNT(*),
# AVG(tip)
# FROM
# df
# GROUP BY
# smoker,
# day;
#
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})
[18]:
tip
size mean
smoker day
No Fri 4.0 2.812500
Sat 45.0 3.102889
Sun 57.0 3.167895
Thur 45.0 2.673778
Yes Fri 15.0 2.714000
Sat 42.0 2.875476
Sun 19.0 3.516842
Thur 17.0 3.030000
[19]:
#
# Funciones que pueden ser aplicadas a un DataFrame:
#
# abs all any clip clip_lower clip_upper
# corr corrwith count cov cummax cummin
# cumprod cumsum describe diff eval kurt
# mad max mean median min mode
# ct_change prod quantile rank round sem
# skew sum std var
#
INNER JOIN
[20]:
#
# DataFrames de ejemplo
#
df1 = pd.DataFrame(
{
"key": ["A", "B", "C", "D"],
"value": np.random.randn(4)
}
)
df1
[20]:
key value
0 A 0.388804
1 B -1.410434
2 C 0.878714
3 D -0.818024
[21]:
df2 = pd.DataFrame(
{
'key': ['B', 'D', 'D', 'E'],
'value': np.random.randn(4)
}
)
df2
[21]:
key value
0 B 1.360152
1 D -0.489195
2 D 0.406400
3 E 0.835870
[22]:
#
# SELECT
# *
# FROM
# df1
# INNER JOIN
# df2
# ON df1.key = df2.key;
#
pd.merge(
df1,
df2,
on="key",
)
[22]:
key value_x value_y
0 B -1.410434 1.360152
1 D -0.818024 -0.489195
2 D -0.818024 0.406400
LEFT OUTER JOIN
[23]:
#
# SELECT
# *
# FROM
# df1
# LEFT OUTER JOIN
# df2
# ON
# df1.key = df2.key;
#
pd.merge(
df1,
df2,
on="key",
how="left",
)
[23]:
key value_x value_y
0 A 0.388804 NaN
1 B -1.410434 1.360152
2 C 0.878714 NaN
3 D -0.818024 -0.489195
4 D -0.818024 0.406400
RIGHT OUTER JOIN
[24]:
#
# SELECT
# *
# FROM
# df1
# RIGHT OUTER JOIN
# df2
# ON
# df1.key = df2.key;
#
pd.merge(
df1,
df2,
on="key",
how="right",
)
[24]:
key value_x value_y
0 B -1.410434 1.360152
1 D -0.818024 -0.489195
2 D -0.818024 0.406400
3 E NaN 0.835870
FULL OUTER JOIN
[25]:
#
# SELECT
# *
# FROM
# df1
# FULL OUTER JOIN
# df2
# ON
# df1.key = df2.key;
#
pd.merge(df1, df2, on='key', how='outer')
[25]:
key value_x value_y
0 A 0.388804 NaN
1 B -1.410434 1.360152
2 C 0.878714 NaN
3 D -0.818024 -0.489195
4 D -0.818024 0.406400
5 E NaN 0.835870
UNION
[26]:
df1 = pd.DataFrame(
{
'city': ['Chicago', 'San Francisco', 'New York City'],
'rank': range(1, 4)
}
)
df1
[26]:
city rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
[27]:
df2 = pd.DataFrame(
{
'city': ['Chicago', 'Boston', 'Los Angeles'],
'rank': [1, 4, 5]
}
)
df2
[27]:
city rank
0 Chicago 1
1 Boston 4
2 Los Angeles 5
[28]:
#
# SELECT city, rank
# FROM df1
# UNION ALL
# SELECT city, rank
# FROM df2;
#
pd.concat(
[df1, df2],
)
[28]:
city rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
0 Chicago 1
1 Boston 4
2 Los Angeles 5
[29]:
#
# SELECT
# city,
# rank
# FROM
# df1
# UNION
# SELECT
# city,
# rank
# FROM
# df2
#
pd.concat(
[df1, df2],
).drop_duplicates()
[29]:
city rank
0 Chicago 1
1 San Francisco 2
2 New York City 3
1 Boston 4
2 Los Angeles 5
Primeras N filas con desplazamiento (offset)
[30]:
#
# SELECT
# *
# FROM
# tips
# ORDER BY
# tip DESC
# LIMIT 10 OFFSET 5;
#
tips.nlargest(10 + 5, columns='tip').tail(10)
[30]:
total_bill tip sex smoker day time size
183 23.17 6.50 Male Yes Sun Dinner 4
214 28.17 6.50 Female Yes Sat Dinner 3
47 32.40 6.00 Male No Sun Dinner 4
239 29.03 5.92 Male No Sat Dinner 3
88 24.71 5.85 Male No Thur Lunch 2
181 23.33 5.65 Male Yes Sun Dinner 2
44 30.40 5.60 Male No Sun Dinner 4
52 34.81 5.20 Female No Sun Dinner 4
85 34.83 5.17 Female No Thur Lunch 4
211 25.89 5.16 Male Yes Sat Dinner 4
Primeras N filas por grupo
[31]:
#
# SELECT * FROM (
# SELECT
# t.*,
# ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
# FROM df t
# )
# WHERE rn < 3
# ORDER BY day, rn;
#
(
tips.assign(
rn=tips.sort_values(
["total_bill"],
ascending=False,
)
.groupby(["day"])
.cumcount()
+ 1
)
.query("rn < 3")
.sort_values(["day", "rn"])
)
[31]:
total_bill tip sex smoker day time size rn
95 40.17 4.73 Male Yes Fri Dinner 4 1
90 28.97 3.00 Male Yes Fri Dinner 2 2
170 50.81 10.00 Male Yes Sat Dinner 3 1
212 48.33 9.00 Male No Sat Dinner 4 2
156 48.17 5.00 Male No Sun Dinner 6 1
182 45.35 3.50 Male Yes Sun Dinner 3 2
197 43.11 5.00 Female Yes Thur Lunch 4 1
142 41.19 5.00 Male No Thur Lunch 5 2
[32]:
(
tips.assign(
rnk=tips.groupby(["day"])["total_bill"].rank(
method="first",
ascending=False,
)
)
.query("rnk < 3")
.sort_values(["day", "rnk"])
)
[32]:
total_bill tip sex smoker day time size rnk
95 40.17 4.73 Male Yes Fri Dinner 4 1.0
90 28.97 3.00 Male Yes Fri Dinner 2 2.0
170 50.81 10.00 Male Yes Sat Dinner 3 1.0
212 48.33 9.00 Male No Sat Dinner 4 2.0
156 48.17 5.00 Male No Sun Dinner 6 1.0
182 45.35 3.50 Male Yes Sun Dinner 3 2.0
197 43.11 5.00 Female Yes Thur Lunch 4 1.0
142 41.19 5.00 Male No Thur Lunch 5 2.0
[33]:
#
# SELECT * FROM (
# SELECT
# t.*,
# RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
# FROM tips t
# WHERE tip < 2
# )
# WHERE rnk < 3
# ORDER BY sex, rnk;
#
(
tips[tips["tip"] < 2]
.assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min"))
.query("rnk_min < 3")
.sort_values(["sex", "rnk_min"])
)
[33]:
total_bill tip sex smoker day time size rnk_min
67 3.07 1.00 Female Yes Sat Dinner 1 1.0
92 5.75 1.00 Female Yes Fri Dinner 2 1.0
111 7.25 1.00 Female No Sat Dinner 1 1.0
236 12.60 1.00 Male Yes Sat Dinner 2 1.0
237 32.83 1.17 Male Yes Sat Dinner 2 2.0
UPDATE
[34]:
#
# UPDATE
# tips
# SET
# tip = tip*2
# WHERE
# tip < 2;
#
tips.loc[tips['tip'] < 2, 'tip'] *= 2
tips.head()
[34]:
total_bill tip sex smoker day time size
0 16.99 2.02 Female No Sun Dinner 2
1 10.34 3.32 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
DELETE
[35]:
#
# DELETE FROM
# tips
# WHERE
# tip > 9;
#
tips.loc[tips['tip'] <= 9].head(10)
[35]:
total_bill tip sex smoker day time size
0 16.99 2.02 Female No Sun Dinner 2
1 10.34 3.32 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
5 25.29 4.71 Male No Sun Dinner 4
6 8.77 2.00 Male No Sun Dinner 2
7 26.88 3.12 Male No Sun Dinner 4
8 15.04 3.92 Male No Sun Dinner 2
9 14.78 3.23 Male No Sun Dinner 2