Comandos equivalentes de SQL en Pandas — 18:15 min

  • 18:15 min | Última modificación: Octubre 6, 2021

El lenguaje SQL es una de las principales herramientas para la gestión de datos, por lo que su uso es ampliamente difundido. En este documento se discute la equivalencia entre los principales comandos de Pandas y SQL.

Al finalizar el documento, usted estará en capacidad ejecutar en Pandas comandos equivalentes a los siguientes operadores de SQL:

  • SELECT

  • SEELCT DISTINCT

  • ORDER BY

  • WHERE

  • GROUP BY

  • INNER JOIN

  • LEFT OUTER JOIN

  • RIGHT OUTER JOIN

  • FULL JOIN

  • UNION

  • UPDATE

  • DELETE

Adaptado de: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html

Preparación

[1]:
import numpy as np

import pandas as pd

pd.set_option("display.notebook_repr_html", False)

## Carga de datos

[2]:
tips = pd.read_csv(
    "https://raw.githubusercontent.com/jdvelasq/playground/master/datasets/tips.csv",
    sep=",",
    thousands=None,
    decimal=".",
)

SELECT

[3]:
#
#  SELECT
#      total_bill,
#      tip,
#      smoker,
#      time
#  FROM
#      tips_df
#  LIMIT 5;
#
tips[
    [
        "total_bill",
        "tip",
        "smoker",
        "time",
    ]
].head(5)
[3]:
   total_bill   tip smoker    time
0       16.99  1.01     No  Dinner
1       10.34  1.66     No  Dinner
2       21.01  3.50     No  Dinner
3       23.68  3.31     No  Dinner
4       24.59  3.61     No  Dinner

SELECT DISTINCT

[4]:
#
#  SELECT DISTINCT
#      day
#  FROM
#      tips_df
#  LIMIT 5;
#
tips["day"].unique()
[4]:
array(['Sun', 'Sat', 'Thur', 'Fri'], dtype=object)

ORDER BY

[5]:
#
#  SELECT
#      *
#  FROM
#      tips
#  ORDER BY
#      total_bill ASC
#  LIMIT 5;
#
tips.sort_values('total_bill').head(5)
[5]:
     total_bill   tip     sex smoker   day    time  size
67         3.07  1.00  Female    Yes   Sat  Dinner     1
92         5.75  1.00  Female    Yes   Fri  Dinner     2
111        7.25  1.00  Female     No   Sat  Dinner     1
172        7.25  5.15    Male    Yes   Sun  Dinner     2
149        7.51  2.00    Male     No  Thur   Lunch     2
[6]:
tips.sort_values(
    "total_bill",
    ascending=True,
).head(5)
[6]:
     total_bill   tip     sex smoker   day    time  size
67         3.07  1.00  Female    Yes   Sat  Dinner     1
92         5.75  1.00  Female    Yes   Fri  Dinner     2
111        7.25  1.00  Female     No   Sat  Dinner     1
172        7.25  5.15    Male    Yes   Sun  Dinner     2
149        7.51  2.00    Male     No  Thur   Lunch     2
[7]:
#
#  SELECT
#      *
#  FROM
#      df
#  ORDER BY
#      total_bill DESC
#  LIMIT 5;
#
tips.sort_values(
    "total_bill",
    ascending=False,
).head(5)
[7]:
     total_bill    tip   sex smoker  day    time  size
170       50.81  10.00  Male    Yes  Sat  Dinner     3
212       48.33   9.00  Male     No  Sat  Dinner     4
59        48.27   6.73  Male     No  Sat  Dinner     4
156       48.17   5.00  Male     No  Sun  Dinner     6
182       45.35   3.50  Male    Yes  Sun  Dinner     3

WHERE

[8]:
#
#  SELECT
#      *
#  FROM
#      df
#  WHERE
#      time = 'Dinner'
#  LIMIT 5;
#
tips[tips['time'] == 'Dinner'].head(5)
[8]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
[9]:
is_dinner = tips['time'] == 'Dinner'
is_dinner.value_counts()
[9]:
True     176
False     68
Name: time, dtype: int64
[10]:
tips[is_dinner].head(5)
[10]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
[11]:
tips.query("time == 'Dinner'").head(5)
[11]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
[12]:
#
#  SELECT
#      *
#  FROM
#      df
#  WHERE
#      time = 'Dinner' AND tip > 5.00;
#
tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
[12]:
     total_bill    tip     sex smoker  day    time  size
23        39.42   7.58    Male     No  Sat  Dinner     4
44        30.40   5.60    Male     No  Sun  Dinner     4
47        32.40   6.00    Male     No  Sun  Dinner     4
52        34.81   5.20  Female     No  Sun  Dinner     4
59        48.27   6.73    Male     No  Sat  Dinner     4
116       29.93   5.07    Male     No  Sun  Dinner     4
155       29.85   5.14  Female     No  Sun  Dinner     5
170       50.81  10.00    Male    Yes  Sat  Dinner     3
172        7.25   5.15    Male    Yes  Sun  Dinner     2
181       23.33   5.65    Male    Yes  Sun  Dinner     2
183       23.17   6.50    Male    Yes  Sun  Dinner     4
211       25.89   5.16    Male    Yes  Sat  Dinner     4
212       48.33   9.00    Male     No  Sat  Dinner     4
214       28.17   6.50  Female    Yes  Sat  Dinner     3
239       29.03   5.92    Male     No  Sat  Dinner     3
[13]:
#
#  SELECT
#      *
#  FROM
#      df
#  WHERE
#      size >= 5 OR total_bill > 45;
#
tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
[13]:
     total_bill    tip     sex smoker   day    time  size
59        48.27   6.73    Male     No   Sat  Dinner     4
125       29.80   4.20  Female     No  Thur   Lunch     6
141       34.30   6.70    Male     No  Thur   Lunch     6
142       41.19   5.00    Male     No  Thur   Lunch     5
143       27.05   5.00  Female     No  Thur   Lunch     6
155       29.85   5.14  Female     No   Sun  Dinner     5
156       48.17   5.00    Male     No   Sun  Dinner     6
170       50.81  10.00    Male    Yes   Sat  Dinner     3
182       45.35   3.50    Male    Yes   Sun  Dinner     3
185       20.69   5.00    Male     No   Sun  Dinner     5
187       30.46   2.00    Male    Yes   Sun  Dinner     5
212       48.33   9.00    Male     No   Sat  Dinner     4
216       28.15   3.00    Male    Yes   Sat  Dinner     5

GROUP BY

[14]:
#
#  SELECT
#      sex,
#      count(*)
#  FROM
#      df
#  GROUP BY
#      sex;
#
tips.groupby('sex').size()
[14]:
sex
Female     87
Male      157
dtype: int64
[15]:
tips.groupby('sex').count()
[15]:
        total_bill  tip  smoker  day  time  size
sex
Female          87   87      87   87    87    87
Male           157  157     157  157   157   157
[16]:
tips.groupby('sex')['total_bill'].count()
[16]:
sex
Female     87
Male      157
Name: total_bill, dtype: int64
[17]:
#
#  SELECT
#      day,
#      AVG(tip),
#      COUNT(*)
#  FROM
#      df
#  GROUP BY
#      day;
#
tips.groupby('day').agg({'tip': np.mean, 'day': np.size})
[17]:
           tip  day
day
Fri   2.734737   19
Sat   2.993103   87
Sun   3.255132   76
Thur  2.771452   62
[18]:
#
#  SELECT
#      smoker,
#      day,
#      COUNT(*),
#      AVG(tip)
#  FROM
#      df
#  GROUP BY
#      smoker,
#      day;
#
tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]})
[18]:
              tip
             size      mean
smoker day
No     Fri    4.0  2.812500
       Sat   45.0  3.102889
       Sun   57.0  3.167895
       Thur  45.0  2.673778
Yes    Fri   15.0  2.714000
       Sat   42.0  2.875476
       Sun   19.0  3.516842
       Thur  17.0  3.030000
[19]:
#
# Funciones que pueden ser aplicadas a un DataFrame:
#
#    abs        all       any       clip    clip_lower  clip_upper
#    corr       corrwith  count     cov     cummax      cummin
#    cumprod    cumsum    describe  diff    eval        kurt
#    mad        max       mean      median  min         mode
#    ct_change  prod      quantile  rank    round       sem
#    skew       sum       std       var
#

INNER JOIN

[20]:
#
# DataFrames de ejemplo
#
df1 = pd.DataFrame(
    {
        "key": ["A", "B", "C", "D"],
        "value": np.random.randn(4)
    }
)

df1
[20]:
  key     value
0   A  0.388804
1   B -1.410434
2   C  0.878714
3   D -0.818024
[21]:
df2 = pd.DataFrame(
    {
        'key': ['B', 'D', 'D', 'E'],
        'value': np.random.randn(4)
    }
)

df2
[21]:
  key     value
0   B  1.360152
1   D -0.489195
2   D  0.406400
3   E  0.835870
[22]:
#
#  SELECT
#      *
#  FROM
#      df1
#  INNER JOIN
#      df2
#  ON df1.key = df2.key;
#
pd.merge(
    df1,
    df2,
    on="key",
)
[22]:
  key   value_x   value_y
0   B -1.410434  1.360152
1   D -0.818024 -0.489195
2   D -0.818024  0.406400

LEFT OUTER JOIN

[23]:
#
#  SELECT
#      *
#  FROM
#      df1
#  LEFT OUTER JOIN
#      df2
#  ON
#      df1.key = df2.key;
#
pd.merge(
    df1,
    df2,
    on="key",
    how="left",
)
[23]:
  key   value_x   value_y
0   A  0.388804       NaN
1   B -1.410434  1.360152
2   C  0.878714       NaN
3   D -0.818024 -0.489195
4   D -0.818024  0.406400

RIGHT OUTER JOIN

[24]:
#
#  SELECT
#      *
#  FROM
#      df1
#  RIGHT OUTER JOIN
#      df2
# ON
#     df1.key = df2.key;
#
pd.merge(
    df1,
    df2,
    on="key",
    how="right",
)
[24]:
  key   value_x   value_y
0   B -1.410434  1.360152
1   D -0.818024 -0.489195
2   D -0.818024  0.406400
3   E       NaN  0.835870

FULL OUTER JOIN

[25]:
#
#  SELECT
#      *
#  FROM
#      df1
#  FULL OUTER JOIN
#      df2
#  ON
#      df1.key = df2.key;
#
pd.merge(df1, df2, on='key', how='outer')
[25]:
  key   value_x   value_y
0   A  0.388804       NaN
1   B -1.410434  1.360152
2   C  0.878714       NaN
3   D -0.818024 -0.489195
4   D -0.818024  0.406400
5   E       NaN  0.835870

UNION

[26]:
df1 = pd.DataFrame(
    {
        'city': ['Chicago', 'San Francisco', 'New York City'],
        'rank': range(1, 4)
    }
)

df1
[26]:
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
[27]:
df2 = pd.DataFrame(
    {
        'city': ['Chicago', 'Boston', 'Los Angeles'],
        'rank': [1, 4, 5]
    }
)

df2
[27]:
          city  rank
0      Chicago     1
1       Boston     4
2  Los Angeles     5
[28]:
#
#  SELECT city, rank
#  FROM df1
#  UNION ALL
#  SELECT city, rank
#  FROM df2;
#
pd.concat(
    [df1, df2],
)
[28]:
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
0        Chicago     1
1         Boston     4
2    Los Angeles     5
[29]:
#
#  SELECT
#      city,
#     rank
#  FROM
#      df1
#  UNION
#      SELECT
#          city,
#          rank
#      FROM
#          df2
#
pd.concat(
    [df1, df2],
).drop_duplicates()
[29]:
            city  rank
0        Chicago     1
1  San Francisco     2
2  New York City     3
1         Boston     4
2    Los Angeles     5

Primeras N filas con desplazamiento (offset)

[30]:
#
#  SELECT
#      *
#  FROM
#      tips
#  ORDER BY
#      tip DESC
#  LIMIT 10 OFFSET 5;
#
tips.nlargest(10 + 5, columns='tip').tail(10)
[30]:
     total_bill   tip     sex smoker   day    time  size
183       23.17  6.50    Male    Yes   Sun  Dinner     4
214       28.17  6.50  Female    Yes   Sat  Dinner     3
47        32.40  6.00    Male     No   Sun  Dinner     4
239       29.03  5.92    Male     No   Sat  Dinner     3
88        24.71  5.85    Male     No  Thur   Lunch     2
181       23.33  5.65    Male    Yes   Sun  Dinner     2
44        30.40  5.60    Male     No   Sun  Dinner     4
52        34.81  5.20  Female     No   Sun  Dinner     4
85        34.83  5.17  Female     No  Thur   Lunch     4
211       25.89  5.16    Male    Yes   Sat  Dinner     4

Primeras N filas por grupo

[31]:
#
#  SELECT * FROM (
#    SELECT
#      t.*,
#      ROW_NUMBER() OVER(PARTITION BY day ORDER BY total_bill DESC) AS rn
#    FROM df t
#  )
#  WHERE rn < 3
#  ORDER BY day, rn;
#
(
    tips.assign(
        rn=tips.sort_values(
            ["total_bill"],
            ascending=False,
        )
        .groupby(["day"])
        .cumcount()
        + 1
    )
    .query("rn < 3")
    .sort_values(["day", "rn"])
)
[31]:
     total_bill    tip     sex smoker   day    time  size  rn
95        40.17   4.73    Male    Yes   Fri  Dinner     4   1
90        28.97   3.00    Male    Yes   Fri  Dinner     2   2
170       50.81  10.00    Male    Yes   Sat  Dinner     3   1
212       48.33   9.00    Male     No   Sat  Dinner     4   2
156       48.17   5.00    Male     No   Sun  Dinner     6   1
182       45.35   3.50    Male    Yes   Sun  Dinner     3   2
197       43.11   5.00  Female    Yes  Thur   Lunch     4   1
142       41.19   5.00    Male     No  Thur   Lunch     5   2
[32]:
(
    tips.assign(
        rnk=tips.groupby(["day"])["total_bill"].rank(
            method="first",
            ascending=False,
        )
    )
    .query("rnk < 3")
    .sort_values(["day", "rnk"])
)
[32]:
     total_bill    tip     sex smoker   day    time  size  rnk
95        40.17   4.73    Male    Yes   Fri  Dinner     4  1.0
90        28.97   3.00    Male    Yes   Fri  Dinner     2  2.0
170       50.81  10.00    Male    Yes   Sat  Dinner     3  1.0
212       48.33   9.00    Male     No   Sat  Dinner     4  2.0
156       48.17   5.00    Male     No   Sun  Dinner     6  1.0
182       45.35   3.50    Male    Yes   Sun  Dinner     3  2.0
197       43.11   5.00  Female    Yes  Thur   Lunch     4  1.0
142       41.19   5.00    Male     No  Thur   Lunch     5  2.0
[33]:
#
#  SELECT * FROM (
#      SELECT
#          t.*,
#          RANK() OVER(PARTITION BY sex ORDER BY tip) AS rnk
#      FROM tips t
#      WHERE tip < 2
#  )
#  WHERE rnk < 3
#  ORDER BY sex, rnk;
#
(
    tips[tips["tip"] < 2]
    .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min"))
    .query("rnk_min < 3")
    .sort_values(["sex", "rnk_min"])
)
[33]:
     total_bill   tip     sex smoker  day    time  size  rnk_min
67         3.07  1.00  Female    Yes  Sat  Dinner     1      1.0
92         5.75  1.00  Female    Yes  Fri  Dinner     2      1.0
111        7.25  1.00  Female     No  Sat  Dinner     1      1.0
236       12.60  1.00    Male    Yes  Sat  Dinner     2      1.0
237       32.83  1.17    Male    Yes  Sat  Dinner     2      2.0

UPDATE

[34]:
#
#  UPDATE
#      tips
#  SET
#      tip = tip*2
#  WHERE
#      tip < 2;
#
tips.loc[tips['tip'] < 2, 'tip'] *= 2
tips.head()
[34]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  2.02  Female     No  Sun  Dinner     2
1       10.34  3.32    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

DELETE

[35]:
#
#  DELETE FROM
#      tips
#  WHERE
#      tip > 9;
#
tips.loc[tips['tip'] <= 9].head(10)
[35]:
   total_bill   tip     sex smoker  day    time  size
0       16.99  2.02  Female     No  Sun  Dinner     2
1       10.34  3.32    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  3.92    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2