{ "cells": [ { "cell_type": "markdown", "id": "8975fbcc-1897-4341-97e3-4d99c0501e20", "metadata": { "tags": [] }, "source": [ "Fusión (merging/join) de datasets --- 14:56 min\n", "===\n", "\n", "* 14:56 min | Última modificación: Octubre 6, 2021 " ] }, { "cell_type": "markdown", "id": "c6ea327f-fab1-41ec-820b-add14eb1464a", "metadata": {}, "source": [ "Realiza la unión de datasets de forma similar a como se hacen los joins en las bases de datos." ] }, { "cell_type": "code", "execution_count": 1, "id": "a321e583-2c9f-44a0-8a2d-0ab3526885ef", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "pd.set_option(\"display.notebook_repr_html\", False)" ] }, { "cell_type": "markdown", "id": "dc45dd52-c716-4ad6-91f3-30397a2f647a", "metadata": { "tags": [] }, "source": [ "## Fusión uno-a-uno sobre columnas comunes" ] }, { "cell_type": "markdown", "id": "b0f574ec-183d-451c-b96f-f29def1a2c54", "metadata": {}, "source": [ "El join se ejecuta sobre las columnas que tienen los mismos nombres en ambas tablas." ] }, { "cell_type": "markdown", "id": "ee2ef102-1f62-4c70-a351-9b428ab3ee06", "metadata": {}, "source": [ "**Creación y carga del primer dataframe.**" ] }, { "cell_type": "code", "execution_count": 2, "id": "5b73d526-880c-47aa-8ab3-22e7ca58a7b9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/clients.csv\n" ] } ], "source": [ "%%writefile /tmp/clients.csv\n", "clientId,key,name\n", "11,B,Ferris Q. Le\n", "10,A,Omar Y. Fletcher\n", "11,A,Mira N. Franklin\n", "10,B,Buffy W. Vincent" ] }, { "cell_type": "markdown", "id": "394ca60c-b10d-41e2-ab36-ea4922b1ef74", "metadata": {}, "source": [ "**Creación y carga del segundo dataframe.**" ] }, { "cell_type": "code", "execution_count": 3, "id": "9698dc54-c558-496f-b227-264bd7b803f0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/bonus.csv\n" ] } ], "source": [ "%%writefile /tmp/bonus.csv\n", "clientId,key,bonus\n", "10,A,138\n", "10,B,227\n", "11,A,279\n", "11,B,160" ] }, { "cell_type": "markdown", "id": "0c68e941-2a17-438c-8816-8167107c6f6d", "metadata": {}, "source": [ "**Merge usando como claves las columnas comunes**" ] }, { "cell_type": "code", "execution_count": 4, "id": "8ce173a6-371d-4ee4-8068-87e56935adb6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId key name\n", "1 10 A Omar Y. Fletcher\n", "3 10 B Buffy W. Vincent\n", "2 11 A Mira N. Franklin\n", "0 11 B Ferris Q. Le" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId key bonus\n", "0 10 A 138\n", "1 10 B 227\n", "2 11 A 279\n", "3 11 B 160" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId key name bonus\n", "0 10 A Omar Y. Fletcher 138\n", "1 10 B Buffy W. Vincent 227\n", "2 11 A Mira N. Franklin 279\n", "3 11 B Ferris Q. Le 160" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clients = pd.read_csv(\"/tmp/clients.csv\")\n", "bonus = pd.read_csv(\"/tmp/bonus.csv\")\n", "\n", "display(\n", " clients.sort_values(['clientId', 'key']),\n", " \"\",\n", " bonus.sort_values(['clientId', 'key']),\n", " \"\",\n", " pd.merge(\n", " clients,\n", " bonus,\n", " sort=True,\n", " )\n", ")" ] }, { "cell_type": "markdown", "id": "db65f44e-ab9c-486f-a5ca-e5ec291952f9", "metadata": {}, "source": [ "## Fusión uno-a-uno con registros incompletos" ] }, { "cell_type": "code", "execution_count": 5, "id": "c95bbe71-bb22-4992-9e3e-46d9235555fb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/clients.csv\n" ] } ], "source": [ "%%writefile /tmp/clients.csv\n", "clientId,name\n", "10,Ferris Q. Le\n", "11,Mira N. Franklin\n", "12,Baker C. Hurst" ] }, { "cell_type": "code", "execution_count": 6, "id": "af9d6a0b-a7cc-42d5-8edb-418dd76fb0e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/bonus.csv\n" ] } ], "source": [ "%%writefile /tmp/bonus.csv\n", "clientId,bonus\n", "10,279\n", "11,160\n", "20,169\n", "21,263" ] }, { "cell_type": "code", "execution_count": 7, "id": "c02fea2f-3cda-4d98-8854-3028c6df9f88", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name\n", "0 10 Ferris Q. Le\n", "1 11 Mira N. Franklin\n", "2 12 Baker C. Hurst" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId bonus\n", "0 10 279\n", "1 11 160\n", "2 20 169\n", "3 21 263" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name bonus\n", "0 10 Ferris Q. Le 279\n", "1 11 Mira N. Franklin 160" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clients = pd.read_csv(\"/tmp/clients.csv\")\n", "bonus = pd.read_csv(\"/tmp/bonus.csv\")\n", "\n", "#\n", "# Note que merge solo retorno los registros con\n", "# información completa\n", "#\n", "display(\n", " clients,\n", " \"\",\n", " bonus,\n", " \"\",\n", " pd.merge(\n", " clients,\n", " bonus,\n", " on=\"clientId\",\n", " )\n", ")" ] }, { "cell_type": "markdown", "id": "55fd89b5-34c3-48e9-a236-0c12a37263c5", "metadata": {}, "source": [ "## Fusión muchos-a-uno" ] }, { "cell_type": "code", "execution_count": 8, "id": "6d1b581e-ec42-413a-95af-8811763a18e8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/clients.csv\n" ] } ], "source": [ "%%writefile /tmp/clients.csv\n", "clientId,name\n", "10,Ferris Q. Le\n", "11,Mira N. Franklin\n", "12,Baker C. Hurst" ] }, { "cell_type": "code", "execution_count": 9, "id": "9a9db559-3f5c-44f1-87fa-8e5f5e6d7413", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/sales.csv\n" ] } ], "source": [ "%%writefile /tmp/sales.csv\n", "clientId, month, sales\n", "10, jan, 1239\n", "10, feb, 387\n", "11, jan, 454\n", "11, mar, 495\n", "11, sep, 145\n", "12, may, 4959\n", "12, dec, 493\n", "12, oct, 4981\n", "12, jan, 484\n", "15, may, 394\n", "15, sep, 585" ] }, { "cell_type": "code", "execution_count": 10, "id": "ce80d099-a091-403a-a9b0-1c13ccc34c6d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name\n", "0 10 Ferris Q. Le\n", "1 11 Mira N. Franklin\n", "2 12 Baker C. Hurst" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId month sales\n", "0 10 jan 1239\n", "1 10 feb 387\n", "2 11 jan 454\n", "3 11 mar 495\n", "4 11 sep 145\n", "5 12 may 4959\n", "6 12 dec 493\n", "7 12 oct 4981\n", "8 12 jan 484\n", "9 15 may 394\n", "10 15 sep 585" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId month sales name\n", "0 10 jan 1239 Ferris Q. Le\n", "1 10 feb 387 Ferris Q. Le\n", "2 11 jan 454 Mira N. Franklin\n", "3 11 mar 495 Mira N. Franklin\n", "4 11 sep 145 Mira N. Franklin\n", "5 12 may 4959 Baker C. Hurst\n", "6 12 dec 493 Baker C. Hurst\n", "7 12 oct 4981 Baker C. Hurst\n", "8 12 jan 484 Baker C. Hurst" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clients = pd.read_csv(\"/tmp/clients.csv\")\n", "sales = pd.read_csv(\"/tmp/sales.csv\")\n", "\n", "#\n", "# Sales tiene valores del campo clientId\n", "# duplicados. Note que se copia la información\n", "# de la tabla de la derecha\n", "#\n", "display(\n", " clients,\n", " \"\",\n", " sales,\n", " \"\",\n", " pd.merge(\n", " sales,\n", " clients,\n", " sort=True,\n", " )\n", ")" ] }, { "cell_type": "markdown", "id": "034676ff-c053-4b67-8358-85a0ebf89a4e", "metadata": { "tags": [] }, "source": [ "## Fusión muchos-a-muchos" ] }, { "cell_type": "code", "execution_count": 11, "id": "ab4338af-71fc-4bd7-9adb-6d13f56f3380", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/sales.csv\n" ] } ], "source": [ "%%writefile /tmp/sales.csv\n", "clientId,month\n", "10,jan\n", "10,feb\n", "11,jan\n", "11,mar\n", "11,sep\n", "12,may\n", "12,dec\n", "12,oct\n", "12,jan" ] }, { "cell_type": "code", "execution_count": 12, "id": "f8a6a9fc-8cff-4bd5-bdc5-f750d6ad53a4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/lines.csv\n" ] } ], "source": [ "%%writefile /tmp/lines.csv\n", "clientId,line\n", "10,A\n", "10,B\n", "10,C\n", "11,D\n", "12,A\n", "12,D\n", "13,B\n", "13,C\n", "13,D" ] }, { "cell_type": "code", "execution_count": 13, "id": "2dd56104-bdc3-45db-8b62-ed62748e00a9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId month\n", "0 10 jan\n", "1 10 feb\n", "2 11 jan\n", "3 11 mar\n", "4 11 sep\n", "5 12 may\n", "6 12 dec\n", "7 12 oct\n", "8 12 jan" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId line\n", "0 10 A\n", "1 10 B\n", "2 10 C\n", "3 11 D\n", "4 12 A\n", "5 12 D\n", "6 13 B\n", "7 13 C\n", "8 13 D" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId month line\n", "0 10 jan A\n", "1 10 jan B\n", "2 10 jan C\n", "3 10 feb A\n", "4 10 feb B\n", "5 10 feb C\n", "6 11 jan D\n", "7 11 mar D\n", "8 11 sep D\n", "9 12 may A\n", "10 12 may D\n", "11 12 dec A\n", "12 12 dec D\n", "13 12 oct A\n", "14 12 oct D\n", "15 12 jan A\n", "16 12 jan D" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sales = pd.read_csv(\"/tmp/sales.csv\")\n", "lines = pd.read_csv(\"/tmp/lines.csv\")\n", "\n", "display(\n", " sales,\n", " \"\",\n", " lines,\n", " \"\",\n", " pd.merge(\n", " sales,\n", " lines,\n", " )\n", ")" ] }, { "cell_type": "markdown", "id": "1a83fa7d-b9bc-4765-b7b9-39e7c72dd458", "metadata": { "tags": [] }, "source": [ "## Parámetros left_on, right_on" ] }, { "cell_type": "markdown", "id": "e39e13db-e497-40c4-9976-198fffc47c64", "metadata": {}, "source": [ "El campo clave tiene nombres diferentes en cada tabla." ] }, { "cell_type": "code", "execution_count": 14, "id": "7267bc8e-1d85-4187-8b87-c741bc1f19c7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/clients.csv\n" ] } ], "source": [ "%%writefile /tmp/clients.csv\n", "clientId,name\n", "13,Ferris Q. Le\n", "10,Omar Y. Fletcher\n", "12,Mira N. Franklin\n", "11,Buffy W. Vincent" ] }, { "cell_type": "code", "execution_count": 15, "id": "09ef78c4-2dd6-4111-a90d-339e9b0adf42", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/bonus.csv\n" ] } ], "source": [ "%%writefile /tmp/bonus.csv\n", "Client-Id,bonus\n", "10,279\n", "11,160\n", "12,267\n", "13,215" ] }, { "cell_type": "code", "execution_count": 16, "id": "821500b4-4951-4ceb-a59a-fbaf58bd0c5f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name\n", "0 13 Ferris Q. Le\n", "1 10 Omar Y. Fletcher\n", "2 12 Mira N. Franklin\n", "3 11 Buffy W. Vincent" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " Client-Id bonus\n", "0 10 279\n", "1 11 160\n", "2 12 267\n", "3 13 215" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name Client-Id bonus\n", "0 13 Ferris Q. Le 13 215\n", "1 10 Omar Y. Fletcher 10 279\n", "2 12 Mira N. Franklin 12 267\n", "3 11 Buffy W. Vincent 11 160" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clients = pd.read_csv(\"/tmp/clients.csv\")\n", "bonus = pd.read_csv(\"/tmp/bonus.csv\")\n", "\n", "#\n", "# Note que las dos claves aparecen en la tabla\n", "#\n", "display(\n", " clients,\n", " \"\",\n", " bonus,\n", " \"\",\n", " pd.merge(\n", " clients,\n", " bonus,\n", " left_on=\"clientId\",\n", " right_on=\"Client-Id\",\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "id": "9499e731-100a-44ce-a637-ffabc504193f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name\n", "0 13 Ferris Q. Le\n", "1 10 Omar Y. Fletcher\n", "2 12 Mira N. Franklin\n", "3 11 Buffy W. Vincent" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " Client-Id bonus\n", "0 10 279\n", "1 11 160\n", "2 12 267\n", "3 13 215" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name bonus\n", "0 13 Ferris Q. Le 215\n", "1 10 Omar Y. Fletcher 279\n", "2 12 Mira N. Franklin 267\n", "3 11 Buffy W. Vincent 160" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Remoción de una de las columnas\n", "#\n", "display(\n", " clients,\n", " \"\",\n", " bonus,\n", " \"\",\n", " pd.merge(\n", " clients,\n", " bonus,\n", " left_on=\"clientId\",\n", " right_on=\"Client-Id\",\n", " ).drop(\n", " \"Client-Id\",\n", " axis=1,\n", " )\n", ")" ] }, { "cell_type": "markdown", "id": "85b8d9e9-d87f-459e-abb3-a2f4ae55fded", "metadata": {}, "source": [ "## Joins usando el parámetro how" ] }, { "cell_type": "markdown", "id": "800550bb-c676-41fb-a938-2b2562124490", "metadata": {}, "source": [ "![joins.png](images/joins.png)" ] }, { "cell_type": "markdown", "id": "44a57d96-7715-4a87-871b-cfdcbfd7047f", "metadata": {}, "source": [ "Ambas tablas tienen algunos campos en comun (y otros diferentes!)." ] }, { "cell_type": "code", "execution_count": 18, "id": "775eabc9-ab77-4486-b960-2ac0357b978b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/clients_a.csv\n" ] } ], "source": [ "%%writefile /tmp/clients_a.csv\n", "clientId,name,location,amount\n", "10,Omar Y. Fletcher,6833 Mollis. Rd.,4929\n", "11,Buffy W. Vincent,\"P.O. Box 345, 8390 Ante Avenue\",7366\n", "12,Mira N. Franklin,\"P.O. Box 445, 323 Cursus Rd.\",6184" ] }, { "cell_type": "code", "execution_count": 19, "id": "c049ec26-3c7e-44fa-afdf-f4d1425d6e0b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/clients_b.csv\n" ] } ], "source": [ "%%writefile /tmp/clients_b.csv\n", "clientId,name,location,bonus\n", "11,Buffy W. Vincent,\"P.O. Box 345, 8390 Ante Avenue\",100\n", "12,Mira N. Franklin,\"P.O. Box 445, 323 Cursus Rd.\",200\n", "13,Lilah O. Morrison,3859 Mauris Ave,300" ] }, { "cell_type": "code", "execution_count": 20, "id": "08621527-5775-4fa3-bd0d-23381ad26bc0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'amount'}" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'bonus'}" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "clients_a = pd.read_csv(\"/tmp/clients_a.csv\")\n", "clients_b = pd.read_csv(\"/tmp/clients_b.csv\")\n", "\n", "#\n", "# Campos diferentes en cada tabla\n", "#\n", "display(\n", " set(clients_a.columns) - set(clients_b.columns),\n", " set(clients_b.columns) - set(clients_a.columns),\n", ")" ] }, { "cell_type": "code", "execution_count": 21, "id": "0455a904-ad01-482a-a014-a65817de2251", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name location amount\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 200\n", "2 13 Lilah O. Morrison 3859 Mauris Ave 300" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location amount bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184 200" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Inner join\n", "#\n", "display(\n", " clients_a,\n", " \"\",\n", " clients_b,\n", " \"\",\n", " \n", " pd.merge(\n", " clients_a,\n", " clients_b,\n", " how=\"inner\",\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "id": "4d43f783-d462-45b3-97a3-cddbad13b500", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name location amount\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 200\n", "2 13 Lilah O. Morrison 3859 Mauris Ave 300" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location amount bonus\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929 NaN\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366 100.0\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184 200.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Left join\n", "#\n", "display(\n", " clients_a,\n", " \"\",\n", " clients_b,\n", " \"\",\n", " \n", " pd.merge(\n", " clients_a,\n", " clients_b,\n", " how=\"left\",\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 23, "id": "94d4b8b6-2fea-4b3d-9576-e597332d9637", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name location amount\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 200\n", "2 13 Lilah O. Morrison 3859 Mauris Ave 300" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location amount bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366.0 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184.0 200\n", "2 13 Lilah O. Morrison 3859 Mauris Ave NaN 300" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Right join\n", "#\n", "display(\n", " clients_a,\n", " \"\",\n", " clients_b,\n", " \"\",\n", " \n", " pd.merge(\n", " clients_a,\n", " clients_b,\n", " how=\"right\",\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 24, "id": "b5579382-e406-4cb9-9b34-4cc5fe59af27", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name location amount\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 200\n", "2 13 Lilah O. Morrison 3859 Mauris Ave 300" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location amount bonus\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929.0 NaN\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366.0 100.0\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184.0 200.0\n", "3 13 Lilah O. Morrison 3859 Mauris Ave NaN 300.0" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Outer join\n", "#\n", "display(\n", " clients_a,\n", " \"\",\n", " clients_b,\n", " \"\",\n", " \n", " pd.merge(\n", " clients_a,\n", " clients_b,\n", " how=\"outer\",\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 25, "id": "8bdb90b5-8867-4840-b149-0bd6f8c721c4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId name location amount\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location bonus\n", "0 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 100\n", "1 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 200\n", "2 13 Lilah O. Morrison 3859 Mauris Ave 300" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location amount bonus \\\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd. 4929.0 NaN \n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue 7366.0 100.0 \n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd. 6184.0 200.0 \n", "3 13 Lilah O. Morrison 3859 Mauris Ave NaN 300.0 \n", "\n", " _merge \n", "0 left_only \n", "1 both \n", "2 both \n", "3 right_only " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Outer join\n", "#\n", "display(\n", " clients_a,\n", " \"\",\n", " clients_b,\n", " \"\",\n", " \n", " pd.merge(\n", " clients_a,\n", " clients_b,\n", " how=\"outer\",\n", " indicator=True,\n", " )\n", ")" ] }, { "cell_type": "markdown", "id": "ccf30966-06f9-481b-b041-c840ecbdf6a6", "metadata": {}, "source": [ "## Sufijos" ] }, { "cell_type": "code", "execution_count": 26, "id": "672177f6-73fb-44bb-9393-9ef5fda94160", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/data_1.csv\n" ] } ], "source": [ "%%writefile /tmp/data_1.csv\n", "clientId,info\n", "10,Omar Y. Fletcher\n", "11,Buffy W. Vincent\n", "12,Mira N. Franklin" ] }, { "cell_type": "code", "execution_count": 27, "id": "74023a10-9b84-4ee6-8e14-36377705b7dd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing /tmp/data_2.csv\n" ] } ], "source": [ "%%writefile /tmp/data_2.csv\n", "clientId,info\n", "10,6833 Mollis. Rd.\n", "11,\"P.O. Box 345, 8390 Ante Avenue\"\n", "12,\"P.O. Box 445, 323 Cursus Rd.\"" ] }, { "cell_type": "code", "execution_count": 28, "id": "2b53bf72-c118-48e0-8a92-61ee2f1aba42", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId info\n", "0 10 Omar Y. Fletcher\n", "1 11 Buffy W. Vincent\n", "2 12 Mira N. Franklin" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId info\n", "0 10 6833 Mollis. Rd.\n", "1 11 P.O. Box 345, 8390 Ante Avenue\n", "2 12 P.O. Box 445, 323 Cursus Rd." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId info_x info_y\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd.\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_1 = pd.read_csv(\"/tmp/data_1.csv\")\n", "df_2 = pd.read_csv(\"/tmp/data_2.csv\")\n", "\n", "\n", "#\n", "# Sufijos generados para desambiguar los nombres\n", "# de las columnas\n", "#\n", "display(\n", " df_1,\n", " \"\",\n", " df_2,\n", " \"\",\n", " pd.merge(\n", " df_1,\n", " df_2,\n", " on=\"clientId\",\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 29, "id": "9aabeb83-9ca7-4a47-af43-e0ef5bccffa1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId info\n", "0 10 Omar Y. Fletcher\n", "1 11 Buffy W. Vincent\n", "2 12 Mira N. Franklin" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId info\n", "0 10 6833 Mollis. Rd.\n", "1 11 P.O. Box 345, 8390 Ante Avenue\n", "2 12 P.O. Box 445, 323 Cursus Rd." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId info_LD info_RD\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd.\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Especificación del sufijo\n", "#\n", "display(\n", " df_1,\n", " \"\",\n", " df_2,\n", " \"\",\n", " pd.merge(\n", " df_1,\n", " df_2,\n", " on=\"clientId\",\n", " suffixes=[\"_LD\", \"_RD\"],\n", " )\n", ")" ] }, { "cell_type": "code", "execution_count": 30, "id": "d42fe7dd-d03c-4f08-b50c-55a740ed92b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " clientId info\n", "0 10 Omar Y. Fletcher\n", "1 11 Buffy W. Vincent\n", "2 12 Mira N. Franklin" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId info\n", "0 10 6833 Mollis. Rd.\n", "1 11 P.O. Box 345, 8390 Ante Avenue\n", "2 12 P.O. Box 445, 323 Cursus Rd." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " clientId name location\n", "0 10 Omar Y. Fletcher 6833 Mollis. Rd.\n", "1 11 Buffy W. Vincent P.O. Box 345, 8390 Ante Avenue\n", "2 12 Mira N. Franklin P.O. Box 445, 323 Cursus Rd." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Renombramiento de las columnas\n", "#\n", "display(\n", " df_1,\n", " \"\",\n", " df_2,\n", " \"\",\n", " pd.merge(\n", " df_1,\n", " df_2,\n", " on=\"clientId\",\n", " ).rename(\n", " columns={\n", " \"info_x\": \"name\",\n", " \"info_y\": \"location\",\n", " }\n", " )\n", ")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 5 }