{ "cells": [ { "cell_type": "markdown", "id": "a70d52ad-0f57-44c7-80d3-67a8af0af81e", "metadata": { "tags": [] }, "source": [ "Registros duplicados --- 6:15 min\n", "===\n", "\n", "* 6:15 min | Última modificación: Octubre 14, 2021 | [YouTube](https://youtu.be/Qlrvykg3F3c)" ] }, { "cell_type": "code", "execution_count": 1, "id": "ce6a75ae-d34c-4c4a-a19b-f4d969af242f", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "id": "dd456c8f-3fc4-4ddb-bdb3-4212423c960b", "metadata": {}, "source": [ "**Posibilidades**:\n", "\n", " * Registro completo duplicado.\n", " \n", " * Algunos campos duplicados, demas campos con valores diferentes\n", " \n", " \n", "**Soluciones posibles**:\n", "\n", " * Borrado de los registros completamente duplicados.\n", " \n", " * Agregación para campos con valores diferentes." ] }, { "cell_type": "code", "execution_count": 2, "id": "aaf77c0b-bc1c-447d-923a-f39dca269906", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/data.csv\n" ] } ], "source": [ "%%writefile /tmp/data.csv\n", "clientId,name,phone,companyId\n", "1,Hersch Szymanowicz,+237 (561) 702-4118,3\n", "2,Gaven Brito,+51 (870) 799-1508,7\n", "3,Maressa Pavlishchev,+62 (350) 377-8621,4\n", "4,Corine Dunseith,+63 (797) 344-2571,8\n", "5,Ramon Lawrence,+66 (557) 865-3845,6\n", "6,Ibbie Whitehouse,+98 (495) 896-6408,1\n", "7,Neils Capelen,+86 (361) 914-8734,6\n", "8,Thia Malkie,+46 (564) 145-8997,4\n", "9,Missy Folomkin,+33 (962) 798-0776,7\n", "10,Eleanor Gallamore,+86 (366) 702-2334,5\n", "7,Neils Capelen,+86 (361) 914-8734,6\n", "4,Corine Dunseith,+63 (797) 344-2571,8\n", "2,Gaven Brito,+51 (870) 799-2308,2\n", "5,Ramon Lawrence,+66 (557) 061-3844,5\n", "4,Corine Dunseith,+63 (797) 344-2571,8" ] }, { "cell_type": "code", "execution_count": 3, "id": "a9e50d6a-4042-481d-8219-13509749de09", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
01Hersch Szymanowicz+237 (561) 702-41183
12Gaven Brito+51 (870) 799-15087
23Maressa Pavlishchev+62 (350) 377-86214
34Corine Dunseith+63 (797) 344-25718
45Ramon Lawrence+66 (557) 865-38456
56Ibbie Whitehouse+98 (495) 896-64081
67Neils Capelen+86 (361) 914-87346
78Thia Malkie+46 (564) 145-89974
89Missy Folomkin+33 (962) 798-07767
910Eleanor Gallamore+86 (366) 702-23345
107Neils Capelen+86 (361) 914-87346
114Corine Dunseith+63 (797) 344-25718
122Gaven Brito+51 (870) 799-23082
135Ramon Lawrence+66 (557) 061-38445
144Corine Dunseith+63 (797) 344-25718
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "0 1 Hersch Szymanowicz +237 (561) 702-4118 3\n", "1 2 Gaven Brito +51 (870) 799-1508 7\n", "2 3 Maressa Pavlishchev +62 (350) 377-8621 4\n", "3 4 Corine Dunseith +63 (797) 344-2571 8\n", "4 5 Ramon Lawrence +66 (557) 865-3845 6\n", "5 6 Ibbie Whitehouse +98 (495) 896-6408 1\n", "6 7 Neils Capelen +86 (361) 914-8734 6\n", "7 8 Thia Malkie +46 (564) 145-8997 4\n", "8 9 Missy Folomkin +33 (962) 798-0776 7\n", "9 10 Eleanor Gallamore +86 (366) 702-2334 5\n", "10 7 Neils Capelen +86 (361) 914-8734 6\n", "11 4 Corine Dunseith +63 (797) 344-2571 8\n", "12 2 Gaven Brito +51 (870) 799-2308 2\n", "13 5 Ramon Lawrence +66 (557) 061-3844 5\n", "14 4 Corine Dunseith +63 (797) 344-2571 8" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "clientId int64\n", "name object\n", "phone object\n", "companyId int64\n", "dtype: object" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df = pd.read_csv('/tmp/data.csv')\n", "\n", "display(\n", " df,\n", " df.dtypes\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "112364b7-0ebf-42e7-9802-152a131b11b4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "8 False\n", "9 False\n", "10 True\n", "11 True\n", "12 False\n", "13 False\n", "14 True\n", "dtype: bool" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# La función duplicaated() indica si el registro\n", "# completo está duplicado.\n", "#\n", "df.duplicated()" ] }, { "cell_type": "code", "execution_count": 5, "id": "957abf0c-66e3-4180-af1e-57f3aa62fb04", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
107Neils Capelen+86 (361) 914-87346
114Corine Dunseith+63 (797) 344-25718
144Corine Dunseith+63 (797) 344-25718
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "10 7 Neils Capelen +86 (361) 914-8734 6\n", "11 4 Corine Dunseith +63 (797) 344-2571 8\n", "14 4 Corine Dunseith +63 (797) 344-2571 8" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Visualización de los registros duplicados\n", "#\n", "df[df.duplicated(keep='first')]" ] }, { "cell_type": "code", "execution_count": 6, "id": "ed525d86-8638-4b4a-b466-6f09006147ca", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
34Corine Dunseith+63 (797) 344-25718
67Neils Capelen+86 (361) 914-87346
114Corine Dunseith+63 (797) 344-25718
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "3 4 Corine Dunseith +63 (797) 344-2571 8\n", "6 7 Neils Capelen +86 (361) 914-8734 6\n", "11 4 Corine Dunseith +63 (797) 344-2571 8" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Visualización de los registros duplicados\n", "#\n", "df[df.duplicated(keep='last')]" ] }, { "cell_type": "code", "execution_count": 7, "id": "5af8c805-8012-4a37-8ed2-1255d8e4a2ab", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
12Gaven Brito+51 (870) 799-15087
34Corine Dunseith+63 (797) 344-25718
45Ramon Lawrence+66 (557) 865-38456
67Neils Capelen+86 (361) 914-87346
114Corine Dunseith+63 (797) 344-25718
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "1 2 Gaven Brito +51 (870) 799-1508 7\n", "3 4 Corine Dunseith +63 (797) 344-2571 8\n", "4 5 Ramon Lawrence +66 (557) 865-3845 6\n", "6 7 Neils Capelen +86 (361) 914-8734 6\n", "11 4 Corine Dunseith +63 (797) 344-2571 8" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Visualización de los registros duplicados\n", "#\n", "df[\n", " df.duplicated(\n", " subset=[\"clientId\", \"name\"],\n", " keep=\"last\",\n", " )\n", "]" ] }, { "cell_type": "code", "execution_count": 8, "id": "30ea6a07-4b86-4eb6-b59c-48294c5f471b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
12Gaven Brito+51 (870) 799-15087
122Gaven Brito+51 (870) 799-23082
34Corine Dunseith+63 (797) 344-25718
114Corine Dunseith+63 (797) 344-25718
144Corine Dunseith+63 (797) 344-25718
45Ramon Lawrence+66 (557) 865-38456
135Ramon Lawrence+66 (557) 061-38445
67Neils Capelen+86 (361) 914-87346
107Neils Capelen+86 (361) 914-87346
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "1 2 Gaven Brito +51 (870) 799-1508 7\n", "12 2 Gaven Brito +51 (870) 799-2308 2\n", "3 4 Corine Dunseith +63 (797) 344-2571 8\n", "11 4 Corine Dunseith +63 (797) 344-2571 8\n", "14 4 Corine Dunseith +63 (797) 344-2571 8\n", "4 5 Ramon Lawrence +66 (557) 865-3845 6\n", "13 5 Ramon Lawrence +66 (557) 061-3844 5\n", "6 7 Neils Capelen +86 (361) 914-8734 6\n", "10 7 Neils Capelen +86 (361) 914-8734 6" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Visualización de los registros duplicados\n", "#\n", "df[\n", " df.duplicated(\n", " subset=[\"clientId\", \"name\"],\n", " keep=False,\n", " )\n", "].sort_values(by=['clientId', 'name'])" ] }, { "cell_type": "code", "execution_count": 9, "id": "54f51e1a-fb0b-49f9-8b36-735b31123685", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
107Neils Capelen+86 (361) 914-87346
114Corine Dunseith+63 (797) 344-25718
144Corine Dunseith+63 (797) 344-25718
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "10 7 Neils Capelen +86 (361) 914-8734 6\n", "11 4 Corine Dunseith +63 (797) 344-2571 8\n", "14 4 Corine Dunseith +63 (797) 344-2571 8" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Borrado de registros duplicados\n", "#\n", "df[df.duplicated()]" ] }, { "cell_type": "code", "execution_count": 10, "id": "f1e25ff4-7a18-4b58-8afd-1f3097202022", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
clientIdnamephonecompanyId
01Hersch Szymanowicz+237 (561) 702-41183
12Gaven Brito+51 (870) 799-15087
23Maressa Pavlishchev+62 (350) 377-86214
34Corine Dunseith+63 (797) 344-25718
45Ramon Lawrence+66 (557) 865-38456
56Ibbie Whitehouse+98 (495) 896-64081
67Neils Capelen+86 (361) 914-87346
78Thia Malkie+46 (564) 145-89974
89Missy Folomkin+33 (962) 798-07767
910Eleanor Gallamore+86 (366) 702-23345
122Gaven Brito+51 (870) 799-23082
135Ramon Lawrence+66 (557) 061-38445
\n", "
" ], "text/plain": [ " clientId name phone companyId\n", "0 1 Hersch Szymanowicz +237 (561) 702-4118 3\n", "1 2 Gaven Brito +51 (870) 799-1508 7\n", "2 3 Maressa Pavlishchev +62 (350) 377-8621 4\n", "3 4 Corine Dunseith +63 (797) 344-2571 8\n", "4 5 Ramon Lawrence +66 (557) 865-3845 6\n", "5 6 Ibbie Whitehouse +98 (495) 896-6408 1\n", "6 7 Neils Capelen +86 (361) 914-8734 6\n", "7 8 Thia Malkie +46 (564) 145-8997 4\n", "8 9 Missy Folomkin +33 (962) 798-0776 7\n", "9 10 Eleanor Gallamore +86 (366) 702-2334 5\n", "12 2 Gaven Brito +51 (870) 799-2308 2\n", "13 5 Ramon Lawrence +66 (557) 061-3844 5" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.drop_duplicates(inplace=True)\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 5 }