{ "cells": [ { "cell_type": "markdown", "id": "024c2a6f-c81f-4f67-a0ef-411d508c22c3", "metadata": { "tags": [] }, "source": [ "Unificación de cadenas de texto --- 12:21 min\n", "===\n", "\n", "* 12:21 min | Última modificación: Octubre 14, 2021 | [YouTube](https://youtu.be/aITe9eyuAj8)" ] }, { "cell_type": "markdown", "id": "69c5b0bc-dd91-40cd-a547-785b58e8732d", "metadata": {}, "source": [ "**Tipos de problemas**\n", "\n", "* Errores de digitación.\n", "\n", "* Cumplimiento con un patrón (e-mails, telefonos, direcciones, ...)\n", "\n", "* Longitud mínima\n", "\n", "* Errores tipográficos" ] }, { "cell_type": "markdown", "id": "e9fe945f-c525-4adf-bcba-1ac68ab74a87", "metadata": {}, "source": [ "Errores de formato\n", "---" ] }, { "cell_type": "code", "execution_count": 1, "id": "ecb87dca-1c8b-4191-b31e-9075269c1624", "metadata": {}, "outputs": [], "source": [ "import re\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "pd.set_option(\"display.notebook_repr_html\", False)" ] }, { "cell_type": "markdown", "id": "b76067fe-0a4c-44be-bd8f-26556fa3f411", "metadata": {}, "source": [ "El formato correcto del número telefónico es:\n", "\n", " 001##########" ] }, { "cell_type": "code", "execution_count": 2, "id": "ad04c86f-6385-4212-a5bd-1bea3fe6b298", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/data.csv\n" ] } ], "source": [ "%%writefile /tmp/data.csv\n", "phone\n", "+001-693-708-9470\n", "1-881-785-8561\n", "+742-672-3143\n", "+1.235.809.0341\n", "+1 (808) 6256866\n", "1 (993) 7090910\n", "(858) 6725653\n", "11 930 5833\n", "+228 415 7806\n", "892 144 6773\n", "38728" ] }, { "cell_type": "code", "execution_count": 3, "id": "32c413a2-da6a-49e3-a711-48b4ca431477", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "0 +001-693-708-9470\n", "1 1-881-785-8561\n", "2 +742-672-3143\n", "3 +1.235.809.0341\n", "4 +1 (808) 6256866\n", "5 1 (993) 7090910\n", "6 (858) 6725653\n", "7 11 930 5833\n", "8 +228 415 7806\n", "9 892 144 6773\n", "10 38728" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " phone\n", "6 (858) 6725653\n", "0 +001-693-708-9470\n", "4 +1 (808) 6256866\n", "3 +1.235.809.0341\n", "8 +228 415 7806\n", "2 +742-672-3143\n", "5 1 (993) 7090910\n", "1 1-881-785-8561\n", "7 11 930 5833\n", "10 38728\n", "9 892 144 6773" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "''" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ " phone\n", "10 38728\n", "7 11 930 5833\n", "9 892 144 6773\n", "2 +742-672-3143\n", "6 (858) 6725653\n", "8 +228 415 7806\n", "1 1-881-785-8561\n", "3 +1.235.809.0341\n", "5 1 (993) 7090910\n", "4 +1 (808) 6256866\n", "0 +001-693-708-9470" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df = pd.read_csv('/tmp/data.csv')\n", "\n", "#\n", "# Resulta más apropiado visualizar la columna\n", "# ordenada formando bloques\n", "#\n", "display(\n", " df,\n", " '',\n", " df.sort_values('phone'),\n", " '',\n", " df.sort_values('phone', key=lambda series: [len(x) for x in series]),\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "f5275474-d998-4695-8f3f-8f051590b3c6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "7 11 930 5833\n", "2 742-672-3143\n", "8 228 415 7806\n", "9 892 144 6773\n", "6 (858) 6725653\n", "1 1-881-785-8561\n", "3 1.235.809.0341\n", "4 1 (808) 6256866\n", "5 1 (993) 7090910\n", "0 001-693-708-9470" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Eliminación del '+'\n", "#\n", "df.phone = df.phone.str.replace('+','')\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 5, "id": "640997c0-84fb-4e1e-bd7b-33e0927b9107", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "7 11 930 5833\n", "2 742-672-3143\n", "8 228 415 7806\n", "9 892 144 6773\n", "6 (858) 6725653\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "3 001.235.809.0341\n", "4 001 (808) 6256866\n", "5 001 (993) 7090910" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Corrección de los números con \"1...\"\n", "#\n", "df.phone = df.phone.map(lambda x: \"00\" + x if re.search(r\"^1[\\s\\.-]\", x) else x)\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 6, "id": "505d9aa2-1a0e-4399-80bd-cae055c68db1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "7 11 930 5833\n", "2 742-672-3143\n", "8 228 415 7806\n", "9 892 144 6773\n", "6 (858) 6725653\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "3 001-235-809-0341\n", "4 001 (808) 6256866\n", "5 001 (993) 7090910" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Reemplazo del '.'\n", "#\n", "df.phone = df.phone.str.replace('.','-')\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 7, "id": "5319a067-417d-4422-b12c-e437d749d199", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "7 11-930-5833\n", "2 742-672-3143\n", "8 228-415-7806\n", "9 892-144-6773\n", "6 (858)-6725653\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "3 001-235-809-0341\n", "4 001-(808)-6256866\n", "5 001-(993)-7090910" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Reemplazo espacios en blanco\n", "#\n", "df.phone = df.phone.str.replace(' ','-')\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 8, "id": "dc272adc-c6e7-41ea-a115-bd48b66a90e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "6 858-6725653\n", "7 11-930-5833\n", "2 742-672-3143\n", "8 228-415-7806\n", "9 892-144-6773\n", "4 001-808-6256866\n", "5 001-993-7090910\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "3 001-235-809-0341" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Remoción paréntesis\n", "#\n", "df.phone = df.phone.str.replace('[()]','')\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 9, "id": "18f5f0be-252c-4fcb-bba8-2c6526198775", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "6 858-6725653\n", "7 11-930-5833\n", "4 001-808-6256866\n", "5 001-993-7090910\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "2 001-742-672-3143\n", "3 001-235-809-0341\n", "8 001-228-415-7806\n", "9 001-892-144-6773" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Adición del 001 inicial\n", "#\n", "df.phone = df.phone.map(lambda x: \"001-\" + x if re.search(r\"^\\d{3}-\\d{3}-\\d{4}$\", x) else x)\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 10, "id": "607e2011-b3fb-4d30-bd7d-2fbc62ddb67a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "7 11-930-5833\n", "4 001-808-6256866\n", "5 001-993-7090910\n", "6 001-858-6725653\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "2 001-742-672-3143\n", "3 001-235-809-0341\n", "8 001-228-415-7806\n", "9 001-892-144-6773" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Segundo registro de la tabla anterior\n", "#\n", "df.phone = df.phone.map(lambda x: '001-' + x if re.search(r\"^\\d{3}-\\d{7}$\", x) else x)\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 11, "id": "fadff965-92a2-44c9-bd6d-78ec37a61629", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "4 001-808-6256866\n", "5 001-993-7090910\n", "6 001-858-6725653\n", "0 001-693-708-9470\n", "1 001-881-785-8561\n", "2 001-742-672-3143\n", "3 001-235-809-0341\n", "7 001-011-930-5833\n", "8 001-228-415-7806\n", "9 001-892-144-6773" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Segundo registro de la tabla anterior\n", "#\n", "df.phone = df.phone.map(lambda x: '001-0' + x if re.search(r\"^\\d{2}-\\d{3}-\\d{4}$\", x) else x)\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 12, "id": "54cdb7c2-e471-4e9c-849f-254eba854477", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "10 38728\n", "0 0016937089470\n", "1 0018817858561\n", "2 0017426723143\n", "3 0012358090341\n", "4 0018086256866\n", "5 0019937090910\n", "6 0018586725653\n", "7 0010119305833\n", "8 0012284157806\n", "9 0018921446773" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Se corrigen los '-' donde es posible\n", "#\n", "df.phone = df.phone.map(lambda x: x.replace('-','') if re.search(r\"^001-\\d{3}-\\d{3}-\\d{4}$\", x) else x)\n", "df.phone = df.phone.map(lambda x: x.replace('-','') if re.search(r\"^001-\\d{3}-\\d{7}$\", x) else x)\n", "df.sort_values('phone', key=lambda series: [len(x) for x in series])" ] }, { "cell_type": "code", "execution_count": 13, "id": "d6f142f3-b607-4654-a5b0-9dfd26c57fae", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " phone\n", "0 0016937089470\n", "1 0018817858561\n", "2 0017426723143\n", "3 0012358090341\n", "4 0018086256866\n", "5 0019937090910\n", "6 0018586725653\n", "7 0010119305833\n", "8 0012284157806\n", "9 0018921446773\n", "10 NaN" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Se reemplazan los números invalidos por Nan\n", "#\n", "df.phone = df.phone.map(lambda x: np.nan if len(x) != 13 else x)\n", "df" ] }, { "cell_type": "markdown", "id": "6fd014e7-6e41-40d9-8d64-95ac635caf38", "metadata": {}, "source": [ "Comparación de cadenas de texto\n", "---" ] }, { "cell_type": "code", "execution_count": 14, "id": "9c1e3856-f524-4e66-8db7-beef6f9aa931", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n" ] } ], "source": [ "!pip3 install --quiet fuzzywuzzy\n", "!pip3 install --quiet fuzzywuzzy python-Levenshtein" ] }, { "cell_type": "code", "execution_count": 15, "id": "d03592a5-6607-4f5e-8bec-5ef2ff7b9b72", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "100" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "100" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "100" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "100" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from fuzzywuzzy import fuzz, process\n", "\n", "#\n", "# Comparación de strings\n", "#\n", "display(\n", " fuzz.WRatio('hola mundo', 'hola mundo'),\n", " fuzz.WRatio('hola mundo', 'Hola mundo'),\n", " fuzz.WRatio('hola mundo', 'Hola Mundo'),\n", " fuzz.WRatio('hola mundo', 'hola mundo!')\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "id": "0967bc95-b9f4-4761-84d8-e808b8ea19ab", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "100" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "100" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "95" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "91" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "87" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "90" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "90" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "#\n", "# Comparación de strings\n", "#\n", "display(\n", " fuzz.WRatio('hola mundo!', 'hola mundo'),\n", " fuzz.WRatio('hola mundo', 'Hola mundo!'),\n", " fuzz.WRatio('hola mundo', 'Hola mundoo'),\n", " fuzz.WRatio('hola mundo', 'hola mundooo'),\n", " fuzz.WRatio('hola mundo', 'hola mundoooo'),\n", " fuzz.WRatio('hola mundo', 'hola'),\n", " fuzz.WRatio('hola', 'hola mundo')\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "id": "a48fb0f2-c84b-4c4f-a963-9011855c308b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Hola mundo', 100, 0),\n", " ('mundo', 90, 1),\n", " ('Hola', 90, 2),\n", " ('Hola mundo cruel!', 90, 3)]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Comparación con arrays\n", "#\n", "base_string = \"hola mundo\"\n", "array = pd.Series(\n", " [\n", " \"Hola mundo\",\n", " \"mundo\",\n", " \"Hola\",\n", " \"Hola mundo cruel!\",\n", " ]\n", ")\n", "\n", "\n", "process.extract(\n", " base_string,\n", " array,\n", ")" ] }, { "cell_type": "markdown", "id": "d158958c-be9a-4990-b2eb-1e629cdd9677", "metadata": {}, "source": [ "Corrección y unificación básica cuando se conoce la clave\n", "----" ] }, { "cell_type": "code", "execution_count": 18, "id": "f781e877-7ed0-46bf-b888-2627bb439783", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting /tmp/data.csv\n" ] } ], "source": [ "%%writefile /tmp/data.csv\n", "ciudad\n", "Medellin\n", "Bogota\n", "bogota\n", "bogote\n", "Mdellin\n", "medellin\n", "Cali\n", "cali\n", "CALI" ] }, { "cell_type": "code", "execution_count": 19, "id": "cb38473d-8053-41bb-b12a-cdba8ac36ecf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " ciudad\n", "0 Medellin\n", "1 Bogota\n", "2 bogota\n", "3 bogote\n", "4 Mdellin\n", "5 medellin\n", "6 Cali\n", "7 cali\n", "8 CALI" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('/tmp/data.csv')\n", "df" ] }, { "cell_type": "code", "execution_count": 20, "id": "3fc1c4af-5bfc-4880-9d86-6bcecf7b6c2f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ " ciudad ciudad_\n", "0 Medellin Medellin\n", "1 Bogota Bogota\n", "2 bogota Bogota\n", "3 bogote Bogota\n", "4 Mdellin Medellin\n", "5 medellin Medellin\n", "6 Cali Cali\n", "7 cali Cali\n", "8 CALI Cali" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#\n", "# Nombres validos\n", "#\n", "valid_names = [\"Medellin\", \"Bogota\", \"Cali\"]\n", "df[\"ciudad_\"] = df.ciudad.copy()\n", "\n", "#\n", "# Valor mínimo de similitud para hacer el cambio\n", "#\n", "min_threshold = 80\n", "\n", "#\n", "# Estructuta básica\n", "#\n", "for valid_name in valid_names:\n", "\n", " potential_matches = process.extract(\n", " valid_name,\n", " df.ciudad,\n", " limit=df.shape[0],\n", " )\n", "\n", " for potential_match in potential_matches:\n", "\n", " if potential_match[1] >= min_threshold:\n", "\n", " df.loc[df.ciudad == potential_match[0], \"ciudad_\"] = valid_name\n", "\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 5 }