Unificación de cadenas de texto — 12:21 min
12:21 min | Última modificación: Octubre 14, 2021 | YouTube
Tipos de problemas
Errores de digitación.
Cumplimiento con un patrón (e-mails, telefonos, direcciones, …)
Longitud mínima
Errores tipográficos
Errores de formato
[1]:
import re
import numpy as np
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
El formato correcto del número telefónico es:
001##########
[2]:
%%writefile /tmp/data.csv
phone
+001-693-708-9470
1-881-785-8561
+742-672-3143
+1.235.809.0341
+1 (808) 6256866
1 (993) 7090910
(858) 6725653
11 930 5833
+228 415 7806
892 144 6773
38728
Overwriting /tmp/data.csv
[3]:
df = pd.read_csv('/tmp/data.csv')
#
# Resulta más apropiado visualizar la columna
# ordenada formando bloques
#
display(
df,
'',
df.sort_values('phone'),
'',
df.sort_values('phone', key=lambda series: [len(x) for x in series]),
)
phone
0 +001-693-708-9470
1 1-881-785-8561
2 +742-672-3143
3 +1.235.809.0341
4 +1 (808) 6256866
5 1 (993) 7090910
6 (858) 6725653
7 11 930 5833
8 +228 415 7806
9 892 144 6773
10 38728
''
phone
6 (858) 6725653
0 +001-693-708-9470
4 +1 (808) 6256866
3 +1.235.809.0341
8 +228 415 7806
2 +742-672-3143
5 1 (993) 7090910
1 1-881-785-8561
7 11 930 5833
10 38728
9 892 144 6773
''
phone
10 38728
7 11 930 5833
9 892 144 6773
2 +742-672-3143
6 (858) 6725653
8 +228 415 7806
1 1-881-785-8561
3 +1.235.809.0341
5 1 (993) 7090910
4 +1 (808) 6256866
0 +001-693-708-9470
[4]:
#
# Eliminación del '+'
#
df.phone = df.phone.str.replace('+','')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[4]:
phone
10 38728
7 11 930 5833
2 742-672-3143
8 228 415 7806
9 892 144 6773
6 (858) 6725653
1 1-881-785-8561
3 1.235.809.0341
4 1 (808) 6256866
5 1 (993) 7090910
0 001-693-708-9470
[5]:
#
# Corrección de los números con "1..."
#
df.phone = df.phone.map(lambda x: "00" + x if re.search(r"^1[\s\.-]", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[5]:
phone
10 38728
7 11 930 5833
2 742-672-3143
8 228 415 7806
9 892 144 6773
6 (858) 6725653
0 001-693-708-9470
1 001-881-785-8561
3 001.235.809.0341
4 001 (808) 6256866
5 001 (993) 7090910
[6]:
#
# Reemplazo del '.'
#
df.phone = df.phone.str.replace('.','-')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[6]:
phone
10 38728
7 11 930 5833
2 742-672-3143
8 228 415 7806
9 892 144 6773
6 (858) 6725653
0 001-693-708-9470
1 001-881-785-8561
3 001-235-809-0341
4 001 (808) 6256866
5 001 (993) 7090910
[7]:
#
# Reemplazo espacios en blanco
#
df.phone = df.phone.str.replace(' ','-')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[7]:
phone
10 38728
7 11-930-5833
2 742-672-3143
8 228-415-7806
9 892-144-6773
6 (858)-6725653
0 001-693-708-9470
1 001-881-785-8561
3 001-235-809-0341
4 001-(808)-6256866
5 001-(993)-7090910
[8]:
#
# Remoción paréntesis
#
df.phone = df.phone.str.replace('[()]','')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[8]:
phone
10 38728
6 858-6725653
7 11-930-5833
2 742-672-3143
8 228-415-7806
9 892-144-6773
4 001-808-6256866
5 001-993-7090910
0 001-693-708-9470
1 001-881-785-8561
3 001-235-809-0341
[9]:
#
# Adición del 001 inicial
#
df.phone = df.phone.map(lambda x: "001-" + x if re.search(r"^\d{3}-\d{3}-\d{4}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[9]:
phone
10 38728
6 858-6725653
7 11-930-5833
4 001-808-6256866
5 001-993-7090910
0 001-693-708-9470
1 001-881-785-8561
2 001-742-672-3143
3 001-235-809-0341
8 001-228-415-7806
9 001-892-144-6773
[10]:
#
# Segundo registro de la tabla anterior
#
df.phone = df.phone.map(lambda x: '001-' + x if re.search(r"^\d{3}-\d{7}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[10]:
phone
10 38728
7 11-930-5833
4 001-808-6256866
5 001-993-7090910
6 001-858-6725653
0 001-693-708-9470
1 001-881-785-8561
2 001-742-672-3143
3 001-235-809-0341
8 001-228-415-7806
9 001-892-144-6773
[11]:
#
# Segundo registro de la tabla anterior
#
df.phone = df.phone.map(lambda x: '001-0' + x if re.search(r"^\d{2}-\d{3}-\d{4}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[11]:
phone
10 38728
4 001-808-6256866
5 001-993-7090910
6 001-858-6725653
0 001-693-708-9470
1 001-881-785-8561
2 001-742-672-3143
3 001-235-809-0341
7 001-011-930-5833
8 001-228-415-7806
9 001-892-144-6773
[12]:
#
# Se corrigen los '-' donde es posible
#
df.phone = df.phone.map(lambda x: x.replace('-','') if re.search(r"^001-\d{3}-\d{3}-\d{4}$", x) else x)
df.phone = df.phone.map(lambda x: x.replace('-','') if re.search(r"^001-\d{3}-\d{7}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[12]:
phone
10 38728
0 0016937089470
1 0018817858561
2 0017426723143
3 0012358090341
4 0018086256866
5 0019937090910
6 0018586725653
7 0010119305833
8 0012284157806
9 0018921446773
[13]:
#
# Se reemplazan los números invalidos por Nan
#
df.phone = df.phone.map(lambda x: np.nan if len(x) != 13 else x)
df
[13]:
phone
0 0016937089470
1 0018817858561
2 0017426723143
3 0012358090341
4 0018086256866
5 0019937090910
6 0018586725653
7 0010119305833
8 0012284157806
9 0018921446773
10 NaN
Comparación de cadenas de texto
[14]:
!pip3 install --quiet fuzzywuzzy
!pip3 install --quiet fuzzywuzzy python-Levenshtein
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[15]:
from fuzzywuzzy import fuzz, process
#
# Comparación de strings
#
display(
fuzz.WRatio('hola mundo', 'hola mundo'),
fuzz.WRatio('hola mundo', 'Hola mundo'),
fuzz.WRatio('hola mundo', 'Hola Mundo'),
fuzz.WRatio('hola mundo', 'hola mundo!')
)
100
100
100
100
[16]:
#
# Comparación de strings
#
display(
fuzz.WRatio('hola mundo!', 'hola mundo'),
fuzz.WRatio('hola mundo', 'Hola mundo!'),
fuzz.WRatio('hola mundo', 'Hola mundoo'),
fuzz.WRatio('hola mundo', 'hola mundooo'),
fuzz.WRatio('hola mundo', 'hola mundoooo'),
fuzz.WRatio('hola mundo', 'hola'),
fuzz.WRatio('hola', 'hola mundo')
)
100
100
95
91
87
90
90
[17]:
#
# Comparación con arrays
#
base_string = "hola mundo"
array = pd.Series(
[
"Hola mundo",
"mundo",
"Hola",
"Hola mundo cruel!",
]
)
process.extract(
base_string,
array,
)
[17]:
[('Hola mundo', 100, 0),
('mundo', 90, 1),
('Hola', 90, 2),
('Hola mundo cruel!', 90, 3)]
Corrección y unificación básica cuando se conoce la clave
[18]:
%%writefile /tmp/data.csv
ciudad
Medellin
Bogota
bogota
bogote
Mdellin
medellin
Cali
cali
CALI
Overwriting /tmp/data.csv
[19]:
df = pd.read_csv('/tmp/data.csv')
df
[19]:
ciudad
0 Medellin
1 Bogota
2 bogota
3 bogote
4 Mdellin
5 medellin
6 Cali
7 cali
8 CALI
[20]:
#
# Nombres validos
#
valid_names = ["Medellin", "Bogota", "Cali"]
df["ciudad_"] = df.ciudad.copy()
#
# Valor mínimo de similitud para hacer el cambio
#
min_threshold = 80
#
# Estructuta básica
#
for valid_name in valid_names:
potential_matches = process.extract(
valid_name,
df.ciudad,
limit=df.shape[0],
)
for potential_match in potential_matches:
if potential_match[1] >= min_threshold:
df.loc[df.ciudad == potential_match[0], "ciudad_"] = valid_name
df
[20]:
ciudad ciudad_
0 Medellin Medellin
1 Bogota Bogota
2 bogota Bogota
3 bogote Bogota
4 Mdellin Medellin
5 medellin Medellin
6 Cali Cali
7 cali Cali
8 CALI Cali