Unificación de cadenas de texto — 12:21 min

  • 12:21 min | Última modificación: Octubre 14, 2021 | YouTube

Tipos de problemas

  • Errores de digitación.

  • Cumplimiento con un patrón (e-mails, telefonos, direcciones, …)

  • Longitud mínima

  • Errores tipográficos

Errores de formato

[1]:
import re

import numpy as np
import pandas as pd

pd.set_option("display.notebook_repr_html", False)

El formato correcto del número telefónico es:

001##########
[2]:
%%writefile /tmp/data.csv
phone
+001-693-708-9470
1-881-785-8561
+742-672-3143
+1.235.809.0341
+1 (808) 6256866
1 (993) 7090910
(858) 6725653
11 930 5833
+228 415 7806
892 144 6773
38728
Overwriting /tmp/data.csv
[3]:
df = pd.read_csv('/tmp/data.csv')

#
# Resulta más apropiado visualizar la columna
# ordenada formando bloques
#
display(
    df,
    '',
    df.sort_values('phone'),
    '',
    df.sort_values('phone', key=lambda series: [len(x) for x in series]),
)
                phone
0   +001-693-708-9470
1      1-881-785-8561
2       +742-672-3143
3     +1.235.809.0341
4    +1 (808) 6256866
5     1 (993) 7090910
6       (858) 6725653
7         11 930 5833
8       +228 415 7806
9        892 144 6773
10              38728
''
                phone
6       (858) 6725653
0   +001-693-708-9470
4    +1 (808) 6256866
3     +1.235.809.0341
8       +228 415 7806
2       +742-672-3143
5     1 (993) 7090910
1      1-881-785-8561
7         11 930 5833
10              38728
9        892 144 6773
''
                phone
10              38728
7         11 930 5833
9        892 144 6773
2       +742-672-3143
6       (858) 6725653
8       +228 415 7806
1      1-881-785-8561
3     +1.235.809.0341
5     1 (993) 7090910
4    +1 (808) 6256866
0   +001-693-708-9470
[4]:
#
# Eliminación del '+'
#
df.phone = df.phone.str.replace('+','')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[4]:
               phone
10             38728
7        11 930 5833
2       742-672-3143
8       228 415 7806
9       892 144 6773
6      (858) 6725653
1     1-881-785-8561
3     1.235.809.0341
4    1 (808) 6256866
5    1 (993) 7090910
0   001-693-708-9470
[5]:
#
# Corrección de los números con "1..."
#
df.phone = df.phone.map(lambda x: "00" + x if re.search(r"^1[\s\.-]", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[5]:
                phone
10              38728
7         11 930 5833
2        742-672-3143
8        228 415 7806
9        892 144 6773
6       (858) 6725653
0    001-693-708-9470
1    001-881-785-8561
3    001.235.809.0341
4   001 (808) 6256866
5   001 (993) 7090910
[6]:
#
# Reemplazo del '.'
#
df.phone = df.phone.str.replace('.','-')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[6]:
                phone
10              38728
7         11 930 5833
2        742-672-3143
8        228 415 7806
9        892 144 6773
6       (858) 6725653
0    001-693-708-9470
1    001-881-785-8561
3    001-235-809-0341
4   001 (808) 6256866
5   001 (993) 7090910
[7]:
#
# Reemplazo espacios en blanco
#
df.phone = df.phone.str.replace(' ','-')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[7]:
                phone
10              38728
7         11-930-5833
2        742-672-3143
8        228-415-7806
9        892-144-6773
6       (858)-6725653
0    001-693-708-9470
1    001-881-785-8561
3    001-235-809-0341
4   001-(808)-6256866
5   001-(993)-7090910
[8]:
#
# Remoción paréntesis
#
df.phone = df.phone.str.replace('[()]','')
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[8]:
               phone
10             38728
6        858-6725653
7        11-930-5833
2       742-672-3143
8       228-415-7806
9       892-144-6773
4    001-808-6256866
5    001-993-7090910
0   001-693-708-9470
1   001-881-785-8561
3   001-235-809-0341
[9]:
#
# Adición del 001 inicial
#
df.phone = df.phone.map(lambda x: "001-" + x if re.search(r"^\d{3}-\d{3}-\d{4}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[9]:
               phone
10             38728
6        858-6725653
7        11-930-5833
4    001-808-6256866
5    001-993-7090910
0   001-693-708-9470
1   001-881-785-8561
2   001-742-672-3143
3   001-235-809-0341
8   001-228-415-7806
9   001-892-144-6773
[10]:
#
# Segundo registro de la tabla anterior
#
df.phone = df.phone.map(lambda x: '001-' + x if re.search(r"^\d{3}-\d{7}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[10]:
               phone
10             38728
7        11-930-5833
4    001-808-6256866
5    001-993-7090910
6    001-858-6725653
0   001-693-708-9470
1   001-881-785-8561
2   001-742-672-3143
3   001-235-809-0341
8   001-228-415-7806
9   001-892-144-6773
[11]:
#
# Segundo registro de la tabla anterior
#
df.phone = df.phone.map(lambda x: '001-0' + x if re.search(r"^\d{2}-\d{3}-\d{4}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[11]:
               phone
10             38728
4    001-808-6256866
5    001-993-7090910
6    001-858-6725653
0   001-693-708-9470
1   001-881-785-8561
2   001-742-672-3143
3   001-235-809-0341
7   001-011-930-5833
8   001-228-415-7806
9   001-892-144-6773
[12]:
#
# Se corrigen los '-' donde es posible
#
df.phone = df.phone.map(lambda x: x.replace('-','') if re.search(r"^001-\d{3}-\d{3}-\d{4}$", x) else x)
df.phone = df.phone.map(lambda x: x.replace('-','') if re.search(r"^001-\d{3}-\d{7}$", x) else x)
df.sort_values('phone', key=lambda series: [len(x) for x in series])
[12]:
            phone
10          38728
0   0016937089470
1   0018817858561
2   0017426723143
3   0012358090341
4   0018086256866
5   0019937090910
6   0018586725653
7   0010119305833
8   0012284157806
9   0018921446773
[13]:
#
# Se reemplazan los números invalidos por Nan
#
df.phone = df.phone.map(lambda x: np.nan if len(x) != 13 else x)
df
[13]:
            phone
0   0016937089470
1   0018817858561
2   0017426723143
3   0012358090341
4   0018086256866
5   0019937090910
6   0018586725653
7   0010119305833
8   0012284157806
9   0018921446773
10            NaN

Comparación de cadenas de texto

[14]:
!pip3 install --quiet fuzzywuzzy
!pip3 install --quiet fuzzywuzzy python-Levenshtein
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
[15]:
from fuzzywuzzy import fuzz, process

#
# Comparación de strings
#
display(
    fuzz.WRatio('hola mundo', 'hola mundo'),
    fuzz.WRatio('hola mundo', 'Hola mundo'),
    fuzz.WRatio('hola mundo', 'Hola Mundo'),
    fuzz.WRatio('hola mundo', 'hola mundo!')
)
100
100
100
100
[16]:
#
# Comparación de strings
#
display(
    fuzz.WRatio('hola mundo!', 'hola mundo'),
    fuzz.WRatio('hola mundo', 'Hola mundo!'),
    fuzz.WRatio('hola mundo', 'Hola mundoo'),
    fuzz.WRatio('hola mundo', 'hola mundooo'),
    fuzz.WRatio('hola mundo', 'hola mundoooo'),
    fuzz.WRatio('hola mundo', 'hola'),
    fuzz.WRatio('hola', 'hola mundo')
)
100
100
95
91
87
90
90
[17]:
#
# Comparación con arrays
#
base_string = "hola mundo"
array = pd.Series(
    [
        "Hola mundo",
        "mundo",
        "Hola",
        "Hola mundo cruel!",
    ]
)


process.extract(
    base_string,
    array,
)
[17]:
[('Hola mundo', 100, 0),
 ('mundo', 90, 1),
 ('Hola', 90, 2),
 ('Hola mundo cruel!', 90, 3)]

Corrección y unificación básica cuando se conoce la clave

[18]:
%%writefile /tmp/data.csv
ciudad
Medellin
Bogota
bogota
bogote
Mdellin
medellin
Cali
cali
CALI
Overwriting /tmp/data.csv
[19]:
df = pd.read_csv('/tmp/data.csv')
df
[19]:
     ciudad
0  Medellin
1    Bogota
2    bogota
3    bogote
4   Mdellin
5  medellin
6      Cali
7      cali
8      CALI
[20]:
#
# Nombres validos
#
valid_names = ["Medellin", "Bogota", "Cali"]
df["ciudad_"] = df.ciudad.copy()

#
# Valor mínimo de similitud para hacer el cambio
#
min_threshold = 80

#
# Estructuta básica
#
for valid_name in valid_names:

    potential_matches = process.extract(
        valid_name,
        df.ciudad,
        limit=df.shape[0],
    )

    for potential_match in potential_matches:

        if potential_match[1] >= min_threshold:

            df.loc[df.ciudad == potential_match[0], "ciudad_"] = valid_name

df
[20]:
     ciudad   ciudad_
0  Medellin  Medellin
1    Bogota    Bogota
2    bogota    Bogota
3    bogote    Bogota
4   Mdellin  Medellin
5  medellin  Medellin
6      Cali      Cali
7      cali      Cali
8      CALI      Cali