Expresiones regulares en NLTK

  • 90 min | Última modificación: Diciembre 3, 2020

import re

import nltk

# Sorted, de-duplicated vocabulary of the NLTK treebank corpus sample.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Decimal numbers: digits, a literal dot, digits. The pattern is a raw string
# so that `\.` reaches the regex engine instead of being an invalid str escape.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][:10]
['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']
# Tokens made of uppercase letters followed by a literal dollar sign at the end.
# Raw string: `\$` must be passed to the regex engine, not parsed as a str escape.
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]
['C$', 'US$']
# Tokens that are exactly four digits.
[w for w in wsj if re.search(r'^[0-9]{4}$', w)][:10]
# One or more digits, a hyphen, then 3 to 5 lowercase letters.
[w for w in wsj if re.search(r'^[0-9]+-[a-z]{3,5}$', w)][:10]
# Three hyphen-separated lowercase parts: 5+ letters, 2-3 letters, and up to
# 6 letters (`{,6}` means zero to six repetitions).
[w for w in wsj if re.search(r'^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]
# Tokens ending in "ed" or "ing".
[w for w in wsj if re.search(r'(ed|ing)$', w)][:10]
# Sample word used to demonstrate `findall`.
word = 'supercalifragilisticexpialidocious'
# Every single lowercase vowel in the word, in order of appearance.
re.compile(r'[aeiou]').findall(word)
# Sorted, de-duplicated vocabulary of the NLTK treebank corpus sample.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Count every run of two or more lowercase vowels found inside the
# vocabulary words.
fd = nltk.FreqDist(
    vs
    for word in wsj
    for vs in re.findall(r'[aeiou]{2,}', word)
)
# Show the 12 most frequent vowel sequences (the list displayed below).
fd.most_common(12)
[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]
# Matches, in order of preference: a vowel run at the start of the string,
# a vowel run at the end of the string, or any single non-vowel character.
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    """Return *word* with its word-internal vowels removed.

    Leading and trailing vowel runs and every non-vowel character are kept,
    in their original order.
    """
    kept = (match.group() for match in re.finditer(regexp, word))
    return ''.join(kept)

# Words of the 'English-Latin1' file of the NLTK udhr corpus.
english_udhr = nltk.corpus.udhr.words('English-Latin1')
# Compress the first 75 tokens and wrap them into lines for display.
nltk.tokenwrap(map(compress, english_udhr[:75]))
'Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and\nof the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn\nof frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn\nrghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,\nand the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and'

from nltk.corpus import gutenberg

# Wrap the Moby Dick tokens in an nltk.Text to use its token-level `findall`.
moby_tokens = gutenberg.words('melville-moby_dick.txt')
moby = nltk.Text(moby_tokens)

# Capture the word XXX in every `a XXX man` token sequence.
moby.findall(r"<a> (<.*>) <man>")
monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave

from nltk.corpus import gutenberg, nps_chat

# Wrap the NPS chat tokens in an nltk.Text to use its token-level `findall`.
chat_tokens = nps_chat.words()
chat = nltk.Text(chat_tokens)
# Three-token sequences whose last token is `bro`.
chat.findall(r"<.*> <.*> <bro>")
you rule bro; telling you bro; u twizted bro
chat.findall(r"<l.*>{3,}")
lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la

from nltk.corpus import brown

# Tokens of the Brown corpus restricted to the 'hobbies' and 'learned' categories.
brown_tokens = brown.words(categories=['hobbies', 'learned'])
hobbies_learned = nltk.Text(brown_tokens)
# Four-token sequences of the form `X and other Ys`, where the last token ends in "s".
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals

Baby names exercise from Google Education

  • Descargue el archivo https://developers.google.com/edu/python/google-python-exercises.zip y descomprímalo.

  • Los archivos baby1990.html, baby1991.html contienen el código HTML de la página web que publica los nombres más populares para bebés nacidos en el correspondiente año.

  • Escriba una función que retorne una lista simple que contiene el año, y posteriormente el nombre y su posición. La lista solicitada debe presentar los nombres en orden alfabético y debe considerar simultáneamente los nombres de niños y niñas.