{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Expresiones regulares en NLTK\n", "\n", "* *90 min* | Última modificación: Diciembre 3, 2020" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package treebank to /root/nltk_data...\n", "[nltk_data] Package treebank is already up-to-date!\n" ] }, { "data": { "text/plain": [ "['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "\n", "import nltk\n", "\n", "nltk.download('treebank')\n", "wsj = sorted(set(nltk.corpus.treebank.words()))\n", "[w for w in wsj if re.search('^[0-9]+\\.[0-9]+$', w)][:10]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['C$', 'US$']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[w for w in wsj if re.search('^[A-Z]+\\$$', w)]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['1614',\n", " '1637',\n", " '1787',\n", " '1901',\n", " '1903',\n", " '1917',\n", " '1925',\n", " '1929',\n", " '1933',\n", " '1934']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[w for w in wsj if re.search('^[0-9]{4}$', w)][:10]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['10-day',\n", " '10-lap',\n", " '10-year',\n", " '100-share',\n", " '12-point',\n", " '12-year',\n", " '14-hour',\n", " '15-day',\n", " '150-point',\n", " '190-point']" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)][:10]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"['black-and-white',\n", " 'bread-and-butter',\n", " 'father-in-law',\n", " 'machine-gun-toting',\n", " 'savings-and-loan']" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['62%-owned',\n", " 'Absorbed',\n", " 'According',\n", " 'Adopting',\n", " 'Advanced',\n", " 'Advancing',\n", " 'Alfred',\n", " 'Allied',\n", " 'Annualized',\n", " 'Anything']" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[w for w in wsj if re.search('(ed|ing)$', w)][:10]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['u',\n", " 'e',\n", " 'a',\n", " 'i',\n", " 'a',\n", " 'i',\n", " 'i',\n", " 'i',\n", " 'e',\n", " 'i',\n", " 'a',\n", " 'i',\n", " 'o',\n", " 'i',\n", " 'o',\n", " 'u']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "word = 'supercalifragilisticexpialidocious'\n", "re.findall(r'[aeiou]', word)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('io', 549),\n", " ('ea', 476),\n", " ('ie', 331),\n", " ('ou', 329),\n", " ('ai', 261),\n", " ('ia', 253),\n", " ('ee', 217),\n", " ('oo', 174),\n", " ('ua', 109),\n", " ('au', 106),\n", " ('ue', 105),\n", " ('ui', 95)]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wsj = sorted(set(nltk.corpus.treebank.words()))\n", "fd = nltk.FreqDist(vs for word in wsj for vs in re.findall(r'[aeiou]{2,}', word))\n", "fd.most_common(12)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package udhr to /root/nltk_data...\n", "[nltk_data] Package udhr is already 
up-to-date!\n" ] }, { "data": { "text/plain": [ "'Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and\\nof the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn\\nof frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn\\nrghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,\\nand the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and'" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nltk.download('udhr')\n", "regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'\n", "def compress(word):\n", " pieces = re.findall(regexp, word)\n", " return ''.join(pieces)\n", "\n", "english_udhr = nltk.corpus.udhr.words('English-Latin1')\n", "nltk.tokenwrap(compress(w) for w in english_udhr[:75])" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package gutenberg to /root/nltk_data...\n", "[nltk_data] Package gutenberg is already up-to-date!\n", "monied; nervous; dangerous; white; white; white; pious; queer; good;\n", "mature; white; Cape; great; wise; wise; butterless; white; fiendish;\n", "pale; furious; better; certain; complete; dismasted; younger; brave;\n", "brave; brave; brave\n" ] } ], "source": [ "nltk.download('gutenberg')\n", "\n", "from nltk.corpus import gutenberg\n", "\n", "moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))\n", "\n", "##\n", "## Captura la palabra en `a XXX man` \n", "##\n", "moby.findall(r\"<a> (<.*>) <man>\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package nps_chat to /root/nltk_data...\n", "[nltk_data] Package nps_chat is already up-to-date!\n", "you rule bro; telling you bro; u twizted bro\n" ] } ], "source": [ "nltk.download('nps_chat')\n", "\n", "from nltk.corpus import gutenberg, nps_chat\n", "\n", 
"chat = nltk.Text(nps_chat.words())\n", "chat.findall(r\"<.*> <.*> <bro>\")" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la\n", "la la; lovely lol lol love; lol lol lol.; la la la; la la la\n" ] } ], "source": [ "chat.findall(r\"<l.*>{3,}\") " ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package brown to /root/nltk_data...\n", "[nltk_data] Package brown is already up-to-date!\n", "speed and other activities; water and other liquids; tomb and other\n", "landmarks; Statues and other monuments; pearls and other jewels;\n", "charts and other items; roads and other features; figures and other\n", "objects; military and other areas; demands and other factors;\n", "abstracts and other compilations; iron and other metals\n" ] } ], "source": [ "nltk.download('brown')\n", "\n", "from nltk.corpus import brown\n", "\n", "hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))\n", "hobbies_learned.findall(r\"<\\w*> <and> <other> <\\w*s>\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Baby names exercise from Google Education" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Descargue el archivo https://developers.google.com/edu/python/google-python-exercises.zip y descomprímalo.\n", "\n", "* Los archivos baby1990.html, baby1991.html contienen el código HTML de la página web que publica los nombres más populares para bebés nacidos en el correspondiente año.\n", "\n", "* Escriba una función que retorne una lista simple que contiene el año, y posteriormente el nombre y su posición. La lista solicitada debe presentar los nombres en orden alfabético y debe considerar simultáneamente los nombres de niños y niñas. 
" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }