{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Extracción de información\n", "\n", "* *30 min* | Última modificación: Diciembre 9, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /root/nltk_data...\n", "[nltk_data] Package conll2000 is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download('conll2000')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##\n", "## Preprocesamiento básico\n", "##\n", "def ie_preprocess(document):\n", " ##\n", " ## Separación del documento en sentencias\n", " ##\n", " sentences = nltk.sent_tokenize(document)\n", " \n", " ##\n", " ## Separación de las sentencias en palabras\n", " ##\n", " sentences = [nltk.word_tokenize(sent) for sent in sentences]\n", " \n", " ##\n", " ## POS-Tagging\n", " ##\n", " sentences = [nltk.pos_tag(sent) for sent in sentences]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Noun phrase chunking\n", "## Frases nominales\n", "##\n", "\n", "## Define la frase\n", "sentence = [\n", " (\"the\", \"DT\"),\n", " (\"little\", \"JJ\"),\n", " (\"yellow\", \"JJ\"),\n", " (\"dog\", \"NN\"),\n", " (\"barked\", \"VBD\"),\n", " (\"at\", \"IN\"),\n", " (\"the\", \"DT\"),\n", " (\"cat\", \"NN\"),\n", "]\n", "\n", "## tag pattern\n", "grammar = \"NP: {
?*}\"\n", "\n", "cp = nltk.RegexpParser(grammar)\n", "result = cp.parse(sentence)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Regla alternativa\n", "##\n", "grammar = \"NP: {
?*+}\"\n", "\n", "cp = nltk.RegexpParser(grammar)\n", "result = cp.parse(sentence)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP Rapunzel/NNP)\n", " let/VBD\n", " down/RP\n", " (NP her/PP$ long/JJ golden/JJ hair/NN))\n" ] } ], "source": [ "##\n", "## Chunking con expresiones regulares\n", "##\n", "grammar = r\"\"\"\n", " NP: {?*} # chunk determiner/possessive, adjectives and noun\n", " {+} # chunk sequences of proper nouns\n", "\"\"\"\n", "\n", "cp = nltk.RegexpParser(grammar)\n", "sentence = [\n", " (\"Rapunzel\", \"NNP\"),\n", " (\"let\", \"VBD\"),\n", " (\"down\", \"RP\"),\n", " (\"her\", \"PP$\"),\n", " (\"long\", \"JJ\"),\n", " (\"golden\", \"JJ\"),\n", " (\"hair\", \"NN\"),\n", "]\n", "\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S (NP money/NN market/NN) fund/NN)\n" ] } ], "source": [ "##\n", "## Chunk de dos sustantivos consecutivos\n", "##\n", "nouns = [(\"money\", \"NN\"), (\"market\", \"NN\"), (\"fund\", \"NN\")]\n", "grammar = \"NP: {}\"\n", "cp = nltk.RegexpParser(grammar)\n", "print(cp.parse(nouns))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(CHUNK combined/VBN to/TO achieve/VB)\n", "(CHUNK continue/VB to/TO place/VB)\n", "(CHUNK serve/VB to/TO protect/VB)\n", "(CHUNK wanted/VBD to/TO wait/VB)\n", "(CHUNK allowed/VBN to/TO place/VB)\n", "(CHUNK expected/VBN to/TO become/VB)\n", "(CHUNK expected/VBN to/TO approve/VB)\n", "(CHUNK expected/VBN to/TO make/VB)\n", "(CHUNK intends/VBZ to/TO make/VB)\n", "(CHUNK seek/VB to/TO set/VB)\n", "(CHUNK like/VB to/TO see/VB)\n" ] } ], "source": [ "##\n", "## Ejemplo aplicado a un texto\n", "##\n", "cp = nltk.RegexpParser('CHUNK: { }')\n", "brown = nltk.corpus.brown\n", "\n", "counter = 0\n", "for sent in brown.tagged_sents():\n", "\n", " tree = cp.parse(sent)\n", "\n", " for subtree in tree.subtrees():\n", " if subtree.label() == 'CHUNK': \n", " print(subtree)\n", " counter += 1\n", "\n", " if counter > 10:\n", " break" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Chinking\n", "## Se refiere a definir que se excluye en vez\n", "## de definir que se reconoce\n", "##\n", "grammar = r\"\"\"\n", " NP:\n", " {<.*>+} # Chunk everything\n", " }+{ # Chink sequences of VBD and IN\n", " \"\"\"\n", "sentence = [\n", " (\"the\", \"DT\"),\n", " (\"little\", \"JJ\"),\n", " (\"yellow\", \"JJ\"),\n", " (\"dog\", \"NN\"),\n", " (\"barked\", \"VBD\"),\n", " (\"at\", \"IN\"),\n", " (\"the\", \"DT\"),\n", " (\"cat\", \"NN\"),\n", "]\n", "cp = nltk.RegexpParser(grammar)\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (PP Over/IN)\n", " (NP a/DT cup/NN)\n", " (PP of/IN)\n", " (NP coffee/NN)\n", " ,/,\n", " (NP Mr./NNP Stone/NNP)\n", " (VP told/VBD)\n", " (NP his/PRP$ story/NN)\n", " ./.)\n" ] } ], "source": [ "##\n", "## Entrenamiento de chunkers\n", "##\n", "from nltk.corpus import conll2000\n", "\n", "##\n", "## Ejemplo de una secuencia tageada\n", "## El corpus CoNLL 2000 tiene 
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", "  (PP Over/IN)\n", "  (NP a/DT cup/NN)\n", "  (PP of/IN)\n", "  (NP coffee/NN)\n", "  ,/,\n", "  (NP Mr./NNP Stone/NNP)\n", "  (VP told/VBD)\n", "  (NP his/PRP$ story/NN)\n", "  ./.)\n" ] } ], "source": [ "##\n", "## Training chunkers\n", "##\n", "from nltk.corpus import conll2000\n", "\n", "##\n", "## Example of a chunked sentence.\n", "## The CoNLL 2000 corpus contains 270K words split into\n", "## train/test sets, annotated with POS tags and chunk tags\n", "##\n", "print(conll2000.chunked_sents('train.txt')[99])" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", "  Over/IN\n", "  (NP a/DT cup/NN)\n", "  of/IN\n", "  (NP coffee/NN)\n", "  ,/,\n", "  (NP Mr./NNP Stone/NNP)\n", "  told/VBD\n", "  (NP his/PRP$ story/NN)\n", "  ./.)\n" ] } ], "source": [ "##\n", "## CoNLL 2000 defines three chunk types:\n", "## NP: discussed above\n", "## VP: e.g. `has already delivered`\n", "## PP: e.g. `because of`\n", "##\n", "## The following example selects only the NP chunks\n", "##\n", "print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  43.4%%\n", "    Precision:      0.0%%\n", "    Recall:         0.0%%\n", "    F-Measure:      0.0%%\n" ] } ], "source": [ "##\n", "## Evaluating the accuracy of a regular-expression parser\n", "## against a chunked corpus. This baseline parser creates\n", "## no chunks at all\n", "##\n", "cp = nltk.RegexpParser(\"\")\n", "test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])\n", "print(cp.evaluate(test_sents))" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  87.7%%\n", "    Precision:     70.6%%\n", "    Recall:        67.8%%\n", "    F-Measure:     69.2%%\n" ] } ], "source": [ "##\n", "## Naive grammar: chunk any sequence of tags whose initial\n", "## letters are typical of noun phrase tokens (CD, DT, JJ, NN, ...)\n", "##\n", "grammar = r\"NP: {<[CDJNP].*>+}\"\n", "cp = nltk.RegexpParser(grammar)\n", "print(cp.evaluate(test_sents))" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  92.9%%\n", "    Precision:     79.9%%\n", "    Recall:        86.8%%\n", "    F-Measure:     83.2%%\n" ] } ], "source": [ "##\n", "## A unigram tagger is used to label sentences with chunk tags.\n", "##\n", "class UnigramChunker(nltk.ChunkParserI):\n", "    def __init__(self, train_sents):\n", "\n", "        ##\n", "        ## Train the model\n", "        ##\n", "        train_data = [\n", "            [(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]\n", "            for sent in train_sents\n", "        ]\n", "        self.tagger = nltk.UnigramTagger(train_data)\n", "\n", "    def parse(self, sentence):\n", "        ##\n", "        ## Apply the tagger\n", "        ##\n", "        pos_tags = [pos for (word, pos) in sentence]\n", "        tagged_pos_tags = self.tagger.tag(pos_tags)\n", "        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]\n", "        conlltags = [\n", "            (word, pos, chunktag)\n", "            for ((word, pos), chunktag) in zip(sentence, chunktags)\n", "        ]\n", "        return nltk.chunk.conlltags2tree(conlltags)\n", "\n", "\n", "test_sents = conll2000.chunked_sents(\"test.txt\", chunk_types=[\"NP\"])\n", "train_sents = conll2000.chunked_sents(\"train.txt\", chunk_types=[\"NP\"])\n", "unigram_chunker = UnigramChunker(train_sents)\n", "print(unigram_chunker.evaluate(test_sents))" ] },
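{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##\n", "## Usage sketch (not part of the original notebook): the trained\n", "## chunker applied to a new POS-tagged sentence.\n", "##\n", "print(unigram_chunker.parse([\n", "    (\"the\", \"DT\"),\n", "    (\"little\", \"JJ\"),\n", "    (\"cat\", \"NN\"),\n", "    (\"sat\", \"VBD\"),\n", "    (\"on\", \"IN\"),\n", "    (\"the\", \"DT\"),\n", "    (\"mat\", \"NN\"),\n", "]))" ] },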
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('#', 'B-NP'), ('$', 'B-NP'), (\"''\", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]\n" ] } ], "source": [ "##\n", "## To find out what the tagger learned, assign a chunk tag\n", "## to each of the tags used in POS-tagging\n", "##\n", "postags = sorted(set(pos for sent in train_sents for (word, pos) in sent.leaves()))\n", "print(unigram_chunker.tagger.tag(postags))" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  93.3%%\n", "    Precision:     82.3%%\n", "    Recall:        86.8%%\n", "    F-Measure:     84.5%%\n" ] } ], "source": [ "##\n", "## Same code as above, but using a bigram tagger\n", "##\n", "class BigramChunker(nltk.ChunkParserI):\n", "    def __init__(self, train_sents):\n", "        train_data = [\n", "            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]\n", "            for sent in train_sents\n", "        ]\n", "        self.tagger = nltk.BigramTagger(train_data)\n", "\n", "    def parse(self, sentence):\n", "        pos_tags = [pos for (word, pos) in sentence]\n", "        tagged_pos_tags = self.tagger.tag(pos_tags)\n", "        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]\n", "        conlltags = [\n", "            (word, pos, chunktag)\n", "            for ((word, pos), chunktag) in zip(sentence, chunktags)\n", "        ]\n", "        return nltk.chunk.conlltags2tree(conlltags)\n", "\n", "\n", "bigram_chunker = BigramChunker(train_sents)\n", "print(bigram_chunker.evaluate(test_sents))" ] },
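{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##\n", "## Variation sketch (not part of the original notebook): NLTK\n", "## taggers accept a backoff tagger, so the bigram chunker can\n", "## fall back to unigram statistics for contexts it never saw\n", "## during training.\n", "##\n", "class BackoffChunker(nltk.ChunkParserI):\n", "    def __init__(self, train_sents):\n", "        train_data = [\n", "            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]\n", "            for sent in train_sents\n", "        ]\n", "        ## Bigram model, backing off to a unigram model\n", "        unigram = nltk.UnigramTagger(train_data)\n", "        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)\n", "\n", "    def parse(self, sentence):\n", "        pos_tags = [pos for (word, pos) in sentence]\n", "        tagged_pos_tags = self.tagger.tag(pos_tags)\n", "        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]\n", "        conlltags = [\n", "            (word, pos, chunktag)\n", "            for ((word, pos), chunktag) in zip(sentence, chunktags)\n", "        ]\n", "        return nltk.chunk.conlltags2tree(conlltags)\n", "\n", "\n", "backoff_chunker = BackoffChunker(train_sents)\n", "print(backoff_chunker.evaluate(test_sents))" ] },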
\"DT\"),\n", " (\"mat\", \"NN\"),\n", "]\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP John/NNP)\n", " thinks/VBZ\n", " (CLAUSE\n", " (NP Mary/NN)\n", " (VP\n", " saw/VBD\n", " (CLAUSE\n", " (NP the/DT cat/NN)\n", " (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))\n" ] } ], "source": [ "##\n", "## La solución es permitir que el chunker itere varias veces sobre\n", "## los patrones para que pueda identificar todas las estructuras\n", "##\n", "cp = nltk.RegexpParser(grammar, loop=2)\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(NP Alice)\n" ] } ], "source": [ "##\n", "## En los siguientes ejemplos se ilustra\n", "## como construir un arbol como el\n", "## presentado en los ejemplos anteriores\n", "##\n", "tree1 = nltk.Tree('NP', ['Alice'])\n", "print(tree1)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(NP the rabbit)\n" ] } ], "source": [ "tree2 = nltk.Tree('NP', ['the', 'rabbit'])\n", "print(tree2)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S (NP Alice) (VP chased (NP the rabbit)))\n" ] } ], "source": [ "tree3 = nltk.Tree('VP', ['chased', tree2])\n", "tree4 = nltk.Tree('S', [tree1, tree3])\n", "print(tree4)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(VP chased (NP the rabbit))\n" ] } ], "source": [ "print(tree4[1])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Alice', 'chased', 'the', 'rabbit']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree4.leaves()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }