{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Extracción de información\n", "\n", "* *30 min* | Última modificación: Diciembre 9, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /root/nltk_data...\n", "[nltk_data] Package conll2000 is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download('conll2000')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##\n", "## Preprocesamiento básico\n", "##\n", "def ie_preprocess(document):\n", " ##\n", " ## Separación del documento en sentencias\n", " ##\n", " sentences = nltk.sent_tokenize(document)\n", " \n", " ##\n", " ## Separación de las sentencias en palabras\n", " ##\n", " sentences = [nltk.word_tokenize(sent) for sent in sentences]\n", " \n", " ##\n", " ## POS-Tagging\n", " ##\n", " sentences = [nltk.pos_tag(sent) for sent in sentences]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Noun phrase chunking\n", "## Frases nominales\n", "##\n", "\n", "## Define la frase\n", "sentence = [\n", " (\"the\", \"DT\"),\n", " (\"little\", \"JJ\"),\n", " (\"yellow\", \"JJ\"),\n", " (\"dog\", \"NN\"),\n", " (\"barked\", \"VBD\"),\n", " (\"at\", \"IN\"),\n", " (\"the\", \"DT\"),\n", " (\"cat\", \"NN\"),\n", "]\n", "\n", "## tag pattern\n", "grammar = \"NP: {
?*}\"\n", "\n", "cp = nltk.RegexpParser(grammar)\n", "result = cp.parse(sentence)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Regla alternativa\n", "##\n", "grammar = \"NP: {
?*+}\"\n", "\n", "cp = nltk.RegexpParser(grammar)\n", "result = cp.parse(sentence)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP Rapunzel/NNP)\n", " let/VBD\n", " down/RP\n", " (NP her/PP$ long/JJ golden/JJ hair/NN))\n" ] } ], "source": [ "##\n", "## Chunking con expresiones regulares\n", "##\n", "grammar = r\"\"\"\n", " NP: {?*} # chunk determiner/possessive, adjectives and noun\n", " {+} # chunk sequences of proper nouns\n", "\"\"\"\n", "\n", "cp = nltk.RegexpParser(grammar)\n", "sentence = [\n", " (\"Rapunzel\", \"NNP\"),\n", " (\"let\", \"VBD\"),\n", " (\"down\", \"RP\"),\n", " (\"her\", \"PP$\"),\n", " (\"long\", \"JJ\"),\n", " (\"golden\", \"JJ\"),\n", " (\"hair\", \"NN\"),\n", "]\n", "\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S (NP money/NN market/NN) fund/NN)\n" ] } ], "source": [ "##\n", "## Chunk de dos sustantivos consecutivos\n", "##\n", "nouns = [(\"money\", \"NN\"), (\"market\", \"NN\"), (\"fund\", \"NN\")]\n", "grammar = \"NP: {}\"\n", "cp = nltk.RegexpParser(grammar)\n", "print(cp.parse(nouns))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(CHUNK combined/VBN to/TO achieve/VB)\n", "(CHUNK continue/VB to/TO place/VB)\n", "(CHUNK serve/VB to/TO protect/VB)\n", "(CHUNK wanted/VBD to/TO wait/VB)\n", "(CHUNK allowed/VBN to/TO place/VB)\n", "(CHUNK expected/VBN to/TO become/VB)\n", "(CHUNK expected/VBN to/TO approve/VB)\n", "(CHUNK expected/VBN to/TO make/VB)\n", "(CHUNK intends/VBZ to/TO make/VB)\n", "(CHUNK seek/VB to/TO set/VB)\n", "(CHUNK like/VB to/TO see/VB)\n" ] } ], "source": [ "##\n", "## Ejemplo aplicado a un texto\n", "##\n", "cp = nltk.RegexpParser('CHUNK: { }')\n", "brown = nltk.corpus.brown\n", "\n", "counter = 0\n", "for sent in brown.tagged_sents():\n", "\n", " tree = cp.parse(sent)\n", "\n", " for subtree in tree.subtrees():\n", " if subtree.label() == 'CHUNK': \n", " print(subtree)\n", " counter += 1\n", "\n", " if counter > 10:\n", " break" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Chinking\n", "## Se refiere a definir que se excluye en vez\n", "## de definir que se reconoce\n", "##\n", "grammar = r\"\"\"\n", " NP:\n", " {<.*>+} # Chunk everything\n", " }+{ # Chink sequences of VBD and IN\n", " \"\"\"\n", "sentence = [\n", " (\"the\", \"DT\"),\n", " (\"little\", \"JJ\"),\n", " (\"yellow\", \"JJ\"),\n", " (\"dog\", \"NN\"),\n", " (\"barked\", \"VBD\"),\n", " (\"at\", \"IN\"),\n", " (\"the\", \"DT\"),\n", " (\"cat\", \"NN\"),\n", "]\n", "cp = nltk.RegexpParser(grammar)\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (PP Over/IN)\n", " (NP a/DT cup/NN)\n", " (PP of/IN)\n", " (NP coffee/NN)\n", " ,/,\n", " (NP Mr./NNP Stone/NNP)\n", " (VP told/VBD)\n", " (NP his/PRP$ story/NN)\n", " ./.)\n" ] } ], "source": [ "##\n", "## Entrenamiento de chunkers\n", "##\n", "from nltk.corpus import conll2000\n", "\n", "##\n", "## Ejemplo de una secuencia tageada\n", "## El corpus CoNLL 2000 tiene 
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", "  (PP Over/IN)\n", "  (NP a/DT cup/NN)\n", "  (PP of/IN)\n", "  (NP coffee/NN)\n", "  ,/,\n", "  (NP Mr./NNP Stone/NNP)\n", "  (VP told/VBD)\n", "  (NP his/PRP$ story/NN)\n", "  ./.)\n" ] } ], "source": [ "##\n", "## Training chunkers\n", "##\n", "from nltk.corpus import conll2000\n", "\n", "##\n", "## Example of a chunked sentence.\n", "## The CoNLL 2000 corpus contains 270K words split into\n", "## train/test sets, annotated with POS tags and chunk tags\n", "##\n", "print(conll2000.chunked_sents('train.txt')[99])" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", "  Over/IN\n", "  (NP a/DT cup/NN)\n", "  of/IN\n", "  (NP coffee/NN)\n", "  ,/,\n", "  (NP Mr./NNP Stone/NNP)\n", "  told/VBD\n", "  (NP his/PRP$ story/NN)\n", "  ./.)\n" ] } ], "source": [ "##\n", "## CoNLL 2000 defines three chunk types:\n", "## NP: discussed above\n", "## VP: e.g. `has already delivered`\n", "## PP: e.g. `because of`\n", "##\n", "## The following example selects only the NP chunks\n", "##\n", "print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  43.4%%\n", "    Precision:      0.0%%\n", "    Recall:         0.0%%\n", "    F-Measure:      0.0%%\n" ] } ], "source": [ "##\n", "## Evaluating the accuracy of a regular-expression parser\n", "## against a chunked corpus. This baseline parser creates\n", "## no chunks at all\n", "##\n", "cp = nltk.RegexpParser(\"\")\n", "test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])\n", "print(cp.evaluate(test_sents))" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  87.7%%\n", "    Precision:     70.6%%\n", "    Recall:        67.8%%\n", "    F-Measure:     69.2%%\n" ] } ], "source": [ "##\n", "## Naive grammar: chunk any sequence of tags whose initial\n", "## letters are typical of noun phrase tokens (CD, DT, JJ, NN, ...)\n", "##\n", "grammar = r\"NP: {<[CDJNP].*>+}\"\n", "cp = nltk.RegexpParser(grammar)\n", "print(cp.evaluate(test_sents))" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  92.9%%\n", "    Precision:     79.9%%\n", "    Recall:        86.8%%\n", "    F-Measure:     83.2%%\n" ] } ], "source": [ "##\n", "## A unigram tagger is used to label sentences with chunk tags.\n", "##\n", "class UnigramChunker(nltk.ChunkParserI):\n", "    def __init__(self, train_sents):\n", "\n", "        ##\n", "        ## Train the model\n", "        ##\n", "        train_data = [\n", "            [(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]\n", "            for sent in train_sents\n", "        ]\n", "        self.tagger = nltk.UnigramTagger(train_data)\n", "\n", "    def parse(self, sentence):\n", "        ##\n", "        ## Apply the tagger\n", "        ##\n", "        pos_tags = [pos for (word, pos) in sentence]\n", "        tagged_pos_tags = self.tagger.tag(pos_tags)\n", "        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]\n", "        conlltags = [\n", "            (word, pos, chunktag)\n", "            for ((word, pos), chunktag) in zip(sentence, chunktags)\n", "        ]\n", "        return nltk.chunk.conlltags2tree(conlltags)\n", "\n", "\n", "test_sents = conll2000.chunked_sents(\"test.txt\", chunk_types=[\"NP\"])\n", "train_sents = conll2000.chunked_sents(\"train.txt\", chunk_types=[\"NP\"])\n", "unigram_chunker = UnigramChunker(train_sents)\n", "print(unigram_chunker.evaluate(test_sents))" ] },
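{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##\n", "## Usage sketch (not part of the original notebook): the trained\n", "## chunker applied to a new POS-tagged sentence.\n", "##\n", "print(unigram_chunker.parse([\n", "    (\"the\", \"DT\"),\n", "    (\"little\", \"JJ\"),\n", "    (\"cat\", \"NN\"),\n", "    (\"sat\", \"VBD\"),\n", "    (\"on\", \"IN\"),\n", "    (\"the\", \"DT\"),\n", "    (\"mat\", \"NN\"),\n", "]))" ] },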
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('#', 'B-NP'), ('$', 'B-NP'), (\"''\", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]\n" ] } ], "source": [ "##\n", "## To find out what the tagger learned, assign a chunk tag\n", "## to each of the tags used in POS-tagging\n", "##\n", "postags = sorted(set(pos for sent in train_sents for (word, pos) in sent.leaves()))\n", "print(unigram_chunker.tagger.tag(postags))" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ChunkParse score:\n", "    IOB Accuracy:  93.3%%\n", "    Precision:     82.3%%\n", "    Recall:        86.8%%\n", "    F-Measure:     84.5%%\n" ] } ], "source": [ "##\n", "## Same code as above, but using a bigram tagger\n", "##\n", "class BigramChunker(nltk.ChunkParserI):\n", "    def __init__(self, train_sents):\n", "        train_data = [\n", "            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]\n", "            for sent in train_sents\n", "        ]\n", "        self.tagger = nltk.BigramTagger(train_data)\n", "\n", "    def parse(self, sentence):\n", "        pos_tags = [pos for (word, pos) in sentence]\n", "        tagged_pos_tags = self.tagger.tag(pos_tags)\n", "        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]\n", "        conlltags = [\n", "            (word, pos, chunktag)\n", "            for ((word, pos), chunktag) in zip(sentence, chunktags)\n", "        ]\n", "        return nltk.chunk.conlltags2tree(conlltags)\n", "\n", "\n", "bigram_chunker = BigramChunker(train_sents)\n", "print(bigram_chunker.evaluate(test_sents))" ] },
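{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##\n", "## Variation sketch (not part of the original notebook): NLTK\n", "## taggers accept a backoff tagger, so the bigram chunker can\n", "## fall back to unigram statistics for contexts it never saw\n", "## during training.\n", "##\n", "class BackoffChunker(nltk.ChunkParserI):\n", "    def __init__(self, train_sents):\n", "        train_data = [\n", "            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]\n", "            for sent in train_sents\n", "        ]\n", "        ## Bigram model, backing off to a unigram model\n", "        unigram = nltk.UnigramTagger(train_data)\n", "        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)\n", "\n", "    def parse(self, sentence):\n", "        pos_tags = [pos for (word, pos) in sentence]\n", "        tagged_pos_tags = self.tagger.tag(pos_tags)\n", "        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]\n", "        conlltags = [\n", "            (word, pos, chunktag)\n", "            for ((word, pos), chunktag) in zip(sentence, chunktags)\n", "        ]\n", "        return nltk.chunk.conlltags2tree(conlltags)\n", "\n", "\n", "backoff_chunker = BackoffChunker(train_sents)\n", "print(backoff_chunker.evaluate(test_sents))" ] },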
\"DT\"),\n", " (\"mat\", \"NN\"),\n", "]\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP John/NNP)\n", " thinks/VBZ\n", " (CLAUSE\n", " (NP Mary/NN)\n", " (VP\n", " saw/VBD\n", " (CLAUSE\n", " (NP the/DT cat/NN)\n", " (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))\n" ] } ], "source": [ "##\n", "## La solución es permitir que el chunker itere varias veces sobre\n", "## los patrones para que pueda identificar todas las estructuras\n", "##\n", "cp = nltk.RegexpParser(grammar, loop=2)\n", "print(cp.parse(sentence))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(NP Alice)\n" ] } ], "source": [ "##\n", "## En los siguientes ejemplos se ilustra\n", "## como construir un arbol como el\n", "## presentado en los ejemplos anteriores\n", "##\n", "tree1 = nltk.Tree('NP', ['Alice'])\n", "print(tree1)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(NP the rabbit)\n" ] } ], "source": [ "tree2 = nltk.Tree('NP', ['the', 'rabbit'])\n", "print(tree2)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S (NP Alice) (VP chased (NP the rabbit)))\n" ] } ], "source": [ "tree3 = nltk.Tree('VP', ['chased', tree2])\n", "tree4 = nltk.Tree('S', [tree1, tree3])\n", "print(tree4)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(VP chased (NP the rabbit))\n" ] } ], "source": [ "print(tree4[1])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Alice', 'chased', 'the', 'rabbit']" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tree4.leaves()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }