{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reconocimiento de entidades y extracción de relaciones\n",
    "\n",
    "* *30 min* | Última modificación: Diciembre 9, 2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "http://www.nltk.org/book/\n",
    "\n",
    "Text Analytics with Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package maxent_ne_chunker to\n",
      "[nltk_data]     /root/nltk_data...\n",
      "[nltk_data]   Package maxent_ne_chunker is already up-to-date!\n",
      "[nltk_data] Downloading package words to /root/nltk_data...\n",
      "[nltk_data]   Package words is already up-to-date!\n",
      "[nltk_data] Downloading package ieer to /root/nltk_data...\n",
      "[nltk_data]   Package ieer is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "\n",
    "nltk.download('maxent_ne_chunker')\n",
    "nltk.download('words')\n",
    "nltk.download('ieer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  The/DT\n",
      "  (NE U.S./NNP)\n",
      "  is/VBZ\n",
      "  one/CD\n",
      "  of/IN\n",
      "  the/DT\n",
      "  few/JJ\n",
      "  industrialized/VBN\n",
      "  nations/NNS\n",
      "  that/WDT\n",
      "  *T*-7/-NONE-\n",
      "  does/VBZ\n",
      "  n't/RB\n",
      "  have/VB\n",
      "  a/DT\n",
      "  higher/JJR\n",
      "  standard/NN\n",
      "  of/IN\n",
      "  regulation/NN\n",
      "  for/IN\n",
      "  the/DT\n",
      "  smooth/JJ\n",
      "  ,/,\n",
      "  needle-like/JJ\n",
      "  fibers/NNS\n",
      "  such/JJ\n",
      "  as/IN\n",
      "  crocidolite/NN\n",
      "  that/WDT\n",
      "  *T*-1/-NONE-\n",
      "  are/VBP\n",
      "  classified/VBN\n",
      "  *-5/-NONE-\n",
      "  as/IN\n",
      "  amphobiles/NNS\n",
      "  ,/,\n",
      "  according/VBG\n",
      "  to/TO\n",
      "  (NE Brooke/NNP)\n",
      "  T./NNP\n",
      "  Mossman/NNP\n",
      "  ,/,\n",
      "  a/DT\n",
      "  professor/NN\n",
      "  of/IN\n",
      "  pathlogy/NN\n",
      "  at/IN\n",
      "  the/DT\n",
      "  (NE University/NNP)\n",
      "  of/IN\n",
      "  (NE Vermont/NNP College/NNP)\n",
      "  of/IN\n",
      "  (NE Medicine/NNP)\n",
      "  ./.)\n"
     ]
    }
   ],
   "source": [
    "##\n",
    "## Reconocimiento de entidades en una frase\n",
    "##\n",
    "sent = nltk.corpus.treebank.tagged_sents()[22]\n",
    "\n",
    "##\n",
    "## El parametro binary=True hace que las entidades\n",
    "## sean taggeadas unicamente como NE\n",
    "##\n",
    "print(nltk.ne_chunk(sent, binary=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(S\n",
      "  The/DT\n",
      "  (GPE U.S./NNP)\n",
      "  is/VBZ\n",
      "  one/CD\n",
      "  of/IN\n",
      "  the/DT\n",
      "  few/JJ\n",
      "  industrialized/VBN\n",
      "  nations/NNS\n",
      "  that/WDT\n",
      "  *T*-7/-NONE-\n",
      "  does/VBZ\n",
      "  n't/RB\n",
      "  have/VB\n",
      "  a/DT\n",
      "  higher/JJR\n",
      "  standard/NN\n",
      "  of/IN\n",
      "  regulation/NN\n",
      "  for/IN\n",
      "  the/DT\n",
      "  smooth/JJ\n",
      "  ,/,\n",
      "  needle-like/JJ\n",
      "  fibers/NNS\n",
      "  such/JJ\n",
      "  as/IN\n",
      "  crocidolite/NN\n",
      "  that/WDT\n",
      "  *T*-1/-NONE-\n",
      "  are/VBP\n",
      "  classified/VBN\n",
      "  *-5/-NONE-\n",
      "  as/IN\n",
      "  amphobiles/NNS\n",
      "  ,/,\n",
      "  according/VBG\n",
      "  to/TO\n",
      "  (PERSON Brooke/NNP T./NNP Mossman/NNP)\n",
      "  ,/,\n",
      "  a/DT\n",
      "  professor/NN\n",
      "  of/IN\n",
      "  pathlogy/NN\n",
      "  at/IN\n",
      "  the/DT\n",
      "  (ORGANIZATION University/NNP)\n",
      "  of/IN\n",
      "  (PERSON Vermont/NNP College/NNP)\n",
      "  of/IN\n",
      "  (GPE Medicine/NNP)\n",
      "  ./.)\n"
     ]
    }
   ],
   "source": [
    "##\n",
    "## El parametro binary=False hace que las entidades\n",
    "## sean taggeadas adicionando categorias:\n",
    "##\n",
    "##  ORGANIZATION\n",
    "##  LOCATION\n",
    "##  DATE\n",
    "##  TIME\n",
    "##  MONEY\n",
    "##  PERCENT\n",
    "##  FACILITY  human-made artifacts in the domains of architecture and civil engineering\n",
    "##  GPE geo-political entities such as city, state/province, and country.\n",
    "##\n",
    "print(nltk.ne_chunk(sent)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']\n",
      "[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']\n",
      "[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']\n",
      "[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']\n",
      "[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']\n",
      "[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']\n",
      "[ORG: 'WGBH'] 'in' [LOC: 'Boston']\n",
      "[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']\n",
      "[ORG: 'Omnicom'] 'in' [LOC: 'New York']\n",
      "[ORG: 'DDB Needham'] 'in' [LOC: 'New York']\n",
      "[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']\n",
      "[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']\n",
      "[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']\n"
     ]
    }
   ],
   "source": [
    "##\n",
    "## Las relaciones son extraidas como una tripleta (X, a, Y)\n",
    "## donde X y Y son entidades y `a` representa la relación\n",
    "##\n",
    "import re \n",
    "IN = re.compile(r\".*\\bin\\b(?!\\b.+ing)\")\n",
    "for doc in nltk.corpus.ieer.parsed_docs(\"NYT_19980315\"):\n",
    "    for rel in nltk.sem.extract_rels(\"ORG\", \"LOC\", doc, corpus=\"ieer\", pattern=IN):\n",
    "        print(nltk.sem.rtuple(rel))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}