import nltk

# NLTK data packages this notebook relies on, in the order they are first used:
#   treebank          -> nltk.corpus.treebank, source of the tagged example sentence
#   maxent_ne_chunker -> model used by nltk.ne_chunk
#   words             -> word list fetched together with the chunker
#   ieer              -> nltk.corpus.ieer, documents used for relation extraction
#
# 'treebank' was missing from the original list, so on a machine without a
# pre-populated nltk_data directory the first corpus access failed with
# LookupError.
#
# NOTE(review): recent NLTK releases may expect 'maxent_ne_chunker_tab'
# instead of 'maxent_ne_chunker' -- confirm against the installed version.
NLTK_PACKAGES = ("treebank", "maxent_ne_chunker", "words", "ieer")

# nltk.download() is a no-op for packages that are already up to date, so this
# cell is safe to re-run. The loop (rather than a trailing call expression)
# also keeps a stray `True` from being shown as the cell result.
for package in NLTK_PACKAGES:
    nltk.download(package)
# %%
# Named-entity recognition on a single sentence.
#
# The sentence comes from the Penn Treebank sample and is already POS-tagged,
# so it can be handed straight to the chunker.
sent = nltk.corpus.treebank.tagged_sents()[22]

# With binary=True every detected entity is labelled simply as NE.
binary_tree = nltk.ne_chunk(sent, binary=True)
print(binary_tree)

# %%
# With binary=False (the default) each entity carries a category label:
#
#   ORGANIZATION
#   PERSON
#   LOCATION
#   DATE
#   TIME
#   MONEY
#   PERCENT
#   FACILITY  human-made artifacts in the domains of architecture and civil engineering
#   GPE       geo-political entities such as city, state/province, and country
typed_tree = nltk.ne_chunk(sent, binary=False)
print(typed_tree)
# Relations are extracted as triples (X, a, Y), where X and Y are named
# entities and `a` is the text that expresses the relation between them.
import re

# Relation filter: the connecting text must contain the standalone word "in";
# the negative lookahead rejects matches where that "in" is followed by text
# ending in "ing".
IN = re.compile(r".*\bin\b(?!\b.+ing)")

# Walk every parsed document of the IEER file and print each
# ORGANIZATION-in-LOCATION relation in NLTK's bracketed tuple notation.
for doc in nltk.corpus.ieer.parsed_docs("NYT_19980315"):
    org_loc_rels = nltk.sem.extract_rels(
        "ORG", "LOC", doc, corpus="ieer", pattern=IN
    )
    for rel in org_loc_rels:
        relation_text = nltk.sem.rtuple(rel)
        print(relation_text)