{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Extracción de información\n", "\n", "* *30 min* | Última modificación: Diciembre 9, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package conll2000 to /root/nltk_data...\n", "[nltk_data] Package conll2000 is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download('conll2000')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##\n", "## Preprocesamiento básico\n", "##\n", "def ie_preprocess(document):\n", " ##\n", " ## Separación del documento en sentencias\n", " ##\n", " sentences = nltk.sent_tokenize(document)\n", " \n", " ##\n", " ## Separación de las sentencias en palabras\n", " ##\n", " sentences = [nltk.word_tokenize(sent) for sent in sentences]\n", " \n", " ##\n", " ## POS-Tagging\n", " ##\n", " sentences = [nltk.pos_tag(sent) for sent in sentences]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP the/DT little/JJ yellow/JJ dog/NN)\n", " barked/VBD\n", " at/IN\n", " (NP the/DT cat/NN))\n" ] } ], "source": [ "##\n", "## Noun phrase chunking\n", "## Frases nominales\n", "##\n", "\n", "## Define la frase\n", "sentence = [\n", " (\"the\", \"DT\"),\n", " (\"little\", \"JJ\"),\n", " (\"yellow\", \"JJ\"),\n", " (\"dog\", \"NN\"),\n", " (\"barked\", \"VBD\"),\n", " (\"at\", \"IN\"),\n", " (\"the\", \"DT\"),\n", " (\"cat\", \"NN\"),\n", "]\n", "\n", "## tag pattern\n", "grammar = \"NP: {