{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Shallow Parsing\n", "\n", "* *30 min* | Última modificación: Diciembre 1, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "http://www.nltk.org/book/\n", "\n", "Text Analytics with Python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Shallow parsing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "* Noun phrase (NP): El sustantivo (noun) encabeza la frase. Una NP actúa como el sujeto u objeto de un verbo.\n", "\n", "\n", "* Verb phrase (VP): El verbo (verb) encabeza la frase. \n", "\n", "\n", "* Adjective phrase (ADJP): El adjetivo es la cabeza. Califica sustantivos y pronombres en la sentencia.\n", "\n", "\n", "* Adverb phrase (ADVP): frases que actúan como adverbios\n", "\n", "\n", "* Prepositional phrase (PP): tienen una preposición al inicio de la frase." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package treebank to /root/nltk_data...\n", "[nltk_data] Package treebank is already up-to-date!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.6/dist-packages/nltk/tokenize/regexp.py:123: FutureWarning: split() requires a non-empty pattern match.\n", " return [tok for tok in self._regexp.split(text) if tok]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "(S\n", " (NP A/DT Lorillard/NNP spokewoman/NN)\n", " said/VBD\n", " ,/,\n", " ``/``\n", " (NP This/DT)\n", " is/VBZ\n", " (NP an/DT old/JJ story/NN)\n", " ./.)\n" ] } ], "source": [ "##\n", "## Example of the output produced by a shallow parser\n", "##\n", "# Import nltk explicitly so this cell runs on a fresh kernel.\n", "import nltk\n", "from nltk.corpus import treebank_chunk\n", "\n", "# Download the corpus before reading it (no-op when already up to date).\n", "nltk.download('treebank')\n", "\n", "data = treebank_chunk.chunked_sents()\n", "\n", "# Split: the first 3500 chunked sentences vs. the remainder.\n", "train_data = data[:3500]\n", "test_data = data[3500:]\n", "\n", "# Show one chunked sentence as a sample of shallow-parser output.\n", "print(train_data[7])" ] }, { "cell_type": "code", "execution_count": 20, 
"metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "POS Tags: [('US', 'NNP'), ('unveils', 'JJ'), ('world', 'NN'), (\"'s\", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', 'VBZ'), ('China', 'NNP'), ('.', '.')]\n", "\n", "(S\n", " (NP US/NNP)\n", " (NP unveils/JJ world/NN)\n", " 's/POS\n", " most/RBS\n", " (NP powerful/JJ supercomputer/NN)\n", " ,/,\n", " beats/VBZ\n", " (NP China/NNP)\n", " ./.)\n" ] } ], "source": [ "##\n", "## Especificacion de un regexp parser\n", "##\n", "from nltk.chunk import RegexpParser\n", "\n", "sentence = \"US unveils world's most powerful supercomputer, beats China.\"\n", "\n", "tagged_simple_sent = nltk.pos_tag(nltk.word_tokenize(sentence))\n", "print('POS Tags:', tagged_simple_sent)\n", "\n", "chunk_grammar = \"\"\"\n", "NP: {