Shallow Parsing

  • 30 min | Última modificación: Diciembre 1, 2020

http://www.nltk.org/book/

Text Analytics with Python

Shallow parsing

  • Noun phrase (NP): El sustantivo (noun) encabeza la frase. Una NP actúa como el sujeto u objeto de un verbo.

  • Verb phrase (VP): El verbo (verb) encabeza la frase.

  • Adjective phrase (ADJP): El adjetivo es la cabeza. Califica sustantivos y pronombres en la sentencia.

  • Adverb phrase (ADVP): frases que actúan como adverbios.

  • Prepositional phrase (PP): tienen una preposición al inicio de la frase.

[19]:
##
## Example of the output of a shallow parser
##
nltk.download('treebank')  # NOTE(review): `nltk` itself must be imported in an earlier cell

from nltk.corpus import treebank_chunk

# Chunk-annotated (shallow-parsed) sentences from the Penn Treebank sample.
data = treebank_chunk.chunked_sents()

# First 3500 sentences for training; the rest held out for evaluation.
train_data = data[:3500]
test_data = data[3500:]

# Display one training sentence as a chunk tree.
print(train_data[7])
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/regexp.py:123: FutureWarning: split() requires a non-empty pattern match.
  return [tok for tok in self._regexp.split(text) if tok]
(S
  (NP A/DT Lorillard/NNP spokewoman/NN)
  said/VBD
  ,/,
  ``/``
  (NP This/DT)
  is/VBZ
  (NP an/DT old/JJ story/NN)
  ./.)
[20]:
##
## Specification of a regexp parser
##
from nltk.chunk import RegexpParser

sentence = "US unveils world's most powerful supercomputer, beats China."

# POS-tag the tokenized sentence (tags shown in the output below).
tagged_simple_sent = nltk.pos_tag(nltk.word_tokenize(sentence))
print('POS Tags:', tagged_simple_sent)

# NP chunk rule: optional determiner, any number of adjectives, then a noun.
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""

rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print()
print(c)
POS Tags: [('US', 'NNP'), ('unveils', 'JJ'), ('world', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', 'VBZ'), ('China', 'NNP'), ('.', '.')]

(S
  (NP US/NNP)
  (NP unveils/JJ world/NN)
  's/POS
  most/RBS
  (NP powerful/JJ supercomputer/NN)
  ,/,
  beats/VBZ
  (NP China/NNP)
  ./.)
[21]:
##
## Chink --- what is excluded from a chunk
##
# The first rule chunks everything as NP; the chink rule (}...{) then
# carves verb/adjective/preposition sequences back out of those chunks.
chink_grammar = """
NP:
   {<.*>+}                # Chunk everything as NP
   }<VBZ|VBD|JJ|IN>+{     # Chink sequences of VBD\VBZ\JJ\IN
"""

rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print(c)
(S
  (NP US/NNP)
  unveils/JJ
  (NP world/NN 's/POS most/RBS)
  powerful/JJ
  (NP supercomputer/NN ,/,)
  beats/VBZ
  (NP China/NNP ./.))
[22]:
##
## Improvements
##
# Richer grammar: noun, adjective, adverb, prepositional and verb phrases.
grammar = """
NP:   {<DT>?<JJ>?<NN.*>}
ADJP: {<JJ>}
ADVP: {<RB.*>}
PP:   {<IN>}
VP:   {<MD>?<VB.*>+}
"""

rc = RegexpParser(grammar)
c = rc.parse(tagged_simple_sent)
print(c)
(S
  (NP US/NNP)
  (NP unveils/JJ world/NN)
  's/POS
  (ADVP most/RBS)
  (NP powerful/JJ supercomputer/NN)
  ,/,
  (VP beats/VBZ)
  (NP China/NNP)
  ./.)
[23]:
##
## Evaluation
##
# Score the regexp chunker against the held-out treebank chunk trees.
print(rc.evaluate(test_data))
ChunkParse score:
    IOB Accuracy:  46.1%
    Precision:     19.9%
    Recall:        43.3%
    F-Measure:     27.3%
[24]:
##
## B- beginning of a chunk
## I- inside a chunk
## O- not part of any chunk
##
from nltk.chunk.util import tree2conlltags, conlltags2tree

# Take one training sentence and show its chunk tree before conversion.
train_sent = train_data[7]
print(train_sent)
(S
  (NP A/DT Lorillard/NNP spokewoman/NN)
  said/VBD
  ,/,
  ``/``
  (NP This/DT)
  is/VBZ
  (NP an/DT old/JJ story/NN)
  ./.)
[25]:
# Convert the chunk tree to CoNLL-style (word, POS tag, IOB chunk tag) triples.
wtc = tree2conlltags(train_sent)
wtc
[25]:
[('A', 'DT', 'B-NP'),
 ('Lorillard', 'NNP', 'I-NP'),
 ('spokewoman', 'NN', 'I-NP'),
 ('said', 'VBD', 'O'),
 (',', ',', 'O'),
 ('``', '``', 'O'),
 ('This', 'DT', 'B-NP'),
 ('is', 'VBZ', 'O'),
 ('an', 'DT', 'B-NP'),
 ('old', 'JJ', 'I-NP'),
 ('story', 'NN', 'I-NP'),
 ('.', '.', 'O')]
[26]:
# Round-trip: rebuild the chunk tree from the IOB triples.
tree = conlltags2tree(wtc)
print(tree)
(S
  (NP A/DT Lorillard/NNP spokewoman/NN)
  said/VBD
  ,/,
  ``/``
  (NP This/DT)
  is/VBZ
  (NP an/DT old/JJ story/NN)
  ./.)
[27]:
def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into [(POS tag, IOB chunk tag), ...] sequences.

    Words are dropped so a tagger can learn the mapping from POS tags
    to chunk tags alone.
    """
    sequences = []
    for tree in chunk_sents:
        word_tag_chunk = tree2conlltags(tree)
        sequences.append([(pos, iob) for (_word, pos, iob) in word_tag_chunk])
    return sequences

def combined_tagger(train_data, taggers, backoff=None):
    """Build a backoff chain of taggers.

    Each factory in `taggers` is trained on `train_data` with the
    previously built tagger as its backoff; the last tagger built
    (the head of the chain) is returned.
    """
    chain = backoff
    for make_tagger in taggers:
        chain = make_tagger(train_data, backoff=chain)
    return chain
[28]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

class NGramTagChunker(ChunkParserI):
    """Shallow parser that learns IOB chunk tags from POS-tag n-grams.

    A backoff chain of taggers (unigram backed off by bigram, by default)
    is trained to map POS-tag sequences to IOB chunk tags.
    """

    # Tuple default instead of the original list literal: avoids the
    # shared-mutable-default pitfall; behavior is unchanged (only iterated).
    def __init__(self, train_sentences, tagger_classes=(UnigramTagger, BigramTagger)):
        train_sent_tags = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

    def parse(self, tagged_sentence):
        """Chunk a POS-tagged sentence [(word, tag), ...] into an nltk Tree.

        Returns None for empty/falsy input.
        """
        if not tagged_sentence:
            return None

        # Predict an IOB chunk tag for each POS tag, then re-attach the words.
        pos_tags = [tag for word, tag in tagged_sentence]
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
        chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
        wpc_tags = [
            (word, pos_tag, chunk_tag)
            for ((word, pos_tag), chunk_tag) in zip(tagged_sentence, chunk_tags)
        ]
        return conlltags2tree(wpc_tags)

# Train on the treebank split and score against the held-out sentences.
ntc = NGramTagChunker(train_data)
print(ntc.evaluate(test_data))
ChunkParse score:
    IOB Accuracy:  97.2%
    Precision:     91.4%
    Recall:        94.3%
    F-Measure:     92.8%
[29]:
##
## The original cell called `nlp(sentence)`, but no spaCy pipeline was
## ever loaded, so it raised NameError.  POS-tag with NLTK instead --
## which this notebook already uses -- producing the same
## (word, tag) pair format that `ntc.parse` expects.
##
import nltk

tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
tree = ntc.parse(tagged_sentence)
print(tree)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-833c8b4a0527> in <module>
----> 1 sentence_nlp = nlp(sentence)
      2 tagged_sentence = [(word.text, word.tag_) for word in sentence_nlp]
      3 tree = ntc.parse(tagged_sentence)
      4 print(tree)

NameError: name 'nlp' is not defined
[ ]:
from nltk.corpus import conll2000

# CoNLL-2000 chunking corpus (WSJ text), already chunk-annotated.
wsj_data = conll2000.chunked_sents()
# First 10000 sentences for training, the rest for evaluation.
train_wsj_data = wsj_data[:10000]
test_wsj_data = wsj_data[10000:]

print(train_wsj_data[10])
[ ]:
# Train the n-gram chunker on CoNLL-2000 and score on its held-out split.
tc = NGramTagChunker(train_wsj_data)
print(tc.evaluate(test_wsj_data))
[ ]:
# Parse the tagged example sentence with the CoNLL-trained chunker.
# NOTE(review): `tagged_sentence` is defined in an earlier cell that
# currently fails with NameError -- that cell must be fixed first.
tree = tc.parse(tagged_sentence)
print(tree)
[ ]:
# NOTE(review): this cell duplicates the CoNLL-2000 setup cell above.
from nltk.corpus import conll2000
wsj_data = conll2000.chunked_sents()
train_wsj_data = wsj_data[:10000]
test_wsj_data = wsj_data[10000:]
print(train_wsj_data[10])
[ ]:
# NOTE(review): duplicate of the training/evaluation cell above.
tc = NGramTagChunker(train_wsj_data)

print(tc.evaluate(test_wsj_data))
[ ]:
# NOTE(review): duplicate of the parsing cell above.
tree = tc.parse(tagged_sentence)
print(tree)
[ ]: