Shallow Parsing
30 min | Última modificación: Diciembre 1, 2020
Text Analytics with Python
Shallow parsing
Noun phrase (NP): El sustantivo (noun) encabeza la frase. Una NP actúa como el sujeto u objeto de un verbo.
Verb phrase (VP): El verbo (verb) encabeza la frase.
Adjective phrase (ADJP): El adjetivo es la cabeza. Califica sustantivos y pronombres en la oración.
Adverb phrase (ADVP): frases que actúan como adverbios.
Prepositional phrase (PP): tienen una preposición al inicio de la frase.
[19]:
##
## Example of the output of a shallow parser
##
# Fix: `nltk` was referenced below (nltk.download, and nltk.pos_tag later in
# the file) but never imported anywhere in the file.
import nltk
from nltk.corpus import treebank_chunk

# Download the Penn Treebank sample (no-op if it is already present).
nltk.download('treebank')

# Chunked (shallow-parsed) sentences, split into train/test partitions.
data = treebank_chunk.chunked_sents()
train_data = data[:3500]
test_data = data[3500:]
print(train_data[7])
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data] Package treebank is already up-to-date!
/usr/local/lib/python3.6/dist-packages/nltk/tokenize/regexp.py:123: FutureWarning: split() requires a non-empty pattern match.
return [tok for tok in self._regexp.split(text) if tok]
(S
(NP A/DT Lorillard/NNP spokewoman/NN)
said/VBD
,/,
``/``
(NP This/DT)
is/VBZ
(NP an/DT old/JJ story/NN)
./.)
[20]:
##
## Defining a regular-expression chunk parser
##
from nltk.chunk import RegexpParser

sentence = "US unveils world's most powerful supercomputer, beats China."

# POS-tag the tokenized sentence first; the chunker works on (word, tag) pairs.
tokens = nltk.word_tokenize(sentence)
tagged_simple_sent = nltk.pos_tag(tokens)
print('POS Tags:', tagged_simple_sent)

# Grammar: an NP is an optional determiner, any number of adjectives, a noun.
chunk_grammar = """
NP: {<DT>?<JJ>*<NN.*>}
"""
rc = RegexpParser(chunk_grammar)
c = rc.parse(tagged_simple_sent)
print()
print(c)
POS Tags: [('US', 'NNP'), ('unveils', 'JJ'), ('world', 'NN'), ("'s", 'POS'), ('most', 'RBS'), ('powerful', 'JJ'), ('supercomputer', 'NN'), (',', ','), ('beats', 'VBZ'), ('China', 'NNP'), ('.', '.')]
(S
(NP US/NNP)
(NP unveils/JJ world/NN)
's/POS
most/RBS
(NP powerful/JJ supercomputer/NN)
,/,
beats/VBZ
(NP China/NNP)
./.)
[21]:
##
## Chink --- token sequences that are cut OUT of a chunk
##
# The first rule chunks the whole sentence as one NP; the second rule
# "chinks" (removes) verb/adjective/preposition sequences from inside it.
chink_grammar = """
NP:
{<.*>+} # Chunk everything as NP
}<VBZ|VBD|JJ|IN>+{ # Chink sequences of VBD\VBZ\JJ\IN
"""
rc = RegexpParser(chink_grammar)
c = rc.parse(tagged_simple_sent)
print(c)
(S
(NP US/NNP)
unveils/JJ
(NP world/NN 's/POS most/RBS)
powerful/JJ
(NP supercomputer/NN ,/,)
beats/VBZ
(NP China/NNP ./.))
[22]:
##
## Improvement: a richer grammar with NP, ADJP, ADVP, PP and VP phrase rules
##
grammar = """
NP: {<DT>?<JJ>?<NN.*>}
ADJP: {<JJ>}
ADVP: {<RB.*>}
PP: {<IN>}
VP: {<MD>?<VB.*>+}
"""
rc = RegexpParser(grammar)
c = rc.parse(tagged_simple_sent)
print(c)
(S
(NP US/NNP)
(NP unveils/JJ world/NN)
's/POS
(ADVP most/RBS)
(NP powerful/JJ supercomputer/NN)
,/,
(VP beats/VBZ)
(NP China/NNP)
./.)
[23]:
##
## Evaluation of the regexp chunker against the held-out treebank split
##
print(rc.evaluate(test_data))
ChunkParse score:
IOB Accuracy: 46.1%
Precision: 19.9%
Recall: 43.3%
F-Measure: 27.3%
[24]:
##
## IOB tags: B- beginning of a chunk
##           I- inside a chunk
##           O- token belongs to no chunk
##
from nltk.chunk.util import tree2conlltags, conlltags2tree
train_sent = train_data[7]
print(train_sent)
(S
(NP A/DT Lorillard/NNP spokewoman/NN)
said/VBD
,/,
``/``
(NP This/DT)
is/VBZ
(NP an/DT old/JJ story/NN)
./.)
[25]:
# Convert the chunk tree into (word, POS tag, IOB chunk tag) triples.
wtc = tree2conlltags(train_sent)
wtc
[25]:
[('A', 'DT', 'B-NP'),
('Lorillard', 'NNP', 'I-NP'),
('spokewoman', 'NN', 'I-NP'),
('said', 'VBD', 'O'),
(',', ',', 'O'),
('``', '``', 'O'),
('This', 'DT', 'B-NP'),
('is', 'VBZ', 'O'),
('an', 'DT', 'B-NP'),
('old', 'JJ', 'I-NP'),
('story', 'NN', 'I-NP'),
('.', '.', 'O')]
[26]:
# Round-trip: rebuild the chunk tree from the CoNLL IOB triples.
tree = conlltags2tree(wtc)
print(tree)
(S
(NP A/DT Lorillard/NNP spokewoman/NN)
said/VBD
,/,
``/``
(NP This/DT)
is/VBZ
(NP an/DT old/JJ story/NN)
./.)
[27]:
def conll_tag_chunks(chunk_sents):
    """Turn chunk trees into [(POS tag, chunk tag), ...] training sequences.

    The word itself is dropped: only the POS tag and the IOB chunk label
    survive, which is exactly what the n-gram chunk tagger is trained on.
    """
    sequences = []
    for sent_tree in chunk_sents:
        triples = tree2conlltags(sent_tree)
        sequences.append([(pos, chunk) for (_word, pos, chunk) in triples])
    return sequences
def combined_tagger(train_data, taggers, backoff=None):
    """Chain tagger factories so each tagger backs off to the previous one.

    Each factory in *taggers* is invoked as ``factory(train_data, backoff=...)``
    with the tagger built so far; the final tagger — which backs off through
    the whole chain — is returned.
    """
    chained = backoff
    for factory in taggers:
        chained = factory(train_data, backoff=chained)
    return chained
[28]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
class NGramTagChunker(ChunkParserI):
    """Shallow parser that learns IOB chunk tags from POS-tag n-grams.

    A backoff chain of taggers (unigram backed off by bigram, by default)
    is trained to map POS-tag sequences to IOB chunk labels.
    """

    # Fix: the default for `tagger_classes` was a mutable list ([...]); a
    # tuple avoids the shared-mutable-default pitfall and stays compatible.
    def __init__(self, train_sentences, tagger_classes=(UnigramTagger, BigramTagger)):
        train_sent_tags = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

    def parse(self, tagged_sentence):
        """Chunk a [(word, POS)] sentence into an nltk Tree (None if empty)."""
        if not tagged_sentence:
            return None
        # Predict an IOB chunk tag for each POS tag, then re-attach the words.
        pos_tags = [tag for _word, tag in tagged_sentence]
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
        chunk_tags = [chunk for (_pos, chunk) in chunk_pos_tags]
        wpc_tags = [
            (word, pos, chunk)
            for ((word, pos), chunk) in zip(tagged_sentence, chunk_tags)
        ]
        return conlltags2tree(wpc_tags)
# Train the n-gram chunker on the treebank split and score it on held-out data.
ntc = NGramTagChunker(train_data)
print(ntc.evaluate(test_data))
ChunkParse score:
IOB Accuracy: 97.2%
Precision: 91.4%
Recall: 94.3%
F-Measure: 92.8%
[29]:
# Fix: the original called `nlp(sentence)` but no spaCy pipeline was ever
# loaded, which raised NameError (see the captured traceback below).  Tag
# the sentence with nltk instead — already imported and used throughout
# this file — which yields the same (word, Penn-tag) pair format.
tagged_sentence = nltk.pos_tag(nltk.word_tokenize(sentence))
tree = ntc.parse(tagged_sentence)
print(tree)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-29-833c8b4a0527> in <module>
----> 1 sentence_nlp = nlp(sentence)
2 tagged_sentence = [(word.text, word.tag_) for word in sentence_nlp]
3 tree = ntc.parse(tagged_sentence)
4 print(tree)
NameError: name 'nlp' is not defined
[ ]:
##
## CoNLL-2000 Wall Street Journal chunking corpus
##
from nltk.corpus import conll2000

wsj_data = conll2000.chunked_sents()
# First 10,000 chunked sentences for training, the remainder for evaluation.
train_wsj_data, test_wsj_data = wsj_data[:10000], wsj_data[10000:]
print(train_wsj_data[10])
[ ]:
# Train a second chunker on the CoNLL-2000 data and score it.
tc = NGramTagChunker(train_wsj_data)
print(tc.evaluate(test_wsj_data))
[ ]:
# Parse the example sentence with the CoNLL-2000-trained chunker.
tree = tc.parse(tagged_sentence)
print(tree)
[ ]:
# NOTE(review): this cell duplicates the CoNLL-2000 loading cell above.
from nltk.corpus import conll2000
wsj_data = conll2000.chunked_sents()
train_wsj_data = wsj_data[:10000]
test_wsj_data = wsj_data[10000:]
print(train_wsj_data[10])
[ ]:
# NOTE(review): duplicate of the training/evaluation cell above.
tc = NGramTagChunker(train_wsj_data)
print(tc.evaluate(test_wsj_data))
[ ]:
# NOTE(review): duplicate of the parsing cell above.
tree = tc.parse(tagged_sentence)
print(tree)
[ ]: