Information extraction
30 min | Last modified: December 9, 2020
Text Analytics with Python
[1]:
import nltk
nltk.download('conll2000')
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data] Package conll2000 is already up-to-date!
[1]:
True
[2]:
##
## Basic preprocessing
##
def ie_preprocess(document):
    ##
    ## Split the document into sentences
    ##
    sentences = nltk.sent_tokenize(document)
    ##
    ## Split each sentence into words
    ##
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    ##
    ## POS-tagging
    ##
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
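A quick check of the pipeline; the sample text below is invented for illustration, and the punkt and averaged_perceptron_tagger resources are assumed to be downloaded:

## Hypothetical sample document for illustration
text = "The little yellow dog barked at the cat. Mary saw it."
for tagged_sent in ie_preprocess(text):
    print(tagged_sent)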
[3]:
##
## Noun phrase chunking
##
## Define the tagged sentence
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]
## Tag pattern for a noun phrase
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
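The chunks can be read directly off the resulting tree; a minimal sketch that prints the words of each NP subtree:

## Extract the words of every NP subtree in the result
for subtree in result.subtrees(filter=lambda t: t.label() == "NP"):
    print(" ".join(word for word, tag in subtree.leaves()))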
[4]:
##
## An alternative rule
##
grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
[5]:
##
## Chunking with regular expressions
##
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
      {<NNP>+}                # chunk sequences of proper nouns
"""
cp = nltk.RegexpParser(grammar)
sentence = [
    ("Rapunzel", "NNP"),
    ("let", "VBD"),
    ("down", "RP"),
    ("her", "PP$"),
    ("long", "JJ"),
    ("golden", "JJ"),
    ("hair", "NN"),
]
print(cp.parse(sentence))
(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))
[6]:
##
## Chunking two consecutive nouns
##
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>}"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))
(S (NP money/NN market/NN) fund/NN)
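Note that fund/NN is left outside the chunk: the rule consumes the first two nouns and cannot overlap them. A sketch of the usual fix, a pattern for one or more consecutive nouns, which should put all three words in a single chunk:

## Greedier rule: chunk any run of one or more nouns
grammar = "NP: {<NN>+}"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))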
[7]:
##
## Example applied to a corpus
##
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
counter = 0
for sent in brown.tagged_sents():
    tree = cp.parse(sent)
    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            print(subtree)
            counter += 1
    if counter > 10:
        break
(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)
(CHUNK like/VB to/TO see/VB)
[8]:
##
## Chinking
## Instead of specifying what to include in a chunk,
## we specify what to exclude from it
##
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
"""
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
[9]:
##
## Training chunkers
##
from nltk.corpus import conll2000
##
## Example of a chunk-tagged sentence.
## The CoNLL 2000 corpus contains 270K words, split into
## train/test sets and annotated with POS tags and chunk tags
##
print(conll2000.chunked_sents('train.txt')[99])
(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)
[10]:
##
## CoNLL defines three chunk types:
## NP: already discussed
## VP: e.g., "has already delivered"
## PP: e.g., "because of"
##
## The following example selects only
## chunks of type NP
##
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)
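Chunk trees like the one above can be flattened into per-word IOB tags (B-NP begins a chunk, I-NP continues it, O is outside any chunk); this is the representation the taggers trained below operate on. A minimal sketch using nltk.chunk.tree2conlltags:

## Flatten the chunk tree into (word, pos, chunk-tag) triples
tree = conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99]
print(nltk.chunk.tree2conlltags(tree)[:5])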
[11]:
##
## Evaluating the accuracy of a regular-expression
## parser against a chunked corpus. This baseline
## parser creates no chunks at all
##
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  43.4%
    Precision:      0.0%
    Recall:         0.0%
    F-Measure:      0.0%
[12]:
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  87.7%
    Precision:     70.6%
    Recall:        67.8%
    F-Measure:     69.2%
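The tag pattern <[CDJNP].*>+ chunks any run of tags whose name starts with C, D, J, N or P (CD, DT, JJ*, NN*, PDT, POS, PRP*, ...). A quick sanity check on the earlier example sentence, which should yield the same two NP chunks as before:

## Sanity check of the single-rule grammar on a toy sentence
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
            ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),
            ("the", "DT"), ("cat", "NN")]
print(cp.parse(sentence))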
[13]:
##
## A unigram tagger is used to assign chunk tags to sentences
##
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        ##
        ## Train the model
        ##
        train_data = [
            [(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        ##
        ## Apply the tagger
        ##
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  92.9%
    Precision:     79.9%
    Recall:        86.8%
    F-Measure:     83.2%
[14]:
##
## To see what the chunker learned, we ask its tagger to assign
## a chunk tag to each of the POS tags used in the corpus
##
postags = sorted(set(pos for sent in train_sents for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))
[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]
[15]:
##
## Same code as above, but using a bigram tagger
##
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [
            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  93.3%
    Precision:     82.3%
    Recall:        86.8%
    F-Measure:     84.5%
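A natural variant, not covered above, is to let the bigram tagger back off to a unigram tagger for contexts it never saw during training; the backoff parameter is standard across NLTK's n-gram taggers. A sketch (the class name is ours, and any effect on the scores is untested):

## Hypothetical variant: bigram tagger with a unigram backoff
class BackoffChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [
            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        unigram = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), (_, chunktag)) in zip(sentence, tagged_pos_tags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)

backoff_chunker = BackoffChunker(train_sents)
print(backoff_chunker.evaluate(test_sents))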
[16]:
##
## Cascaded chunkers
## Built by specifying a multi-stage grammar
## with recursive rules
##
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
##
## Note how the following sentence is chunked:
##
## Mary saw the cat sit on the mat
##
sentence = [
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]
##
## Besides the CLAUSE found below, the chunker should also
## have identified the VP pattern headed by "saw"
##
print(cp.parse(sentence))
(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
[17]:
##
## For the sentence:
##
## John thinks Mary saw the cat sit on the mat
##
## the chunker fails to identify the structure correctly
##
sentence = [
    ("John", "NNP"),
    ("thinks", "VBZ"),
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]
print(cp.parse(sentence))
(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
[18]:
##
## The solution is to let the chunker iterate over its
## patterns several times, so that it can identify
## all of the structures
##
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
[19]:
##
## The following examples show how to build
## a tree like the ones produced in the
## previous examples
##
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)
(NP Alice)
[20]:
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)
(NP the rabbit)
[21]:
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)
(S (NP Alice) (VP chased (NP the rabbit)))
[22]:
print(tree4[1])
(VP chased (NP the rabbit))
[23]:
tree4.leaves()
[23]:
['Alice', 'chased', 'the', 'rabbit']
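A common follow-up is walking such a tree recursively, treating leaves (plain strings) and subtrees differently; a minimal sketch applied to tree4:

## Recursively print a tree: only subtrees respond to label()
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end=" ")               # a leaf
    else:
        print("(", t.label(), end=" ")  # a subtree
        for child in t:
            traverse(child)
        print(")", end=" ")

traverse(tree4)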