Information extraction

  • 30 min | Last modified: December 9, 2020

http://www.nltk.org/book/

Text Analytics with Python

[1]:
import nltk

nltk.download('conll2000')
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[1]:
True
[2]:
##
## Basic preprocessing
##
def ie_preprocess(document):
    ##
    ## Split the document into sentences
    ##
    sentences = nltk.sent_tokenize(document)

    ##
    ## Split each sentence into words
    ##
    sentences = [nltk.word_tokenize(sent) for sent in sentences]

    ##
    ## POS-tagging
    ##
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences
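
##
## A quick end-to-end check (a sketch; assumes the 'punkt' and
## 'averaged_perceptron_tagger' NLTK resources are installed):
##
ie_preprocess("The little yellow dog barked at the cat.")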
[3]:
##
## Noun phrase chunking
##

## Define the tagged sentence
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]

## Tag pattern: optional determiner, any adjectives, then a noun
grammar = "NP: {<DT>?<JJ>*<NN>}"

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
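
##
## The result is an nltk.Tree, so the NP chunks can be pulled out
## by filtering its subtrees (a minimal sketch):
##
for subtree in result.subtrees(filter=lambda t: t.label() == "NP"):
    print(subtree)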
[4]:
##
## Alternative rule: also matches comparative/superlative
## adjectives (JJR, JJS) and all noun variants (NN, NNS,
## NNP, NNPS), including sequences of nouns
##
grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}"

cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
[5]:
##
## Chunking with regular expressions
##
grammar = r"""
  NP: {<DT|PP\$>?<JJ>*<NN>}   # chunk determiner/possessive, adjectives and noun
      {<NNP>+}                # chunk sequences of proper nouns
"""

cp = nltk.RegexpParser(grammar)
sentence = [
    ("Rapunzel", "NNP"),
    ("let", "VBD"),
    ("down", "RP"),
    ("her", "PP$"),
    ("long", "JJ"),
    ("golden", "JJ"),
    ("hair", "NN"),
]

print(cp.parse(sentence))
(S
  (NP Rapunzel/NNP)
  let/VBD
  down/RP
  (NP her/PP$ long/JJ golden/JJ hair/NN))
[6]:
##
## Chunking two consecutive nouns
##
nouns = [("money", "NN"), ("market", "NN"), ("fund", "NN")]
grammar = "NP: {<NN><NN>}"
cp = nltk.RegexpParser(grammar)
print(cp.parse(nouns))
(S (NP money/NN market/NN) fund/NN)
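
##
## Note that "fund" is left out: once "money market" is chunked,
## the overlapping match is gone. A more permissive rule chunks
## noun sequences of any length:
##
cp = nltk.RegexpParser("NP: {<NN>+}")
print(cp.parse(nouns))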
[7]:
##
## Example applied to a corpus: find "verb to verb"
## patterns in the Brown corpus
##
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown

counter = 0
for sent in brown.tagged_sents():

    tree = cp.parse(sent)

    for subtree in tree.subtrees():
        if subtree.label() == 'CHUNK':
            print(subtree)
            counter += 1

    if counter > 10:
        break
(CHUNK combined/VBN to/TO achieve/VB)
(CHUNK continue/VB to/TO place/VB)
(CHUNK serve/VB to/TO protect/VB)
(CHUNK wanted/VBD to/TO wait/VB)
(CHUNK allowed/VBN to/TO place/VB)
(CHUNK expected/VBN to/TO become/VB)
(CHUNK expected/VBN to/TO approve/VB)
(CHUNK expected/VBN to/TO make/VB)
(CHUNK intends/VBZ to/TO make/VB)
(CHUNK seek/VB to/TO set/VB)
(CHUNK like/VB to/TO see/VB)
[8]:
##
## Chinking
##   Defines what to exclude from a chunk
##   instead of what to include
##
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
  """
sentence = [
    ("the", "DT"),
    ("little", "JJ"),
    ("yellow", "JJ"),
    ("dog", "NN"),
    ("barked", "VBD"),
    ("at", "IN"),
    ("the", "DT"),
    ("cat", "NN"),
]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))
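
##
## The same chunk structure can be viewed in the IOB format used
## below for training chunkers: B- marks the first token of a
## chunk, I- a token inside a chunk, and O a token outside any
## chunk:
##
print(nltk.chunk.tree2conlltags(cp.parse(sentence)))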
[9]:
##
## Training chunkers
##
from nltk.corpus import conll2000

##
## Example of a chunk-tagged sentence.
## The CoNLL 2000 corpus contains 270K words, split into
## train/test sets and annotated with POS tags and chunk tags
##
print(conll2000.chunked_sents('train.txt')[99])
(S
  (PP Over/IN)
  (NP a/DT cup/NN)
  (PP of/IN)
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  (VP told/VBD)
  (NP his/PRP$ story/NN)
  ./.)
[10]:
##
## CoNLL 2000 has three chunk types:
##   NP: covered above
##   VP: e.g., "has already delivered"
##   PP: e.g., "because of"
##
## The following example selects only the
## NP chunks
##
print(conll2000.chunked_sents('train.txt', chunk_types=['NP'])[99])
(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)
[11]:
##
## Evaluating the accuracy of a regular-expression
## parser against a chunked corpus. This baseline
## parser creates no chunks at all
##
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  43.4%
    Precision:      0.0%
    Recall:         0.0%
    F-Measure:      0.0%
[12]:
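##
## One-rule grammar: chunk any sequence of tags that begin with
## one of the letters C, D, J, N, or P (CD, DT, JJ, NN, PRP$, ...)
##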
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  87.7%
    Precision:     70.6%
    Recall:        67.8%
    F-Measure:     69.2%
[13]:
##
## Use a unigram tagger to mark sentences with chunk tags.
##
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):

        ##
        ## Train the model
        ##
        train_data = [
            [(t, c) for _, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        ##
        ## Apply the tagger
        ##
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)


test_sents = conll2000.chunked_sents("test.txt", chunk_types=["NP"])
train_sents = conll2000.chunked_sents("train.txt", chunk_types=["NP"])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  92.9%
    Precision:     79.9%
    Recall:        86.8%
    F-Measure:     83.2%
[14]:
##
## To see what the tagger learned, assign a chunk tag to
## each of the tags used in POS-tagging
##
postags = sorted(set(pos for sent in train_sents for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))
[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]
[15]:
##
## Same code as above, but using a bigram tagger
##
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [
            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [
            (word, pos, chunktag)
            for ((word, pos), chunktag) in zip(sentence, chunktags)
        ]
        return nltk.chunk.conlltags2tree(conlltags)


bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
ChunkParse score:
    IOB Accuracy:  93.3%
    Precision:     82.3%
    Recall:        86.8%
    F-Measure:     84.5%
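
##
## A hypothetical variant (not in the original text): NLTK taggers
## accept a backoff tagger, so the bigram tagger can fall back to
## unigram statistics for POS bigrams unseen during training
##
class BackoffChunker(BigramChunker):
    def __init__(self, train_sents):
        train_data = [
            [(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        unigram = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=unigram)


backoff_chunker = BackoffChunker(train_sents)
print(backoff_chunker.evaluate(test_sents))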
[16]:
##
## Cascaded chunkers
##   Built by specifying a multi-stage chunk grammar
##   with recursive rules.
##
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)

##
## Note how the following sentence is chunked:
##
##    Mary saw the cat sit on the mat
##
sentence = [
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]

##
## Where the output shows CLAUSE, the chunker misses the VP
## headed by "saw": the VP rule has already been applied by
## the time the CLAUSE chunk is created
##
print(cp.parse(sentence))
(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
[17]:
##
## For the sentence:
##
##    John thinks Mary saw the cat sit on the mat
##
## the chunker does not identify the structure correctly
##
sentence = [
    ("John", "NNP"),
    ("thinks", "VBZ"),
    ("Mary", "NN"),
    ("saw", "VBD"),
    ("the", "DT"),
    ("cat", "NN"),
    ("sit", "VB"),
    ("on", "IN"),
    ("the", "DT"),
    ("mat", "NN"),
]
print(cp.parse(sentence))
(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
[18]:
##
## The solution is to let the chunker loop over its patterns
## several times, so that it can identify all the structures
##
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
(S
  (NP John/NNP)
  thinks/VBZ
  (CLAUSE
    (NP Mary/NN)
    (VP
      saw/VBD
      (CLAUSE
        (NP the/DT cat/NN)
        (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
[19]:
##
## The following examples show how to build
## a tree like the ones in the previous
## examples
##
tree1 = nltk.Tree('NP', ['Alice'])
print(tree1)
(NP Alice)
[20]:
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
print(tree2)
(NP the rabbit)
[21]:
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)
(S (NP Alice) (VP chased (NP the rabbit)))
[22]:
print(tree4[1])
(VP chased (NP the rabbit))
[23]:
tree4.leaves()
[23]:
['Alice', 'chased', 'the', 'rabbit']
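
##
## Trees can also be processed recursively. A minimal traversal
## sketch (following the NLTK book) that prints the tree with
## explicit brackets:
##
def traverse(t):
    try:
        t.label()
    except AttributeError:
        ## t is a leaf (a plain string)
        print(t, end=" ")
    else:
        ## t is a Tree: print its label and recurse into children
        print("(", t.label(), end=" ")
        for child in t:
            traverse(child)
        print(")", end=" ")


traverse(tree4)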