Gramáticas con características
30 min | Última modificación: Diciembre 10, 2020
Text Analytics with Python
[1]:
##
## Using dictionaries to describe the features
## of grammatical entities
##
## CAT: grammatical category
## ORTH: orthography (spelling)
## REF: referent
## REL: relation
##
kim = dict(CAT='NP', ORTH='Kim', REF='k')
chase = dict(CAT='V', ORTH='chased', REL='chase')
Concordancia gramatical
Formas de conjugación de un verbo
singular plural
-------------------------------------
1st per I run we run
2nd per you run you run
3rd per he/she/it runs they run
Aproximación directa. Reglas para singular y plural. No es adecuada para gramáticas con muchas reglas.
S -> NP_SG VP_SG
S -> NP_PL VP_PL
NP_SG -> Det_SG N_SG
NP_PL -> Det_PL N_PL
VP_SG -> V_SG
VP_PL -> V_PL
Det_SG -> 'this'
Det_PL -> 'these'
N_SG -> 'dog'
N_PL -> 'dogs'
V_SG -> 'runs'
V_PL -> 'run'
Adición de propiedades a las categorías gramaticales.
Aquí sg denota singular y pl denota plural.
Det[NUM=sg] -> 'this'
Det[NUM=pl] -> 'these'
N[NUM=sg] -> 'dog'
N[NUM=pl] -> 'dogs'
V[NUM=sg] -> 'runs'
V[NUM=pl] -> 'run'
Resulta más apropiado permitir variables en las propiedades: ?n denota los posibles valores de NUM.
S -> NP[NUM=?n] VP[NUM=?n]
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
VP[NUM=?n] -> V[NUM=?n]
[2]:
%%writefile feat0.fcfg
% start S
# ###################
# Grammar Productions
# ###################
# S expansion productions
S -> NP[NUM=?n] VP[NUM=?n]
# NP expansion productions
NP[NUM=?n] -> N[NUM=?n]
NP[NUM=?n] -> PropN[NUM=?n]
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
NP[NUM=pl] -> N[NUM=pl]
# VP expansion productions
VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
# ###################
# Lexical Productions
# ###################
Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'
PropN[NUM=sg]-> 'Kim' | 'Jody'
N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children'
IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks'
IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk'
IV[TENSE=past] -> 'disappeared' | 'walked'
TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
TV[TENSE=past] -> 'saw' | 'liked'
Overwriting feat0.fcfg
[3]:
from nltk import load_parser

## build the parser from the grammar stored in feat0.fcfg
parser = load_parser('feat0.fcfg', trace=0)

## sentence to analyse, split on whitespace into tokens
tokens = 'Kim likes children'.split()

## print every parse tree produced for the sentence
## (the loop body must be indented for the cell to be valid Python)
for tree in parser.parse(tokens):
    print(tree)

# TV: transitive verbs
# IV: intransitive verbs
(S[]
(NP[NUM='sg'] (PropN[NUM='sg'] Kim))
(VP[NUM='sg', TENSE='pres']
(TV[NUM='sg', TENSE='pres'] likes)
(NP[NUM='pl'] (N[NUM='pl'] children))))
Terminología
La notación +/- se usa para representar true/false. En el siguiente ejemplo, AUX indica si el verbo es usado como auxiliar.
V[TENSE=pres, AUX=+] -> 'can'
V[TENSE=pres, AUX=+] -> 'may'
V[TENSE=pres, AUX=-] -> 'walks'
V[TENSE=pres, AUX=-] -> 'likes'
No obstante, se suele abreviar AUX=+ como +AUX y AUX=- como -AUX:
V[TENSE=pres, +AUX] -> 'can'
V[TENSE=pres, +AUX] -> 'may'
V[TENSE=pres, -AUX] -> 'walks'
V[TENSE=pres, -AUX] -> 'likes'
Concordancia de características:
S -> NP[AGR=?n] VP[AGR=?n]
NP[AGR=?n] -> PropN[AGR=?n]
VP[TENSE=?t, AGR=?n] -> Cop[TENSE=?t, AGR=?n] Adj
Cop[TENSE=pres, AGR=[NUM=sg, PER=3]] -> 'is'
PropN[AGR=[NUM=sg, PER=3]] -> 'Kim'
Adj -> 'happy'
[4]:
##
## Working with feature structures in NLTK
##
import nltk
## atomic feature values (strings or integers)
fs1 = nltk.FeatStruct(TENSE='past', NUM='sg')
print(fs1)
[ NUM = 'sg' ]
[ TENSE = 'past' ]
[5]:
##
## Feature structures behave like dictionaries
##
fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem')
## look up the value stored under a key
print(fs1['GND'])
fem
[6]:
##
## Assigning a new feature to an existing structure
##
fs1['CASE'] = 'acc'
print(fs1)
[ CASE = 'acc' ]
[ GND = 'fem' ]
[ NUM = 'pl' ]
[ PER = 3 ]
[7]:
##
## Structures with complex values: fs1 itself is stored
## as the value of the AGR feature
##
fs2 = nltk.FeatStruct(POS='N', AGR=fs1)
print(fs2)
[ [ CASE = 'acc' ] ]
[ AGR = [ GND = 'fem' ] ]
[ [ NUM = 'pl' ] ]
[ [ PER = 3 ] ]
[ ]
[ POS = 'N' ]
[8]:
##
## Retrieving a complex value by key
##
print(fs2['AGR'])
[ CASE = 'acc' ]
[ GND = 'fem' ]
[ NUM = 'pl' ]
[ PER = 3 ]
[9]:
##
## Retrieving a value from inside a nested structure
##
print(fs2['AGR']['PER'])
3
[10]:
##
## Building a complex structure from its bracketed string form
##
print(nltk.FeatStruct("[POS='N', AGR=[PER=3, NUM='pl', GND='fem']]"))
[ [ GND = 'fem' ] ]
[ AGR = [ NUM = 'pl' ] ]
[ [ PER = 3 ] ]
[ ]
[ POS = 'N' ]
[11]:
##
## Structures can also hold non-linguistic data
## (here a name, a phone number and an age)
##
print(nltk.FeatStruct(NAME='Lee', TELNO='01 27 86 42 96', AGE=33))
[ AGE = 33 ]
[ NAME = 'Lee' ]
[ TELNO = '01 27 86 42 96' ]
[12]:
##
## Referring back to a value that was already assigned.
## Note the (1) tag on ADDRESS and the ->(1) reference to it
## inside SPOUSE.
##
print(
nltk.FeatStruct(
"""
[
NAME='Lee',
ADDRESS=(1)[
NUMBER=74,
STREET='rue Pascal'
],
SPOUSE=[
NAME='Kim',
ADDRESS->(1)
]
]
"""
)
)
[ ADDRESS = (1) [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ SPOUSE = [ ADDRESS -> (1) ] ]
[ [ NAME = 'Kim' ] ]
[13]:
##
## Another reference example: D and E both point to the
## value tagged (1) under B
##
print(
nltk.FeatStruct(
"""
[
A='a',
B=(1)[C='c'],
D->(1),
E->(1)
]
"""
)
)
[ A = 'a' ]
[ ]
[ B = (1) [ C = 'c' ] ]
[ ]
[ D -> (1) ]
[ E -> (1) ]
[14]:
##
## Feature unification: with no feature names in common
## the result contains the features of both structures
## (like a set union)
##
fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal')
fs2 = nltk.FeatStruct(CITY='Paris')
print(fs1.unify(fs2))
[ CITY = 'Paris' ]
[ NUMBER = 74 ]
[ STREET = 'rue Pascal' ]
[15]:
##
## Unifying structures that give the same feature
## different values: the recorded output is None
##
fs0 = nltk.FeatStruct(A='a')
fs1 = nltk.FeatStruct(A='b')
fs2 = fs0.unify(fs1)
print(fs2)
None
[16]:
##
## Unification and structure sharing: here the two
## addresses are written out as separate, equal copies
##
fs0 = nltk.FeatStruct(
"""
[
NAME=Lee,
ADDRESS=[
NUMBER=74,
STREET='rue Pascal'
],
SPOUSE=[
NAME=Kim,
ADDRESS=[
NUMBER=74,
STREET='rue Pascal'
]
]
]
"""
)
print(fs0)
[ ADDRESS = [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ [ ADDRESS = [ NUMBER = 74 ] ] ]
[ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
[ [ ] ]
[ [ NAME = 'Kim' ] ]
[17]:
##
## Extending the structure with new data
##
fs1 = nltk.FeatStruct(
"""
[
SPOUSE = [
ADDRESS = [CITY = Paris]
]
]
"""
)
##
## Note that unification adds CITY to the spouse's ADDRESS;
## in the recorded output the top-level ADDRESS has no CITY
##
print(fs1.unify(fs0))
[ ADDRESS = [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ [ [ CITY = 'Paris' ] ] ]
[ [ ADDRESS = [ NUMBER = 74 ] ] ]
[ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ]
[ [ ] ]
[ [ NAME = 'Kim' ] ]
[18]:
##
## Different behaviour when the spouse's address is a
## reference ->(1) to the main address
##
fs2 = nltk.FeatStruct(
"""
[
NAME = Lee,
ADDRESS = (1)[NUMBER=74, STREET='rue Pascal'],
SPOUSE=[NAME=Kim, ADDRESS->(1)]
]
"""
)
##
## Note that CITY = 'Paris' now appears in the main
## address, which the spouse's ADDRESS points to
##
print(fs1.unify(fs2))
[ [ CITY = 'Paris' ] ]
[ ADDRESS = (1) [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ NAME = 'Lee' ]
[ ]
[ SPOUSE = [ ADDRESS -> (1) ] ]
[ [ NAME = 'Kim' ] ]
[19]:
##
## Using variables: the same variable ?x is the value of
## both ADDRESS1 and ADDRESS2
##
fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]")
fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]")
print(fs2)
[ ADDRESS1 = ?x ]
[ ADDRESS2 = ?x ]
[20]:
##
## Binding the variable from the data in fs1: in the
## recorded output ADDRESS2 points to the same structure
## as ADDRESS1
##
print(fs2.unify(fs1))
[ ADDRESS1 = (1) [ NUMBER = 74 ] ]
[ [ STREET = 'rue Pascal' ] ]
[ ]
[ ADDRESS2 -> (1) ]
Subcategorización
Gramática original.
S -> NP[NUM=?n] VP[NUM=?n]
NP[NUM=?n] -> N[NUM=?n]
NP[NUM=?n] -> PropN[NUM=?n]
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
NP[NUM=pl] -> N[NUM=pl]
##
## Cambia esta definición. Se pueden manejar
## mediante propiedades
##
VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n]
VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP
# igual a partir de aca
Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'
PropN[NUM=sg]-> 'Kim' | 'Jody'
N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child'
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children'
##
## Se hace innecesaria esta parte
##
IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks'
IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk'
IV[TENSE=past] -> 'disappeared' | 'walked'
TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes'
TV[TENSE=pres, NUM=pl] -> 'see' | 'like'
TV[TENSE=past] -> 'saw' | 'liked'
Gramática modificada.
[21]:
%%writefile feat1_exam.fcfg
%start S
S -> NP[NUM=?n] VP[NUM=?n]
NP[NUM=?n] -> N[NUM=?n]
NP[NUM=?n] -> PropN[NUM=?n]
NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n]
NP[NUM=pl] -> N[NUM=pl]
##
## Nueva definición:
## agrega SUBCAT = {intrans, trans, clause}
##
VP[TENSE=?t, NUM=?n] -> V[SUBCAT=intrans, TENSE=?t, NUM=?n]
VP[TENSE=?t, NUM=?n] -> V[SUBCAT=trans, TENSE=?t, NUM=?n] NP
VP[TENSE=?t, NUM=?n] -> V[SUBCAT=clause, TENSE=?t, NUM=?n] SBar
V[SUBCAT=intrans, TENSE=pres, NUM=sg] -> 'disappears' | 'walks' | 'puts'
V[SUBCAT=trans, TENSE=pres, NUM=sg] -> 'sees' | 'likes'
V[SUBCAT=clause, TENSE=pres, NUM=sg] -> 'says' | 'claims'
V[SUBCAT=intrans, TENSE=pres, NUM=pl] -> 'disappear' | 'walk' | 'put'
V[SUBCAT=trans, TENSE=pres, NUM=pl] -> 'see' | 'like'
V[SUBCAT=clause, TENSE=pres, NUM=pl] -> 'say' | 'claim'
V[SUBCAT=intrans, TENSE=past, NUM=?n] -> 'disappeared' | 'walked' | 'put'
V[SUBCAT=trans, TENSE=past, NUM=?n] -> 'saw' | 'liked' | 'put'
V[SUBCAT=clause, TENSE=past, NUM=?n] -> 'said' | 'claimed' | 'put'
# igual a partir de aca
Det[NUM=sg] -> 'this' | 'every'
Det[NUM=pl] -> 'these' | 'all'
Det -> 'the' | 'some' | 'several'
PropN[NUM=sg]-> 'Kim' | 'Jody'
N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child' | 'table' | 'book'
N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' | 'tables' | 'books'
##
## Agrega dos producciones
##
SBar -> Comp S
Comp -> 'that'
Overwriting feat1_exam.fcfg
[22]:
from nltk import load_parser

## build the parser from the grammar stored in feat1_exam.fcfg
parser = load_parser('feat1_exam.fcfg', trace=0)

## sentence to analyse, split on whitespace into tokens
tokens = 'Kim claims that Jody likes children'.split()

## print every parse tree produced for the sentence
## (the loop body must be indented for the cell to be valid Python)
for tree in parser.parse(tokens):
    print(tree)
(S[]
(NP[NUM='sg'] (PropN[NUM='sg'] Kim))
(VP[NUM='sg', TENSE='pres']
(V[NUM='sg', SUBCAT='clause', TENSE='pres'] claims)
(SBar[]
(Comp[] that)
(S[]
(NP[NUM='sg'] (PropN[NUM='sg'] Jody))
(VP[NUM='sg', TENSE='pres']
(V[NUM='sg', SUBCAT='trans', TENSE='pres'] likes)
(NP[NUM='pl'] (N[NUM='pl'] children)))))))
[23]:
##
## ------------------------------- Hasta aqui -------------------------------
##
[24]:
##
## Generalized Phrase Structure Grammar (GPSG)
##
[25]:
%%writefile feat1.fcfg
% start S
# ###################
# Grammar Productions
# ###################
S[-INV] -> NP VP
S[-INV]/?x -> NP VP/?x
S[-INV] -> NP S/NP
S[-INV] -> Adv[+NEG] S[+INV]
S[+INV] -> V[+AUX] NP VP
S[+INV]/?x -> V[+AUX] NP VP/?x
SBar -> Comp S[-INV]
SBar/?x -> Comp S[-INV]/?x
VP -> V[SUBCAT=intrans, -AUX]
VP -> V[SUBCAT=trans, -AUX] NP
VP/?x -> V[SUBCAT=trans, -AUX] NP/?x
VP -> V[SUBCAT=clause, -AUX] SBar
VP/?x -> V[SUBCAT=clause, -AUX] SBar/?x
VP -> V[+AUX] VP
VP/?x -> V[+AUX] VP/?x
# ###################
# Lexical Productions
# ###################
V[SUBCAT=intrans, -AUX] -> 'walk' | 'sing'
V[SUBCAT=trans, -AUX] -> 'see' | 'like'
V[SUBCAT=clause, -AUX] -> 'say' | 'claim'
V[+AUX] -> 'do' | 'can'
NP[-WH] -> 'you' | 'cats'
NP[+WH] -> 'who'
Adv[+NEG] -> 'rarely' | 'never'
NP/NP ->
Comp -> 'that'
Overwriting feat1.fcfg
[26]:
from nltk import load_parser

## wh-question to analyse, split on whitespace into tokens
tokens = 'who do you claim that you like'.split()

## parser for the grammar stored in feat1.fcfg; later cells reuse cp
cp = load_parser('feat1.fcfg')

## print every parse tree produced for the sentence
## (the loop body must be indented for the cell to be valid Python)
for tree in cp.parse(tokens):
    print(tree)
(S[-INV]
(NP[+WH] who)
(S[+INV]/NP[]
(V[+AUX] do)
(NP[-WH] you)
(VP[]/NP[]
(V[-AUX, SUBCAT='clause'] claim)
(SBar[]/NP[]
(Comp[] that)
(S[-INV]/NP[]
(NP[-WH] you)
(VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] )))))))
[27]:
## declarative sentence with an embedded "that" clause;
## cp is the parser loaded from feat1.fcfg in an earlier cell
tokens = 'you claim that you like cats'.split()

## print every parse tree (loop body indented so the cell is valid Python)
for tree in cp.parse(tokens):
    print(tree)
(S[-INV]
(NP[-WH] you)
(VP[]
(V[-AUX, SUBCAT='clause'] claim)
(SBar[]
(Comp[] that)
(S[-INV]
(NP[-WH] you)
(VP[] (V[-AUX, SUBCAT='trans'] like) (NP[-WH] cats))))))
[28]:
## sentence starting with a negative adverb followed by an
## inverted clause; cp is the parser loaded from feat1.fcfg
## in an earlier cell
tokens = 'rarely do you sing'.split()

## print every parse tree (loop body indented so the cell is valid Python)
for tree in cp.parse(tokens):
    print(tree)
(S[-INV]
(Adv[+NEG] rarely)
(S[+INV]
(V[+AUX] do)
(NP[-WH] you)
(VP[] (V[-AUX, SUBCAT='intrans'] sing))))
[ ]:
[29]:
## Strip interactive-prompt markers from a pasted console snippet
## so that only the plain code remains, then show the result.
text = '''
>>> tokens = 'rarely do you sing'.split()
>>> for tree in cp.parse(tokens):
... print(tree)
'''
## markers are removed one after another, in this exact order
for marker in (">>> ", "... ", "...", "\t"):
    text = text.replace(marker, "")
print(text)
tokens = 'rarely do you sing'.split()
for tree in cp.parse(tokens):
print(tree)
[ ]: