Representation schemes

  • 30 min | Last modified: Sept 22, 2020

Data preparation

[15]:
%%writefile documents.txt
The sky is blue and beautiful.
Love this blue and beautiful sky!
The quick brown fox jumps over the lazy dog.
A king's breakfast has sausages, ham, bacon, eggs, toast and beans
I love green eggs, ham, sausages and bacon!
The brown fox is quick and the blue dog is lazy!
The sky is very blue and the sky is very beautiful today
The dog is lazy but the brown fox is quick!
Writing documents.txt
[16]:
##
## Read the corpus
##
with open('documents.txt', 'r') as f:
    documents = f.readlines()

documents = [t.replace('\n', '') for t in documents]
documents
[16]:
['The sky is blue and beautiful.',
 'Love this blue and beautiful sky!',
 'The quick brown fox jumps over the lazy dog.',
 "A king's breakfast has sausages, ham, bacon, eggs, toast and beans",
 'I love green eggs, ham, sausages and bacon!',
 'The brown fox is quick and the blue dog is lazy!',
 'The sky is very blue and the sky is very beautiful today',
 'The dog is lazy but the brown fox is quick!']
[17]:
%%writefile labels.txt
weather
weather
animals
food
food
animals
weather
animals
Overwriting labels.txt
[18]:
##
## Read the labels
##
with open('labels.txt', 'r') as f:
    labels = f.readlines()

labels = [t.replace('\n', '') for t in labels]
labels
[18]:
['weather',
 'weather',
 'animals',
 'food',
 'food',
 'animals',
 'weather',
 'animals']
[19]:
import pandas as pd

corpus = pd.DataFrame({'Document': documents, 'Labels': labels})
corpus
[19]:
Document Labels
0 The sky is blue and beautiful. weather
1 Love this blue and beautiful sky! weather
2 The quick brown fox jumps over the lazy dog. animals
3 A king's breakfast has sausages, ham, bacon, e... food
4 I love green eggs, ham, sausages and bacon! food
5 The brown fox is quick and the blue dog is lazy! animals
6 The sky is very blue and the sky is very beaut... weather
7 The dog is lazy but the brown fox is quick! animals

Preprocessing

[23]:
import nltk
import re

tokenizer = nltk.WordPunctTokenizer()

# Requires the stopwords corpus: nltk.download('stopwords')
STOPWORDS = nltk.corpus.stopwords.words('english')

def normalize_document(document):
    # Keep only ASCII letters and whitespace
    document = re.sub(r'[^a-zA-Z\s]', '', document, flags=re.I | re.A)
    # Lowercase, tokenize, and drop stopwords
    document = document.lower().strip()
    tokens = tokenizer.tokenize(document)
    tokens = [token for token in tokens if token not in STOPWORDS]
    document = ' '.join(tokens)
    return document

corpus['Normalized_Document'] = corpus.Document.map(normalize_document)
corpus
[23]:
Document Labels Normalized_Document
0 The sky is blue and beautiful. weather sky blue beautiful
1 Love this blue and beautiful sky! weather love blue beautiful sky
2 The quick brown fox jumps over the lazy dog. animals quick brown fox jumps lazy dog
3 A king's breakfast has sausages, ham, bacon, e... food kings breakfast sausages ham bacon eggs toast ...
4 I love green eggs, ham, sausages and bacon! food love green eggs ham sausages bacon
5 The brown fox is quick and the blue dog is lazy! animals brown fox quick blue dog lazy
6 The sky is very blue and the sky is very beaut... weather sky blue sky beautiful today
7 The dog is lazy but the brown fox is quick! animals dog lazy brown fox quick
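
A quick check on a sentence outside the corpus (the example phrase is made up) shows the pipeline stripping punctuation and stopwords:

normalize_document("The dogs are running in the blue sky!")
# 'dogs running blue sky'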

Bag of Words (BoW) model

[35]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    # encoding='utf-8',              # text encoding
    # decode_error='strict',         # how to handle decoding errors
    # strip_accents=None,            # accent removal
    # lowercase=True,                # lowercase before tokenizing
    # preprocessor=None,             # custom preprocessing function
    # tokenizer=None,                # custom tokenizer
    # stop_words=None,               # stopword list
    # token_pattern='(?u)\b\w\w+\b', # regex that defines a token
    # ngram_range=(1, 1),            # n-gram sizes to extract
    # analyzer='word',               # analyze words or characters
    max_df=1.0,                      # drop terms above this document frequency
    min_df=1,                        # drop terms below this document frequency
    # max_features=None,             # cap on vocabulary size
    # vocabulary=None,               # user-supplied fixed vocabulary
    # binary=False                   # presence/absence instead of counts
)

bow = cv.fit_transform(corpus.Normalized_Document.tolist())

## sparse matrix
print(bow)
  (0, 17)       1
  (0, 3)        1
  (0, 2)        1
  (1, 17)       1
  (1, 3)        1
  (1, 2)        1
  (1, 14)       1
  (2, 15)       1
  (2, 5)        1
  (2, 8)        1
  (2, 11)       1
  (2, 13)       1
  (2, 6)        1
  (3, 12)       1
  (3, 4)        1
  (3, 16)       1
  (3, 10)       1
  (3, 0)        1
  (3, 7)        1
  (3, 18)       1
  (3, 1)        1
  (4, 14)       1
  (4, 16)       1
  (4, 10)       1
  (4, 0)        1
  (4, 7)        1
  (4, 9)        1
  (5, 3)        1
  (5, 15)       1
  (5, 5)        1
  (5, 8)        1
  (5, 13)       1
  (5, 6)        1
  (6, 17)       2
  (6, 3)        1
  (6, 2)        1
  (6, 19)       1
  (7, 15)       1
  (7, 5)        1
  (7, 8)        1
  (7, 13)       1
  (7, 6)        1
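
Before densifying, the sparse layout can be summarized directly from the CSR attributes (a minimal sketch):

print(bow.shape)  # (8, 20): 8 documents x 20 vocabulary terms
print(bow.nnz)    # number of explicitly stored (nonzero) counts
print(1 - bow.nnz / (bow.shape[0] * bow.shape[1]))  # fraction of zeros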
[36]:
bow = bow.toarray()
bow
[36]:
array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])
[37]:
##
## Representation as a dataframe
##
pd.DataFrame(bow, columns=cv.get_feature_names())
[37]:
bacon beans beautiful blue breakfast brown dog eggs fox green ham jumps kings lazy love quick sausages sky toast today
0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
2 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 0
3 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
4 1 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0 0
5 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0
6 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1
7 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0
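
The fitted vectorizer can also encode unseen text against the same vocabulary; out-of-vocabulary terms are silently dropped. A minimal sketch (the sentence is made up):

new_doc = normalize_document("My dog loves the beautiful green sky!")
pd.DataFrame(cv.transform([new_doc]).toarray(), columns=cv.get_feature_names())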

Bag of N-Grams model

[41]:
cv = CountVectorizer(
    # encoding='utf-8',              # text encoding
    # decode_error='strict',         # how to handle decoding errors
    # strip_accents=None,            # accent removal
    # lowercase=True,                # lowercase before tokenizing
    # preprocessor=None,             # custom preprocessing function
    # tokenizer=None,                # custom tokenizer
    # stop_words=None,               # stopword list
    # token_pattern='(?u)\b\w\w+\b', # regex that defines a token
    ngram_range=(2, 2),              # extract bigrams only
    # analyzer='word',               # analyze words or characters
    max_df=1.0,                      # drop terms above this document frequency
    min_df=1,                        # drop terms below this document frequency
    # max_features=None,             # cap on vocabulary size
    # vocabulary=None,               # user-supplied fixed vocabulary
    # binary=False                   # presence/absence instead of counts
)

bon = cv.fit_transform(corpus.Normalized_Document.tolist())
pd.DataFrame(bon.toarray(), columns=cv.get_feature_names()).T

[41]:
0 1 2 3 4 5 6 7
bacon eggs 0 0 0 1 0 0 0 0
beautiful sky 0 1 0 0 0 0 0 0
beautiful today 0 0 0 0 0 0 1 0
blue beautiful 1 1 0 0 0 0 0 0
blue dog 0 0 0 0 0 1 0 0
blue sky 0 0 0 0 0 0 1 0
breakfast sausages 0 0 0 1 0 0 0 0
brown fox 0 0 1 0 0 1 0 1
dog lazy 0 0 0 0 0 1 0 1
eggs ham 0 0 0 0 1 0 0 0
eggs toast 0 0 0 1 0 0 0 0
fox jumps 0 0 1 0 0 0 0 0
fox quick 0 0 0 0 0 1 0 1
green eggs 0 0 0 0 1 0 0 0
ham bacon 0 0 0 1 0 0 0 0
ham sausages 0 0 0 0 1 0 0 0
jumps lazy 0 0 1 0 0 0 0 0
kings breakfast 0 0 0 1 0 0 0 0
lazy brown 0 0 0 0 0 0 0 1
lazy dog 0 0 1 0 0 0 0 0
love blue 0 1 0 0 0 0 0 0
love green 0 0 0 0 1 0 0 0
quick blue 0 0 0 0 0 1 0 0
quick brown 0 0 1 0 0 0 0 0
sausages bacon 0 0 0 0 1 0 0 0
sausages ham 0 0 0 1 0 0 0 0
sky beautiful 0 0 0 0 0 0 1 0
sky blue 1 0 0 0 0 0 1 0
toast beans 0 0 0 1 0 0 0 0
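
ngram_range can also mix sizes; (1, 2), for instance, keeps every unigram and bigram in a single vocabulary. A brief sketch (cv_mixed is an illustrative name):

cv_mixed = CountVectorizer(ngram_range=(1, 2))
cv_mixed.fit(corpus.Normalized_Document.tolist())
len(cv_mixed.get_feature_names())  # unigrams + bigrams combined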

TF-IDF model

\[tfidf(w, d) = tf(w, d) \times idf(w)\]
  • tf(w, d): term frequency, the number of times the term w appears in document d.

  • idf(w): inverse document frequency, where N is the number of documents in the corpus and df(w) is the number of documents that contain w:

\[idf(w) = 1+\log\frac{N}{1+df(w)}\]
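
For example, sky appears in 3 of the 8 normalized documents, so with a natural logarithm:

\[idf(sky) = 1+\log\frac{8}{1+3} = 1+\log 2 \approx 1.69\]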
[43]:
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer

# Raw term counts
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus.Normalized_Document.tolist())
cv_matrix = cv_matrix.toarray()

# Convert the counts to L2-normalized tf-idf weights
tt = TfidfTransformer(norm='l2', use_idf=True)
tt_matrix = tt.fit_transform(cv_matrix)
tt_matrix = tt_matrix.toarray()

vocab = cv.get_feature_names()
pd.DataFrame(np.round(tt_matrix, 2), columns=vocab)

[43]:
bacon beans beautiful blue breakfast brown dog eggs fox green ham jumps kings lazy love quick sausages sky toast today
0 0.00 0.00 0.60 0.53 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 0.00 0.0
1 0.00 0.00 0.49 0.43 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.57 0.00 0.00 0.49 0.00 0.0
2 0.00 0.00 0.00 0.00 0.00 0.38 0.38 0.00 0.38 0.00 0.00 0.53 0.00 0.38 0.00 0.38 0.00 0.00 0.00 0.0
3 0.32 0.38 0.00 0.00 0.38 0.00 0.00 0.32 0.00 0.00 0.32 0.00 0.38 0.00 0.00 0.00 0.32 0.00 0.38 0.0
4 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 0.47 0.39 0.00 0.00 0.00 0.39 0.00 0.39 0.00 0.00 0.0
5 0.00 0.00 0.00 0.37 0.00 0.42 0.42 0.00 0.42 0.00 0.00 0.00 0.00 0.42 0.00 0.42 0.00 0.00 0.00 0.0
6 0.00 0.00 0.36 0.32 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 0.00 0.5
7 0.00 0.00 0.00 0.00 0.00 0.45 0.45 0.00 0.45 0.00 0.00 0.00 0.00 0.45 0.00 0.45 0.00 0.00 0.00 0.0
[44]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single step
tv = TfidfVectorizer(min_df=0.0, max_df=1.0, norm="l2", use_idf=True, smooth_idf=True)
tv_matrix = tv.fit_transform(corpus.Normalized_Document.tolist())
tv_matrix = tv_matrix.toarray()

vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
[44]:
bacon beans beautiful blue breakfast brown dog eggs fox green ham jumps kings lazy love quick sausages sky toast today
0 0.00 0.00 0.60 0.53 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 0.00 0.0
1 0.00 0.00 0.49 0.43 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.57 0.00 0.00 0.49 0.00 0.0
2 0.00 0.00 0.00 0.00 0.00 0.38 0.38 0.00 0.38 0.00 0.00 0.53 0.00 0.38 0.00 0.38 0.00 0.00 0.00 0.0
3 0.32 0.38 0.00 0.00 0.38 0.00 0.00 0.32 0.00 0.00 0.32 0.00 0.38 0.00 0.00 0.00 0.32 0.00 0.38 0.0
4 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 0.47 0.39 0.00 0.00 0.00 0.39 0.00 0.39 0.00 0.00 0.0
5 0.00 0.00 0.00 0.37 0.00 0.42 0.42 0.00 0.42 0.00 0.00 0.00 0.00 0.42 0.00 0.42 0.00 0.00 0.00 0.0
6 0.00 0.00 0.36 0.32 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 0.00 0.5
7 0.00 0.00 0.00 0.00 0.00 0.45 0.45 0.00 0.45 0.00 0.00 0.00 0.00 0.45 0.00 0.45 0.00 0.00 0.00 0.0
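
Note that with smooth_idf=True sklearn uses a slightly different variant, idf(w) = 1 + log((1+N)/(1+df(w))). A minimal check against the fitted vectorizer:

N, df_sky = 8, 3  # 'sky' appears in 3 of the 8 documents
print(1 + np.log((1 + N) / (1 + df_sky)))                 # ~1.8109
print(dict(zip(tv.get_feature_names(), tv.idf_))['sky'])  # same value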

Document similarity

[45]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df
[45]:
0 1 2 3 4 5 6 7
0 1.000000 0.820599 0.000000 0.000000 0.000000 0.192353 0.817246 0.000000
1 0.820599 1.000000 0.000000 0.000000 0.225489 0.157845 0.670631 0.000000
2 0.000000 0.000000 1.000000 0.000000 0.000000 0.791821 0.000000 0.850516
3 0.000000 0.000000 0.000000 1.000000 0.506866 0.000000 0.000000 0.000000
4 0.000000 0.225489 0.000000 0.506866 1.000000 0.000000 0.000000 0.000000
5 0.192353 0.157845 0.791821 0.000000 0.000000 1.000000 0.115488 0.930989
6 0.817246 0.670631 0.000000 0.000000 0.000000 0.115488 1.000000 0.000000
7 0.000000 0.000000 0.850516 0.000000 0.000000 0.930989 0.000000 1.000000
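
Because the tf-idf rows are already L2-normalized, cosine similarity reduces to a plain dot product; a quick equivalence check:

np.allclose(similarity_matrix, tv_matrix @ tv_matrix.T)  # True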
[46]:
from scipy.cluster.hierarchy import dendrogram, linkage

# linkage treats each row of the similarity matrix as a feature vector
# (an observation), not as precomputed distances
Z = linkage(similarity_matrix, "ward")
pd.DataFrame(
    Z,
    columns=["Document/Cluster 1", "Document/Cluster 2", "Distance", "Cluster Size"],
    dtype="object",
)
[46]:
Document/Cluster 1 Document/Cluster 2 Distance Cluster Size
0 2 7 0.253098 2
1 0 6 0.308539 2
2 5 8 0.386952 3
3 1 9 0.489845 3
4 3 4 0.732945 2
5 11 12 2.69565 5
6 10 13 3.45108 8
[48]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 3))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Data point')
plt.ylabel('Distance')
dendrogram(Z)
plt.axhline(y=1.0, c='k', ls='--', lw=0.5)
[48]:
<matplotlib.lines.Line2D at 0x12dfb1748>
[Figure: hierarchical clustering dendrogram with a dashed cut line at distance 1.0]
[50]:
from scipy.cluster.hierarchy import fcluster

max_dist = 1.0
cluster_labels = fcluster(Z, max_dist, criterion='distance')
cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel'])
pd.concat([corpus, cluster_labels], axis=1)
[50]:
Document Labels Normalized_Document ClusterLabel
0 The sky is blue and beautiful. weather sky blue beautiful 2
1 Love this blue and beautiful sky! weather love blue beautiful sky 2
2 The quick brown fox jumps over the lazy dog. animals quick brown fox jumps lazy dog 1
3 A king's breakfast has sausages, ham, bacon, e... food kings breakfast sausages ham bacon eggs toast ... 3
4 I love green eggs, ham, sausages and bacon! food love green eggs ham sausages bacon 3
5 The brown fox is quick and the blue dog is lazy! animals brown fox quick blue dog lazy 1
6 The sky is very blue and the sky is very beaut... weather sky blue sky beautiful today 2
7 The dog is lazy but the brown fox is quick! animals dog lazy brown fox quick 1
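
A contingency table makes it easy to compare the distance-1.0 cut against the hand-assigned labels (a minimal sketch):

pd.crosstab(corpus.Labels, cluster_labels.ClusterLabel)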

Topic Modeling — Latent Dirichlet Allocation

[52]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=3, max_iter=10000, random_state=0)

# Document-topic matrix: each row is a distribution over the 3 topics
dt_matrix = lda.fit_transform(cv_matrix)
features = pd.DataFrame(dt_matrix, columns=['T1', 'T2', 'T3'])
features
[52]:
T1 T2 T3
0 0.832191 0.083480 0.084329
1 0.863554 0.069100 0.067346
2 0.047794 0.047776 0.904430
3 0.037243 0.925559 0.037198
4 0.049121 0.903076 0.047802
5 0.054902 0.047778 0.897321
6 0.888287 0.055697 0.056016
7 0.055704 0.055689 0.888607
[53]:
# Topic-term matrix: one weight per vocabulary term for each topic
tt_matrix = lda.components_
for topic_weights in tt_matrix:
    topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
    topic = sorted(topic, key=lambda x: -x[1])
    topic = [item for item in topic if item[1] > 0.6]  # keep salient terms only
    print(topic)
    print()
[('sky', 4.332439442470133), ('blue', 3.373774254787669), ('beautiful', 3.3323650509884386), ('today', 1.3325579855138987), ('love', 1.330415818217548)]

[('bacon', 2.33269586574902), ('eggs', 2.33269586574902), ('ham', 2.33269586574902), ('sausages', 2.33269586574902), ('love', 1.3354610533796558), ('beans', 1.3327735190105536), ('breakfast', 1.3327735190105536), ('kings', 1.3327735190105536), ('toast', 1.3327735190105536), ('green', 1.3325431515674175)]

[('brown', 3.3323473548404405), ('dog', 3.3323473548404405), ('fox', 3.3323473548404405), ('lazy', 3.3323473548404405), ('quick', 3.3323473548404405), ('jumps', 1.3324193772908193), ('blue', 1.2919423137963386)]
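
Finally, each document can be assigned its dominant topic by taking the argmax over the rows of the document-topic matrix (a minimal sketch; DominantTopic is an illustrative column name):

corpus['DominantTopic'] = dt_matrix.argmax(axis=1) + 1  # 1-based, matching T1..T3
corpus[['Normalized_Document', 'Labels', 'DominantTopic']]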