{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Esquemas de representación\n",
"\n",
"* *30 min* | Última modificación: Sept 22, 2020"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparación de los datos"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing documents.txt\n"
]
}
],
"source": [
"%%writefile documents.txt\n",
"The sky is blue and beautiful.\n",
"Love this blue and beautiful sky!\n",
"The quick brown fox jumps over the lazy dog.\n",
"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\n",
"I love green eggs, ham, sausages and bacon!\n",
"The brown fox is quick and the blue dog is lazy!\n",
"The sky is very blue and the sky is very beautiful today\n",
"The dog is lazy but the brown fox is quick!"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['The sky is blue and beautiful.',\n",
" 'Love this blue and beautiful sky!',\n",
" 'The quick brown fox jumps over the lazy dog.',\n",
" \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n",
" 'I love green eggs, ham, sausages and bacon!',\n",
" 'The brown fox is quick and the blue dog is lazy!',\n",
" 'The sky is very blue and the sky is very beautiful today',\n",
" 'The dog is lazy but the brown fox is quick!']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the corpus: every line of documents.txt is one document.\n",
"# Newline characters are removed from each line.\n",
"with open('documents.txt', mode='r') as f:\n",
"    documents = [line.replace('\\n', '') for line in f.readlines()]\n",
"\n",
"# Display the resulting list of document strings.\n",
"documents"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting labels.txt\n"
]
}
],
"source": [
"%%writefile labels.txt\n",
"weather\n",
"weather\n",
"animals\n",
"food\n",
"food\n",
"animals\n",
"weather\n",
"animals"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['weather',\n",
" 'weather',\n",
" 'animals',\n",
" 'food',\n",
" 'food',\n",
" 'animals',\n",
" 'weather',\n",
" 'animals']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the labels: every line of labels.txt is the label of one document.\n",
"# Newline characters are removed from each line.\n",
"with open('labels.txt', mode='r') as f:\n",
"    labels = [line.replace('\\n', '') for line in f.readlines()]\n",
"\n",
"# Display the resulting list of label strings.\n",
"labels"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Document \n",
" Labels \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" The sky is blue and beautiful. \n",
" weather \n",
" \n",
" \n",
" 1 \n",
" Love this blue and beautiful sky! \n",
" weather \n",
" \n",
" \n",
" 2 \n",
" The quick brown fox jumps over the lazy dog. \n",
" animals \n",
" \n",
" \n",
" 3 \n",
" A king's breakfast has sausages, ham, bacon, e... \n",
" food \n",
" \n",
" \n",
" 4 \n",
" I love green eggs, ham, sausages and bacon! \n",
" food \n",
" \n",
" \n",
" 5 \n",
" The brown fox is quick and the blue dog is lazy! \n",
" animals \n",
" \n",
" \n",
" 6 \n",
" The sky is very blue and the sky is very beaut... \n",
" weather \n",
" \n",
" \n",
" 7 \n",
" The dog is lazy but the brown fox is quick! \n",
" animals \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Document Labels\n",
"0 The sky is blue and beautiful. weather\n",
"1 Love this blue and beautiful sky! weather\n",
"2 The quick brown fox jumps over the lazy dog. animals\n",
"3 A king's breakfast has sausages, ham, bacon, e... food\n",
"4 I love green eggs, ham, sausages and bacon! food\n",
"5 The brown fox is quick and the blue dog is lazy! animals\n",
"6 The sky is very blue and the sky is very beaut... weather\n",
"7 The dog is lazy but the brown fox is quick! animals"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pandas is imported here because this is the first cell that uses `pd`;\n",
"# without it the notebook fails with NameError on a fresh kernel.\n",
"import pandas as pd\n",
"\n",
"# One row per document, paired with its label.\n",
"corpus = pd.DataFrame({'Document': documents, 'Labels': labels})\n",
"corpus"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preprocesamiento"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Document \n",
" Labels \n",
" Normalized_Document \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" The sky is blue and beautiful. \n",
" weather \n",
" sky blue beautiful \n",
" \n",
" \n",
" 1 \n",
" Love this blue and beautiful sky! \n",
" weather \n",
" love blue beautiful sky \n",
" \n",
" \n",
" 2 \n",
" The quick brown fox jumps over the lazy dog. \n",
" animals \n",
" quick brown fox jumps lazy dog \n",
" \n",
" \n",
" 3 \n",
" A king's breakfast has sausages, ham, bacon, e... \n",
" food \n",
" kings breakfast sausages ham bacon eggs toast ... \n",
" \n",
" \n",
" 4 \n",
" I love green eggs, ham, sausages and bacon! \n",
" food \n",
" love green eggs ham sausages bacon \n",
" \n",
" \n",
" 5 \n",
" The brown fox is quick and the blue dog is lazy! \n",
" animals \n",
" brown fox quick blue dog lazy \n",
" \n",
" \n",
" 6 \n",
" The sky is very blue and the sky is very beaut... \n",
" weather \n",
" sky blue sky beautiful today \n",
" \n",
" \n",
" 7 \n",
" The dog is lazy but the brown fox is quick! \n",
" animals \n",
" dog lazy brown fox quick \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Document Labels \\\n",
"0 The sky is blue and beautiful. weather \n",
"1 Love this blue and beautiful sky! weather \n",
"2 The quick brown fox jumps over the lazy dog. animals \n",
"3 A king's breakfast has sausages, ham, bacon, e... food \n",
"4 I love green eggs, ham, sausages and bacon! food \n",
"5 The brown fox is quick and the blue dog is lazy! animals \n",
"6 The sky is very blue and the sky is very beaut... weather \n",
"7 The dog is lazy but the brown fox is quick! animals \n",
"\n",
" Normalized_Document \n",
"0 sky blue beautiful \n",
"1 love blue beautiful sky \n",
"2 quick brown fox jumps lazy dog \n",
"3 kings breakfast sausages ham bacon eggs toast ... \n",
"4 love green eggs ham sausages bacon \n",
"5 brown fox quick blue dog lazy \n",
"6 sky blue sky beautiful today \n",
"7 dog lazy brown fox quick "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"import re\n",
"\n",
"# NLTK word/punctuation tokenizer used by normalize_document.\n",
"tokenizer = nltk.WordPunctTokenizer()\n",
"\n",
"# English stop-word list from NLTK (the 'stopwords' corpus must be installed).\n",
"STOPWORDS = nltk.corpus.stopwords.words('english')\n",
"\n",
"\n",
"def normalize_document(document):\n",
"    \"\"\"Return a cleaned, lower-cased, stop-word-free version of ``document``.\n",
"\n",
"    Steps: remove every character that is not an ASCII letter or whitespace,\n",
"    lower-case and strip the text, tokenize it, drop the tokens found in\n",
"    ``STOPWORDS`` and join the remaining tokens with single spaces.\n",
"\n",
"    Args:\n",
"        document: Raw text of one document (str).\n",
"\n",
"    Returns:\n",
"        The normalized document as one space-separated string.\n",
"    \"\"\"\n",
"    # The flags must be passed by keyword: the fourth positional argument of\n",
"    # re.sub is `count`, so passing them positionally would cap the number of\n",
"    # substitutions instead of setting the flags.\n",
"    document = re.sub(r'[^a-zA-Z\\s]', '', document, flags=re.I | re.A)\n",
"    document = document.lower().strip()\n",
"    tokens = tokenizer.tokenize(document)\n",
"    tokens = [token for token in tokens if token not in STOPWORDS]\n",
"    return ' '.join(tokens)\n",
"\n",
"\n",
"# Add the normalized text as a new column and display the corpus.\n",
"corpus['Normalized_Document'] = corpus.Document.map(normalize_document)\n",
"corpus"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bag of Words (BoW) model"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" (0, 17)\t1\n",
" (0, 3)\t1\n",
" (0, 2)\t1\n",
" (1, 17)\t1\n",
" (1, 3)\t1\n",
" (1, 2)\t1\n",
" (1, 14)\t1\n",
" (2, 15)\t1\n",
" (2, 5)\t1\n",
" (2, 8)\t1\n",
" (2, 11)\t1\n",
" (2, 13)\t1\n",
" (2, 6)\t1\n",
" (3, 12)\t1\n",
" (3, 4)\t1\n",
" (3, 16)\t1\n",
" (3, 10)\t1\n",
" (3, 0)\t1\n",
" (3, 7)\t1\n",
" (3, 18)\t1\n",
" (3, 1)\t1\n",
" (4, 14)\t1\n",
" (4, 16)\t1\n",
" (4, 10)\t1\n",
" (4, 0)\t1\n",
" (4, 7)\t1\n",
" (4, 9)\t1\n",
" (5, 3)\t1\n",
" (5, 15)\t1\n",
" (5, 5)\t1\n",
" (5, 8)\t1\n",
" (5, 13)\t1\n",
" (5, 6)\t1\n",
" (6, 17)\t2\n",
" (6, 3)\t1\n",
" (6, 2)\t1\n",
" (6, 19)\t1\n",
" (7, 15)\t1\n",
" (7, 5)\t1\n",
" (7, 8)\t1\n",
" (7, 13)\t1\n",
" (7, 6)\t1\n"
]
}
],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Bag-of-words vectorizer. The commented-out keyword arguments are kept for\n",
"# reference; only max_df and min_df are passed explicitly.\n",
"cv = CountVectorizer(\n",
"    # encoding='utf-8',                     # text encoding\n",
"    # decode_error='strict',\n",
"    # strip_accents=None,                   # accent removal\n",
"    # lowercase=True,\n",
"    # preprocessor=None,\n",
"    # tokenizer=None,\n",
"    # stop_words=None,\n",
"    # token_pattern=r'(?u)\\b\\w\\w+\\b',\n",
"    # ngram_range=(1, 1),\n",
"    # analyzer='word',\n",
"    max_df=1.0,\n",
"    min_df=1,\n",
"    # max_features=None,\n",
"    # vocabulary=None,\n",
"    # binary=False\n",
")\n",
"\n",
"# Learn the vocabulary and build the document-term count matrix.\n",
"bow = cv.fit_transform(corpus['Normalized_Document'].tolist())\n",
"\n",
"# The result is a sparse matrix; printing it lists its non-zero entries.\n",
"print(bow)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],\n",
" [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],\n",
" [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],\n",
" [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],\n",
" [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],\n",
" [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],\n",
" [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1],\n",
" [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Convert the sparse count matrix to a dense array. The guard keeps the cell\n",
"# idempotent: after the first run `bow` is already an array and no longer has\n",
"# a `toarray` method, so re-running the cell would otherwise raise.\n",
"if hasattr(bow, 'toarray'):\n",
"    bow = bow.toarray()\n",
"bow"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" bacon \n",
" beans \n",
" beautiful \n",
" blue \n",
" breakfast \n",
" brown \n",
" dog \n",
" eggs \n",
" fox \n",
" green \n",
" ham \n",
" jumps \n",
" kings \n",
" lazy \n",
" love \n",
" quick \n",
" sausages \n",
" sky \n",
" toast \n",
" today \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" 4 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 5 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" 6 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 2 \n",
" 0 \n",
" 1 \n",
" \n",
" \n",
" 7 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bacon beans beautiful blue breakfast brown dog eggs fox green \\\n",
"0 0 0 1 1 0 0 0 0 0 0 \n",
"1 0 0 1 1 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 1 1 0 1 0 \n",
"3 1 1 0 0 1 0 0 1 0 0 \n",
"4 1 0 0 0 0 0 0 1 0 1 \n",
"5 0 0 0 1 0 1 1 0 1 0 \n",
"6 0 0 1 1 0 0 0 0 0 0 \n",
"7 0 0 0 0 0 1 1 0 1 0 \n",
"\n",
" ham jumps kings lazy love quick sausages sky toast today \n",
"0 0 0 0 0 0 0 0 1 0 0 \n",
"1 0 0 0 0 1 0 0 1 0 0 \n",
"2 0 1 0 1 0 1 0 0 0 0 \n",
"3 1 0 1 0 0 0 1 0 1 0 \n",
"4 1 0 0 0 1 0 1 0 0 0 \n",
"5 0 0 0 1 0 1 0 0 0 0 \n",
"6 0 0 0 0 0 0 0 2 0 1 \n",
"7 0 0 0 1 0 1 0 0 0 0 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the count matrix as a DataFrame with one column per vocabulary term.\n",
"# get_feature_names_out() replaces get_feature_names(), which was removed in\n",
"# scikit-learn 1.2.\n",
"pd.DataFrame(bow, columns=cv.get_feature_names_out())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Bag of N-Grams model"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 2 \n",
" 3 \n",
" 4 \n",
" 5 \n",
" 6 \n",
" 7 \n",
" \n",
" \n",
" \n",
" \n",
" bacon eggs \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" beautiful sky \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" beautiful today \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" blue beautiful \n",
" 1 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" blue dog \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" blue sky \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" breakfast sausages \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" brown fox \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" \n",
" \n",
" dog lazy \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" \n",
" \n",
" eggs ham \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" eggs toast \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" fox jumps \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" fox quick \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 1 \n",
" \n",
" \n",
" green eggs \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" ham bacon \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" ham sausages \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" jumps lazy \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" kings breakfast \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" lazy brown \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" \n",
" \n",
" lazy dog \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" love blue \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" love green \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" quick blue \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" quick brown \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" sausages bacon \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" sausages ham \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
" sky beautiful \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" sky blue \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" \n",
" \n",
" toast beans \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 1 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" 0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7\n",
"bacon eggs 0 0 0 1 0 0 0 0\n",
"beautiful sky 0 1 0 0 0 0 0 0\n",
"beautiful today 0 0 0 0 0 0 1 0\n",
"blue beautiful 1 1 0 0 0 0 0 0\n",
"blue dog 0 0 0 0 0 1 0 0\n",
"blue sky 0 0 0 0 0 0 1 0\n",
"breakfast sausages 0 0 0 1 0 0 0 0\n",
"brown fox 0 0 1 0 0 1 0 1\n",
"dog lazy 0 0 0 0 0 1 0 1\n",
"eggs ham 0 0 0 0 1 0 0 0\n",
"eggs toast 0 0 0 1 0 0 0 0\n",
"fox jumps 0 0 1 0 0 0 0 0\n",
"fox quick 0 0 0 0 0 1 0 1\n",
"green eggs 0 0 0 0 1 0 0 0\n",
"ham bacon 0 0 0 1 0 0 0 0\n",
"ham sausages 0 0 0 0 1 0 0 0\n",
"jumps lazy 0 0 1 0 0 0 0 0\n",
"kings breakfast 0 0 0 1 0 0 0 0\n",
"lazy brown 0 0 0 0 0 0 0 1\n",
"lazy dog 0 0 1 0 0 0 0 0\n",
"love blue 0 1 0 0 0 0 0 0\n",
"love green 0 0 0 0 1 0 0 0\n",
"quick blue 0 0 0 0 0 1 0 0\n",
"quick brown 0 0 1 0 0 0 0 0\n",
"sausages bacon 0 0 0 0 1 0 0 0\n",
"sausages ham 0 0 0 1 0 0 0 0\n",
"sky beautiful 0 0 0 0 0 0 1 0\n",
"sky blue 1 0 0 0 0 0 1 0\n",
"toast beans 0 0 0 1 0 0 0 0"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bag-of-n-grams vectorizer: ngram_range=(2, 2) keeps bigrams only. The\n",
"# commented-out keyword arguments are kept for reference.\n",
"cv = CountVectorizer(\n",
"    # encoding='utf-8',                     # text encoding\n",
"    # decode_error='strict',\n",
"    # strip_accents=None,                   # accent removal\n",
"    # lowercase=True,\n",
"    # preprocessor=None,\n",
"    # tokenizer=None,\n",
"    # stop_words=None,\n",
"    # token_pattern=r'(?u)\\b\\w\\w+\\b',\n",
"    ngram_range=(2, 2),\n",
"    # analyzer='word',\n",
"    max_df=1.0,\n",
"    min_df=1,\n",
"    # max_features=None,\n",
"    # vocabulary=None,\n",
"    # binary=False\n",
")\n",
"\n",
"bon = cv.fit_transform(corpus.Normalized_Document.tolist())\n",
"\n",
"# Transposed for display: one row per bigram, one column per document.\n",
"# get_feature_names_out() replaces get_feature_names(), which was removed in\n",
"# scikit-learn 1.2.\n",
"pd.DataFrame(bon.toarray(), columns=cv.get_feature_names_out()).T\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## TF-IDF model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$$tfidf= tf \\times idf$$\n",
"\n",
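"* tf: term frequency, número de veces que el término $w$ aparece en el documento\n",
"* idf: inverse document frequency; con $N$ documentos en el corpus y $df(w)$ documentos que contienen $w$, scikit-learn (con `smooth_idf=True`, el valor usado en este notebook) calcula:\n",
"\n",
"$$idf(w) = 1+\\log\\frac{1+N}{1+df(w)}$$"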
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" bacon \n",
" beans \n",
" beautiful \n",
" blue \n",
" breakfast \n",
" brown \n",
" dog \n",
" eggs \n",
" fox \n",
" green \n",
" ham \n",
" jumps \n",
" kings \n",
" lazy \n",
" love \n",
" quick \n",
" sausages \n",
" sky \n",
" toast \n",
" today \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.00 \n",
" 0.00 \n",
" 0.60 \n",
" 0.53 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.60 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 0.00 \n",
" 0.00 \n",
" 0.49 \n",
" 0.43 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.57 \n",
" 0.00 \n",
" 0.00 \n",
" 0.49 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.38 \n",
" 0.38 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.53 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 0.32 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.32 \n",
" 0.00 \n",
" 0.00 \n",
" 0.32 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.32 \n",
" 0.00 \n",
" 0.38 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 0.39 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.39 \n",
" 0.00 \n",
" 0.47 \n",
" 0.39 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.39 \n",
" 0.00 \n",
" 0.39 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 5 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.37 \n",
" 0.00 \n",
" 0.42 \n",
" 0.42 \n",
" 0.00 \n",
" 0.42 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.42 \n",
" 0.00 \n",
" 0.42 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 6 \n",
" 0.00 \n",
" 0.00 \n",
" 0.36 \n",
" 0.32 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.72 \n",
" 0.00 \n",
" 0.5 \n",
" \n",
" \n",
" 7 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.45 \n",
" 0.45 \n",
" 0.00 \n",
" 0.45 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.45 \n",
" 0.00 \n",
" 0.45 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bacon beans beautiful blue breakfast brown dog eggs fox green \\\n",
"0 0.00 0.00 0.60 0.53 0.00 0.00 0.00 0.00 0.00 0.00 \n",
"1 0.00 0.00 0.49 0.43 0.00 0.00 0.00 0.00 0.00 0.00 \n",
"2 0.00 0.00 0.00 0.00 0.00 0.38 0.38 0.00 0.38 0.00 \n",
"3 0.32 0.38 0.00 0.00 0.38 0.00 0.00 0.32 0.00 0.00 \n",
"4 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 0.47 \n",
"5 0.00 0.00 0.00 0.37 0.00 0.42 0.42 0.00 0.42 0.00 \n",
"6 0.00 0.00 0.36 0.32 0.00 0.00 0.00 0.00 0.00 0.00 \n",
"7 0.00 0.00 0.00 0.00 0.00 0.45 0.45 0.00 0.45 0.00 \n",
"\n",
" ham jumps kings lazy love quick sausages sky toast today \n",
"0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 0.00 0.0 \n",
"1 0.00 0.00 0.00 0.00 0.57 0.00 0.00 0.49 0.00 0.0 \n",
"2 0.00 0.53 0.00 0.38 0.00 0.38 0.00 0.00 0.00 0.0 \n",
"3 0.32 0.00 0.38 0.00 0.00 0.00 0.32 0.00 0.38 0.0 \n",
"4 0.39 0.00 0.00 0.00 0.39 0.00 0.39 0.00 0.00 0.0 \n",
"5 0.00 0.00 0.00 0.42 0.00 0.42 0.00 0.00 0.00 0.0 \n",
"6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 0.00 0.5 \n",
"7 0.00 0.00 0.00 0.45 0.00 0.45 0.00 0.00 0.00 0.0 "
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"\n",
"# Term counts for the normalized corpus, converted to a dense array.\n",
"cv = CountVectorizer(min_df=0., max_df=1.)\n",
"cv_matrix = cv.fit_transform(corpus.Normalized_Document.tolist())\n",
"cv_matrix = cv_matrix.toarray()\n",
"\n",
"# Re-weight the counts with TF-IDF; norm='l2' normalizes each document row.\n",
"tt = TfidfTransformer(norm='l2', use_idf=True)\n",
"tt_matrix = tt.fit_transform(cv_matrix)\n",
"tt_matrix = tt_matrix.toarray()\n",
"\n",
"# get_feature_names_out() replaces get_feature_names(), which was removed in\n",
"# scikit-learn 1.2; list() keeps `vocab` a plain list as before.\n",
"vocab = list(cv.get_feature_names_out())\n",
"pd.DataFrame(np.round(tt_matrix, 2), columns=vocab)\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" bacon \n",
" beans \n",
" beautiful \n",
" blue \n",
" breakfast \n",
" brown \n",
" dog \n",
" eggs \n",
" fox \n",
" green \n",
" ham \n",
" jumps \n",
" kings \n",
" lazy \n",
" love \n",
" quick \n",
" sausages \n",
" sky \n",
" toast \n",
" today \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.00 \n",
" 0.00 \n",
" 0.60 \n",
" 0.53 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.60 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 1 \n",
" 0.00 \n",
" 0.00 \n",
" 0.49 \n",
" 0.43 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.57 \n",
" 0.00 \n",
" 0.00 \n",
" 0.49 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 2 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.38 \n",
" 0.38 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.53 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 3 \n",
" 0.32 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.32 \n",
" 0.00 \n",
" 0.00 \n",
" 0.32 \n",
" 0.00 \n",
" 0.38 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.32 \n",
" 0.00 \n",
" 0.38 \n",
" 0.0 \n",
" \n",
" \n",
" 4 \n",
" 0.39 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.39 \n",
" 0.00 \n",
" 0.47 \n",
" 0.39 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.39 \n",
" 0.00 \n",
" 0.39 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 5 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.37 \n",
" 0.00 \n",
" 0.42 \n",
" 0.42 \n",
" 0.00 \n",
" 0.42 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.42 \n",
" 0.00 \n",
" 0.42 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
" 6 \n",
" 0.00 \n",
" 0.00 \n",
" 0.36 \n",
" 0.32 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.72 \n",
" 0.00 \n",
" 0.5 \n",
" \n",
" \n",
" 7 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.45 \n",
" 0.45 \n",
" 0.00 \n",
" 0.45 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.45 \n",
" 0.00 \n",
" 0.45 \n",
" 0.00 \n",
" 0.00 \n",
" 0.00 \n",
" 0.0 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" bacon beans beautiful blue breakfast brown dog eggs fox green \\\n",
"0 0.00 0.00 0.60 0.53 0.00 0.00 0.00 0.00 0.00 0.00 \n",
"1 0.00 0.00 0.49 0.43 0.00 0.00 0.00 0.00 0.00 0.00 \n",
"2 0.00 0.00 0.00 0.00 0.00 0.38 0.38 0.00 0.38 0.00 \n",
"3 0.32 0.38 0.00 0.00 0.38 0.00 0.00 0.32 0.00 0.00 \n",
"4 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 0.47 \n",
"5 0.00 0.00 0.00 0.37 0.00 0.42 0.42 0.00 0.42 0.00 \n",
"6 0.00 0.00 0.36 0.32 0.00 0.00 0.00 0.00 0.00 0.00 \n",
"7 0.00 0.00 0.00 0.00 0.00 0.45 0.45 0.00 0.45 0.00 \n",
"\n",
" ham jumps kings lazy love quick sausages sky toast today \n",
"0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 0.00 0.0 \n",
"1 0.00 0.00 0.00 0.00 0.57 0.00 0.00 0.49 0.00 0.0 \n",
"2 0.00 0.53 0.00 0.38 0.00 0.38 0.00 0.00 0.00 0.0 \n",
"3 0.32 0.00 0.38 0.00 0.00 0.00 0.32 0.00 0.38 0.0 \n",
"4 0.39 0.00 0.00 0.00 0.39 0.00 0.39 0.00 0.00 0.0 \n",
"5 0.00 0.00 0.00 0.42 0.00 0.42 0.00 0.00 0.00 0.0 \n",
"6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 0.00 0.5 \n",
"7 0.00 0.00 0.00 0.45 0.00 0.45 0.00 0.00 0.00 0.0 "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"\n",
"# TfidfVectorizer does counting and TF-IDF weighting in a single step.\n",
"tv = TfidfVectorizer(min_df=0.0, max_df=1.0, norm=\"l2\", use_idf=True, smooth_idf=True)\n",
"tv_matrix = tv.fit_transform(corpus.Normalized_Document.tolist())\n",
"tv_matrix = tv_matrix.toarray()\n",
"\n",
"# get_feature_names_out() replaces get_feature_names(), which was removed in\n",
"# scikit-learn 1.2; list() keeps `vocab` a plain list as before.\n",
"vocab = list(tv.get_feature_names_out())\n",
"pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Similitud de documentos"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 2 \n",
" 3 \n",
" 4 \n",
" 5 \n",
" 6 \n",
" 7 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1.000000 \n",
" 0.820599 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.192353 \n",
" 0.817246 \n",
" 0.000000 \n",
" \n",
" \n",
" 1 \n",
" 0.820599 \n",
" 1.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.225489 \n",
" 0.157845 \n",
" 0.670631 \n",
" 0.000000 \n",
" \n",
" \n",
" 2 \n",
" 0.000000 \n",
" 0.000000 \n",
" 1.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.791821 \n",
" 0.000000 \n",
" 0.850516 \n",
" \n",
" \n",
" 3 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 1.000000 \n",
" 0.506866 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 4 \n",
" 0.000000 \n",
" 0.225489 \n",
" 0.000000 \n",
" 0.506866 \n",
" 1.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 5 \n",
" 0.192353 \n",
" 0.157845 \n",
" 0.791821 \n",
" 0.000000 \n",
" 0.000000 \n",
" 1.000000 \n",
" 0.115488 \n",
" 0.930989 \n",
" \n",
" \n",
" 6 \n",
" 0.817246 \n",
" 0.670631 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.115488 \n",
" 1.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 7 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.850516 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.930989 \n",
" 0.000000 \n",
" 1.000000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 1.000000 0.820599 0.000000 0.000000 0.000000 0.192353 0.817246 \n",
"1 0.820599 1.000000 0.000000 0.000000 0.225489 0.157845 0.670631 \n",
"2 0.000000 0.000000 1.000000 0.000000 0.000000 0.791821 0.000000 \n",
"3 0.000000 0.000000 0.000000 1.000000 0.506866 0.000000 0.000000 \n",
"4 0.000000 0.225489 0.000000 0.506866 1.000000 0.000000 0.000000 \n",
"5 0.192353 0.157845 0.791821 0.000000 0.000000 1.000000 0.115488 \n",
"6 0.817246 0.670631 0.000000 0.000000 0.000000 0.115488 1.000000 \n",
"7 0.000000 0.000000 0.850516 0.000000 0.000000 0.930989 0.000000 \n",
"\n",
" 7 \n",
"0 0.000000 \n",
"1 0.000000 \n",
"2 0.850516 \n",
"3 0.000000 \n",
"4 0.000000 \n",
"5 0.930989 \n",
"6 0.000000 \n",
"7 1.000000 "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"# Pairwise cosine similarity between the rows (documents) of the TF-IDF matrix.\n",
"similarity_matrix = cosine_similarity(tv_matrix)\n",
"\n",
"# Wrap the matrix in a DataFrame for display.\n",
"similarity_df = pd.DataFrame(data=similarity_matrix)\n",
"similarity_df"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Document\\Cluster 1 \n",
" Document\\Cluster 2 \n",
" Distance \n",
" Cluster Size \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2 \n",
" 7 \n",
" 0.253098 \n",
" 2 \n",
" \n",
" \n",
" 1 \n",
" 0 \n",
" 6 \n",
" 0.308539 \n",
" 2 \n",
" \n",
" \n",
" 2 \n",
" 5 \n",
" 8 \n",
" 0.386952 \n",
" 3 \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
" 9 \n",
" 0.489845 \n",
" 3 \n",
" \n",
" \n",
" 4 \n",
" 3 \n",
" 4 \n",
" 0.732945 \n",
" 2 \n",
" \n",
" \n",
" 5 \n",
" 11 \n",
" 12 \n",
" 2.69565 \n",
" 5 \n",
" \n",
" \n",
" 6 \n",
" 10 \n",
" 13 \n",
" 3.45108 \n",
" 8 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Document\\Cluster 1 Document\\Cluster 2 Distance Cluster Size\n",
"0 2 7 0.253098 2\n",
"1 0 6 0.308539 2\n",
"2 5 8 0.386952 3\n",
"3 1 9 0.489845 3\n",
"4 3 4 0.732945 2\n",
"5 11 12 2.69565 5\n",
"6 10 13 3.45108 8"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.cluster.hierarchy import dendrogram, linkage\n",
"\n",
"# Hierarchical clustering with Ward linkage, computed on the similarity matrix.\n",
"# NOTE(review): linkage receives the square similarity matrix as a 2-D array\n",
"# of observations, not as a condensed distance matrix - confirm this is intended.\n",
"Z = linkage(similarity_matrix, \"ward\")\n",
"\n",
"# Show the linkage matrix with readable column names.\n",
"pd.DataFrame(\n",
"    Z,\n",
"    # Backslashes are doubled because \"\\C\" is not a valid string escape sequence.\n",
"    columns=[\"Document\\\\Cluster 1\", \"Document\\\\Cluster 2\", \"Distance\", \"Cluster Size\"],\n",
"    dtype=\"object\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Dendrogram of the linkage matrix Z.\n",
"plt.figure(figsize=(8, 3))\n",
"plt.title('Hierarchical Clustering Dendrogram')\n",
"plt.xlabel('Data point')\n",
"plt.ylabel('Distance')\n",
"dendrogram(Z)\n",
"\n",
"# Dashed horizontal reference line at distance 1.0.\n",
"plt.axhline(y=1.0, c='k', ls='--', lw=0.5)\n",
"\n",
"# Render the figure explicitly so the cell does not echo the return value\n",
"# of axhline as a stray text output.\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Document \n",
" Labels \n",
" Normalized_Document \n",
" ClusterLabel \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" The sky is blue and beautiful. \n",
" weather \n",
" sky blue beautiful \n",
" 2 \n",
" \n",
" \n",
" 1 \n",
" Love this blue and beautiful sky! \n",
" weather \n",
" love blue beautiful sky \n",
" 2 \n",
" \n",
" \n",
" 2 \n",
" The quick brown fox jumps over the lazy dog. \n",
" animals \n",
" quick brown fox jumps lazy dog \n",
" 1 \n",
" \n",
" \n",
" 3 \n",
" A king's breakfast has sausages, ham, bacon, e... \n",
" food \n",
" kings breakfast sausages ham bacon eggs toast ... \n",
" 3 \n",
" \n",
" \n",
" 4 \n",
" I love green eggs, ham, sausages and bacon! \n",
" food \n",
" love green eggs ham sausages bacon \n",
" 3 \n",
" \n",
" \n",
" 5 \n",
" The brown fox is quick and the blue dog is lazy! \n",
" animals \n",
" brown fox quick blue dog lazy \n",
" 1 \n",
" \n",
" \n",
" 6 \n",
" The sky is very blue and the sky is very beaut... \n",
" weather \n",
" sky blue sky beautiful today \n",
" 2 \n",
" \n",
" \n",
" 7 \n",
" The dog is lazy but the brown fox is quick! \n",
" animals \n",
" dog lazy brown fox quick \n",
" 1 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Document Labels \\\n",
"0 The sky is blue and beautiful. weather \n",
"1 Love this blue and beautiful sky! weather \n",
"2 The quick brown fox jumps over the lazy dog. animals \n",
"3 A king's breakfast has sausages, ham, bacon, e... food \n",
"4 I love green eggs, ham, sausages and bacon! food \n",
"5 The brown fox is quick and the blue dog is lazy! animals \n",
"6 The sky is very blue and the sky is very beaut... weather \n",
"7 The dog is lazy but the brown fox is quick! animals \n",
"\n",
" Normalized_Document ClusterLabel \n",
"0 sky blue beautiful 2 \n",
"1 love blue beautiful sky 2 \n",
"2 quick brown fox jumps lazy dog 1 \n",
"3 kings breakfast sausages ham bacon eggs toast ... 3 \n",
"4 love green eggs ham sausages bacon 3 \n",
"5 brown fox quick blue dog lazy 1 \n",
"6 sky blue sky beautiful today 2 \n",
"7 dog lazy brown fox quick 1 "
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from scipy.cluster.hierarchy import fcluster\n",
"\n",
"# Threshold handed to fcluster together with criterion='distance'.\n",
"max_dist = 1.0\n",
"\n",
"# One flat cluster id per row of Z's original observations, wrapped in a\n",
"# single-column frame so it can sit next to the corpus columns.\n",
"cluster_labels = pd.DataFrame(\n",
"    {'ClusterLabel': fcluster(Z, t=max_dist, criterion='distance')}\n",
")\n",
"\n",
"# Column-wise concat aligns on the index; `corpus` comes from an earlier cell.\n",
"pd.concat([corpus, cluster_labels], axis='columns')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Topic Modeling --- Latent Dirichlet Allocation"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" T1 \n",
" T2 \n",
" T3 \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 0.832191 \n",
" 0.083480 \n",
" 0.084329 \n",
" \n",
" \n",
" 1 \n",
" 0.863554 \n",
" 0.069100 \n",
" 0.067346 \n",
" \n",
" \n",
" 2 \n",
" 0.047794 \n",
" 0.047776 \n",
" 0.904430 \n",
" \n",
" \n",
" 3 \n",
" 0.037243 \n",
" 0.925559 \n",
" 0.037198 \n",
" \n",
" \n",
" 4 \n",
" 0.049121 \n",
" 0.903076 \n",
" 0.047802 \n",
" \n",
" \n",
" 5 \n",
" 0.054902 \n",
" 0.047778 \n",
" 0.897321 \n",
" \n",
" \n",
" 6 \n",
" 0.888287 \n",
" 0.055697 \n",
" 0.056016 \n",
" \n",
" \n",
" 7 \n",
" 0.055704 \n",
" 0.055689 \n",
" 0.888607 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" T1 T2 T3\n",
"0 0.832191 0.083480 0.084329\n",
"1 0.863554 0.069100 0.067346\n",
"2 0.047794 0.047776 0.904430\n",
"3 0.037243 0.925559 0.037198\n",
"4 0.049121 0.903076 0.047802\n",
"5 0.054902 0.047778 0.897321\n",
"6 0.888287 0.055697 0.056016\n",
"7 0.055704 0.055689 0.888607"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.decomposition import LatentDirichletAllocation\n",
"\n",
"# Number of latent topics. The column labels below are generated from this value,\n",
"# so changing it no longer breaks the DataFrame construction with a shape mismatch.\n",
"n_topics = 3\n",
"\n",
"# random_state is fixed so that the fitted topics are reproducible across runs.\n",
"lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10000, random_state=0)\n",
"\n",
"# Document-topic matrix: one row per row of cv_matrix, one column per topic.\n",
"# cv_matrix is built in an earlier cell (presumably the count-vectorized corpus).\n",
"dt_matrix = lda.fit_transform(cv_matrix)\n",
"\n",
"topic_names = ['T{}'.format(k) for k in range(1, n_topics + 1)]\n",
"features = pd.DataFrame(dt_matrix, columns=topic_names)\n",
"features"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('sky', 4.332439442470133), ('blue', 3.373774254787669), ('beautiful', 3.3323650509884386), ('today', 1.3325579855138987), ('love', 1.330415818217548)]\n",
"\n",
"[('bacon', 2.33269586574902), ('eggs', 2.33269586574902), ('ham', 2.33269586574902), ('sausages', 2.33269586574902), ('love', 1.3354610533796558), ('beans', 1.3327735190105536), ('breakfast', 1.3327735190105536), ('kings', 1.3327735190105536), ('toast', 1.3327735190105536), ('green', 1.3325431515674175)]\n",
"\n",
"[('brown', 3.3323473548404405), ('dog', 3.3323473548404405), ('fox', 3.3323473548404405), ('lazy', 3.3323473548404405), ('quick', 3.3323473548404405), ('jumps', 1.3324193772908193), ('blue', 1.2919423137963386)]\n",
"\n"
]
}
],
"source": [
"# Topic-term weights of the fitted model: one row per topic. The columns are\n",
"# assumed to follow the order of `vocab` (defined in an earlier cell).\n",
"tt_matrix = lda.components_\n",
"min_weight = 0.6  # tokens whose weight is not above this are left out\n",
"\n",
"for topic_weights in tt_matrix:\n",
"    # Heaviest tokens first; the sort is stable, so ties keep their `vocab` order.\n",
"    ranked = sorted(zip(vocab, topic_weights), key=lambda pair: -pair[1])\n",
"    topic = [(token, weight) for token, weight in ranked if weight > min_weight]\n",
"    print(topic)\n",
"    print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}