{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Esquemas de representacion\n", "\n", "* *30 min* | Última modificación: Sept 22, 2020" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparacion de los datos" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Writing documents.txt\n" ] } ], "source": [ "%%writefile documents.txt\n", "The sky is blue and beautiful.\n", "Love this blue and beautiful sky!\n", "The quick brown fox jumps over the lazy dog.\n", "A king's breakfast has sausages, ham, bacon, eggs, toast and beans\n", "I love green eggs, ham, sausages and bacon!\n", "The brown fox is quick and the blue dog is lazy!\n", "The sky is very blue and the sky is very beautiful today\n", "The dog is lazy but the brown fox is quick!" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['The sky is blue and beautiful.',\n", " 'Love this blue and beautiful sky!',\n", " 'The quick brown fox jumps over the lazy dog.',\n", " \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n", " 'I love green eggs, ham, sausages and bacon!',\n", " 'The brown fox is quick and the blue dog is lazy!',\n", " 'The sky is very blue and the sky is very beautiful today',\n", " 'The dog is lazy but the brown fox is quick!']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Lee el corpus\n", "##\n", "with open('documents.txt' , 'r') as f:\n", " documents = f.readlines()\n", " \n", "documents = [t.replace('\\n', '') for t in documents]\n", "documents" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting labels.txt\n" ] } ], "source": [ "%%writefile labels.txt\n", "weather\n", "weather\n", "animals\n", "food\n", "food\n", "animals\n", "weather\n", "animals" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['weather',\n", " 'weather',\n", " 'animals',\n", " 'food',\n", " 'food',\n", " 'animals',\n", " 'weather',\n", " 'animals']" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Lee las etiquetas\n", "##\n", "with open('labels.txt' , 'r') as f:\n", " labels = f.readlines()\n", " \n", "labels = [t.replace('\\n', '') for t in labels]\n", "labels" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentLabels
0The sky is blue and beautiful.weather
1Love this blue and beautiful sky!weather
2The quick brown fox jumps over the lazy dog.animals
3A king's breakfast has sausages, ham, bacon, e...food
4I love green eggs, ham, sausages and bacon!food
5The brown fox is quick and the blue dog is lazy!animals
6The sky is very blue and the sky is very beaut...weather
7The dog is lazy but the brown fox is quick!animals
\n", "
" ], "text/plain": [ " Document Labels\n", "0 The sky is blue and beautiful. weather\n", "1 Love this blue and beautiful sky! weather\n", "2 The quick brown fox jumps over the lazy dog. animals\n", "3 A king's breakfast has sausages, ham, bacon, e... food\n", "4 I love green eggs, ham, sausages and bacon! food\n", "5 The brown fox is quick and the blue dog is lazy! animals\n", "6 The sky is very blue and the sky is very beaut... weather\n", "7 The dog is lazy but the brown fox is quick! animals" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "corpus = pd.DataFrame({'Document': documents, 'Labels': labels})\n", "corpus" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocesamiento" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentLabelsNormalized_Document
0The sky is blue and beautiful.weathersky blue beautiful
1Love this blue and beautiful sky!weatherlove blue beautiful sky
2The quick brown fox jumps over the lazy dog.animalsquick brown fox jumps lazy dog
3A king's breakfast has sausages, ham, bacon, e...foodkings breakfast sausages ham bacon eggs toast ...
4I love green eggs, ham, sausages and bacon!foodlove green eggs ham sausages bacon
5The brown fox is quick and the blue dog is lazy!animalsbrown fox quick blue dog lazy
6The sky is very blue and the sky is very beaut...weathersky blue sky beautiful today
7The dog is lazy but the brown fox is quick!animalsdog lazy brown fox quick
\n", "
" ], "text/plain": [ " Document Labels \\\n", "0 The sky is blue and beautiful. weather \n", "1 Love this blue and beautiful sky! weather \n", "2 The quick brown fox jumps over the lazy dog. animals \n", "3 A king's breakfast has sausages, ham, bacon, e... food \n", "4 I love green eggs, ham, sausages and bacon! food \n", "5 The brown fox is quick and the blue dog is lazy! animals \n", "6 The sky is very blue and the sky is very beaut... weather \n", "7 The dog is lazy but the brown fox is quick! animals \n", "\n", " Normalized_Document \n", "0 sky blue beautiful \n", "1 love blue beautiful sky \n", "2 quick brown fox jumps lazy dog \n", "3 kings breakfast sausages ham bacon eggs toast ... \n", "4 love green eggs ham sausages bacon \n", "5 brown fox quick blue dog lazy \n", "6 sky blue sky beautiful today \n", "7 dog lazy brown fox quick " ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "import re\n", "\n", "tokenizer = nltk.WordPunctTokenizer()\n", "\n", "STOPWORDS = nltk.corpus.stopwords.words('english')\n", "\n", "def normalize_document(document):\n", " document = re.sub(r'[^a-zA-Z\\s]', '', document, re.I|re.A)\n", " document = document.lower().strip()\n", " tokens = tokenizer.tokenize(document)\n", " tokens = [token for token in tokens if token not in STOPWORDS]\n", " document = ' '.join(tokens)\n", " return document\n", " \n", "corpus['Normalized_Document'] = corpus.Document.map(normalize_document)\n", "corpus" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bag of Words (BoW) model" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " (0, 17)\t1\n", " (0, 3)\t1\n", " (0, 2)\t1\n", " (1, 17)\t1\n", " (1, 3)\t1\n", " (1, 2)\t1\n", " (1, 14)\t1\n", " (2, 15)\t1\n", " (2, 5)\t1\n", " (2, 8)\t1\n", " (2, 11)\t1\n", " (2, 13)\t1\n", " (2, 6)\t1\n", " (3, 12)\t1\n", " (3, 4)\t1\n", " (3, 16)\t1\n", " (3, 10)\t1\n", " (3, 0)\t1\n", " (3, 7)\t1\n", " (3, 18)\t1\n", " (3, 1)\t1\n", " (4, 14)\t1\n", " (4, 16)\t1\n", " (4, 10)\t1\n", " (4, 0)\t1\n", " (4, 7)\t1\n", " (4, 9)\t1\n", " (5, 3)\t1\n", " (5, 15)\t1\n", " (5, 5)\t1\n", " (5, 8)\t1\n", " (5, 13)\t1\n", " (5, 6)\t1\n", " (6, 17)\t2\n", " (6, 3)\t1\n", " (6, 2)\t1\n", " (6, 19)\t1\n", " (7, 15)\t1\n", " (7, 5)\t1\n", " (7, 8)\t1\n", " (7, 13)\t1\n", " (7, 6)\t1\n" ] } ], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "cv = CountVectorizer(\n", " # encoding='utf-8', # codificacion del texto\n", " # decode_error='strict', #\n", " # strip_accents=None, # remocion de acentos\n", " # lowercase=True, #\n", " # preprocessor=None, #\n", " # tokenizer=None, #\n", " # stop_words=None, #\n", " # token_pattern='(?u)\\b\\w\\w+\\b', #\n", " # ngram_range=(1, 1), #\n", " # analyzer='word', #\n", " max_df=1.0, #\n", " min_df=1, #\n", " # max_features=None, #\n", " # vocabulary=None, #\n", " # binary=False #\n", ")\n", "\n", "bow = cv.fit_transform(corpus.Normalized_Document.tolist())\n", "\n", "## sparse matrix\n", "print(bow)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],\n", " [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],\n", " [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0],\n", " [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0],\n", " [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0],\n", " [0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],\n", " [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1],\n", " [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bow = bow.toarray()\n", "bow" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
baconbeansbeautifulbluebreakfastbrowndogeggsfoxgreenhamjumpskingslazylovequicksausagesskytoasttoday
000110000000000000100
100110000000000100100
200000110100101010000
311001001001010001010
410000001011000101000
500010110100001010000
600110000000000000201
700000110100001010000
\n", "
" ], "text/plain": [ " bacon beans beautiful blue breakfast brown dog eggs fox green \\\n", "0 0 0 1 1 0 0 0 0 0 0 \n", "1 0 0 1 1 0 0 0 0 0 0 \n", "2 0 0 0 0 0 1 1 0 1 0 \n", "3 1 1 0 0 1 0 0 1 0 0 \n", "4 1 0 0 0 0 0 0 1 0 1 \n", "5 0 0 0 1 0 1 1 0 1 0 \n", "6 0 0 1 1 0 0 0 0 0 0 \n", "7 0 0 0 0 0 1 1 0 1 0 \n", "\n", " ham jumps kings lazy love quick sausages sky toast today \n", "0 0 0 0 0 0 0 0 1 0 0 \n", "1 0 0 0 0 1 0 0 1 0 0 \n", "2 0 1 0 1 0 1 0 0 0 0 \n", "3 1 0 1 0 0 0 1 0 1 0 \n", "4 1 0 0 0 1 0 1 0 0 0 \n", "5 0 0 0 1 0 1 0 0 0 0 \n", "6 0 0 0 0 0 0 0 2 0 1 \n", "7 0 0 0 1 0 1 0 0 0 0 " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##\n", "## Representacion como un dataframe\n", "##\n", "pd.DataFrame(bow, columns=cv.get_feature_names())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bag of N-Grams model" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
bacon eggs00010000
beautiful sky01000000
beautiful today00000010
blue beautiful11000000
blue dog00000100
blue sky00000010
breakfast sausages00010000
brown fox00100101
dog lazy00000101
eggs ham00001000
eggs toast00010000
fox jumps00100000
fox quick00000101
green eggs00001000
ham bacon00010000
ham sausages00001000
jumps lazy00100000
kings breakfast00010000
lazy brown00000001
lazy dog00100000
love blue01000000
love green00001000
quick blue00000100
quick brown00100000
sausages bacon00001000
sausages ham00010000
sky beautiful00000010
sky blue10000010
toast beans00010000
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "bacon eggs 0 0 0 1 0 0 0 0\n", "beautiful sky 0 1 0 0 0 0 0 0\n", "beautiful today 0 0 0 0 0 0 1 0\n", "blue beautiful 1 1 0 0 0 0 0 0\n", "blue dog 0 0 0 0 0 1 0 0\n", "blue sky 0 0 0 0 0 0 1 0\n", "breakfast sausages 0 0 0 1 0 0 0 0\n", "brown fox 0 0 1 0 0 1 0 1\n", "dog lazy 0 0 0 0 0 1 0 1\n", "eggs ham 0 0 0 0 1 0 0 0\n", "eggs toast 0 0 0 1 0 0 0 0\n", "fox jumps 0 0 1 0 0 0 0 0\n", "fox quick 0 0 0 0 0 1 0 1\n", "green eggs 0 0 0 0 1 0 0 0\n", "ham bacon 0 0 0 1 0 0 0 0\n", "ham sausages 0 0 0 0 1 0 0 0\n", "jumps lazy 0 0 1 0 0 0 0 0\n", "kings breakfast 0 0 0 1 0 0 0 0\n", "lazy brown 0 0 0 0 0 0 0 1\n", "lazy dog 0 0 1 0 0 0 0 0\n", "love blue 0 1 0 0 0 0 0 0\n", "love green 0 0 0 0 1 0 0 0\n", "quick blue 0 0 0 0 0 1 0 0\n", "quick brown 0 0 1 0 0 0 0 0\n", "sausages bacon 0 0 0 0 1 0 0 0\n", "sausages ham 0 0 0 1 0 0 0 0\n", "sky beautiful 0 0 0 0 0 0 1 0\n", "sky blue 1 0 0 0 0 0 1 0\n", "toast beans 0 0 0 1 0 0 0 0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv = CountVectorizer(\n", " # encoding='utf-8', # codificacion del texto\n", " # decode_error='strict', #\n", " # strip_accents=None, # remocion de acentos\n", " # lowercase=True, #\n", " # preprocessor=None, #\n", " # tokenizer=None, #\n", " # stop_words=None, #\n", " # token_pattern='(?u)\\b\\w\\w+\\b', #\n", " ngram_range=(2, 2), #\n", " # analyzer='word', #\n", " max_df=1.0, #\n", " min_df=1, #\n", " # max_features=None, #\n", " # vocabulary=None, #\n", " # binary=False #\n", ")\n", "\n", "bon = cv.fit_transform(corpus.Normalized_Document.tolist())\n", "pd.DataFrame(bon.toarray(), columns=cv.get_feature_names()).T\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## TF-IDF model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$tfidf= tf \\times idf$$\n", "\n", "* tf: term-frequency\n", "* idf: \n", "\n", "$$1+\\log\\frac{N}{1+df(w)}$$" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
baconbeansbeautifulbluebreakfastbrowndogeggsfoxgreenhamjumpskingslazylovequicksausagesskytoasttoday
00.000.000.600.530.000.000.000.000.000.000.000.000.000.000.000.000.000.600.000.0
10.000.000.490.430.000.000.000.000.000.000.000.000.000.000.570.000.000.490.000.0
20.000.000.000.000.000.380.380.000.380.000.000.530.000.380.000.380.000.000.000.0
30.320.380.000.000.380.000.000.320.000.000.320.000.380.000.000.000.320.000.380.0
40.390.000.000.000.000.000.000.390.000.470.390.000.000.000.390.000.390.000.000.0
50.000.000.000.370.000.420.420.000.420.000.000.000.000.420.000.420.000.000.000.0
60.000.000.360.320.000.000.000.000.000.000.000.000.000.000.000.000.000.720.000.5
70.000.000.000.000.000.450.450.000.450.000.000.000.000.450.000.450.000.000.000.0
\n", "
" ], "text/plain": [ " bacon beans beautiful blue breakfast brown dog eggs fox green \\\n", "0 0.00 0.00 0.60 0.53 0.00 0.00 0.00 0.00 0.00 0.00 \n", "1 0.00 0.00 0.49 0.43 0.00 0.00 0.00 0.00 0.00 0.00 \n", "2 0.00 0.00 0.00 0.00 0.00 0.38 0.38 0.00 0.38 0.00 \n", "3 0.32 0.38 0.00 0.00 0.38 0.00 0.00 0.32 0.00 0.00 \n", "4 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 0.47 \n", "5 0.00 0.00 0.00 0.37 0.00 0.42 0.42 0.00 0.42 0.00 \n", "6 0.00 0.00 0.36 0.32 0.00 0.00 0.00 0.00 0.00 0.00 \n", "7 0.00 0.00 0.00 0.00 0.00 0.45 0.45 0.00 0.45 0.00 \n", "\n", " ham jumps kings lazy love quick sausages sky toast today \n", "0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 0.00 0.0 \n", "1 0.00 0.00 0.00 0.00 0.57 0.00 0.00 0.49 0.00 0.0 \n", "2 0.00 0.53 0.00 0.38 0.00 0.38 0.00 0.00 0.00 0.0 \n", "3 0.32 0.00 0.38 0.00 0.00 0.00 0.32 0.00 0.38 0.0 \n", "4 0.39 0.00 0.00 0.00 0.39 0.00 0.39 0.00 0.00 0.0 \n", "5 0.00 0.00 0.00 0.42 0.00 0.42 0.00 0.00 0.00 0.0 \n", "6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 0.00 0.5 \n", "7 0.00 0.00 0.00 0.45 0.00 0.45 0.00 0.00 0.00 0.0 " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "from sklearn.feature_extraction.text import TfidfTransformer\n", "\n", "cv = CountVectorizer(min_df=0., max_df=1.)\n", "cv_matrix = cv.fit_transform(corpus.Normalized_Document.tolist())\n", "cv_matrix = cv_matrix.toarray()\n", "\n", "\n", "tt = TfidfTransformer(norm='l2', use_idf=True)\n", "tt_matrix = tt.fit_transform(cv_matrix)\n", "tt_matrix = tt_matrix.toarray()\n", "vocab = cv.get_feature_names() \n", "pd.DataFrame(np.round(tt_matrix, 2), columns=vocab)\n" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
baconbeansbeautifulbluebreakfastbrowndogeggsfoxgreenhamjumpskingslazylovequicksausagesskytoasttoday
00.000.000.600.530.000.000.000.000.000.000.000.000.000.000.000.000.000.600.000.0
10.000.000.490.430.000.000.000.000.000.000.000.000.000.000.570.000.000.490.000.0
20.000.000.000.000.000.380.380.000.380.000.000.530.000.380.000.380.000.000.000.0
30.320.380.000.000.380.000.000.320.000.000.320.000.380.000.000.000.320.000.380.0
40.390.000.000.000.000.000.000.390.000.470.390.000.000.000.390.000.390.000.000.0
50.000.000.000.370.000.420.420.000.420.000.000.000.000.420.000.420.000.000.000.0
60.000.000.360.320.000.000.000.000.000.000.000.000.000.000.000.000.000.720.000.5
70.000.000.000.000.000.450.450.000.450.000.000.000.000.450.000.450.000.000.000.0
\n", "
" ], "text/plain": [ " bacon beans beautiful blue breakfast brown dog eggs fox green \\\n", "0 0.00 0.00 0.60 0.53 0.00 0.00 0.00 0.00 0.00 0.00 \n", "1 0.00 0.00 0.49 0.43 0.00 0.00 0.00 0.00 0.00 0.00 \n", "2 0.00 0.00 0.00 0.00 0.00 0.38 0.38 0.00 0.38 0.00 \n", "3 0.32 0.38 0.00 0.00 0.38 0.00 0.00 0.32 0.00 0.00 \n", "4 0.39 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 0.47 \n", "5 0.00 0.00 0.00 0.37 0.00 0.42 0.42 0.00 0.42 0.00 \n", "6 0.00 0.00 0.36 0.32 0.00 0.00 0.00 0.00 0.00 0.00 \n", "7 0.00 0.00 0.00 0.00 0.00 0.45 0.45 0.00 0.45 0.00 \n", "\n", " ham jumps kings lazy love quick sausages sky toast today \n", "0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.60 0.00 0.0 \n", "1 0.00 0.00 0.00 0.00 0.57 0.00 0.00 0.49 0.00 0.0 \n", "2 0.00 0.53 0.00 0.38 0.00 0.38 0.00 0.00 0.00 0.0 \n", "3 0.32 0.00 0.38 0.00 0.00 0.00 0.32 0.00 0.38 0.0 \n", "4 0.39 0.00 0.00 0.00 0.39 0.00 0.39 0.00 0.00 0.0 \n", "5 0.00 0.00 0.00 0.42 0.00 0.42 0.00 0.00 0.00 0.0 \n", "6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.72 0.00 0.5 \n", "7 0.00 0.00 0.00 0.45 0.00 0.45 0.00 0.00 0.00 0.0 " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "tv = TfidfVectorizer(min_df=0.0, max_df=1.0, norm=\"l2\", use_idf=True, smooth_idf=True)\n", "tv_matrix = tv.fit_transform(corpus.Normalized_Document.tolist())\n", "tv_matrix = tv_matrix.toarray()\n", "\n", "vocab = tv.get_feature_names() \n", "pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Similaridad de documentos" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
01.0000000.8205990.0000000.0000000.0000000.1923530.8172460.000000
10.8205991.0000000.0000000.0000000.2254890.1578450.6706310.000000
20.0000000.0000001.0000000.0000000.0000000.7918210.0000000.850516
30.0000000.0000000.0000001.0000000.5068660.0000000.0000000.000000
40.0000000.2254890.0000000.5068661.0000000.0000000.0000000.000000
50.1923530.1578450.7918210.0000000.0000001.0000000.1154880.930989
60.8172460.6706310.0000000.0000000.0000000.1154881.0000000.000000
70.0000000.0000000.8505160.0000000.0000000.9309890.0000001.000000
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 1.000000 0.820599 0.000000 0.000000 0.000000 0.192353 0.817246 \n", "1 0.820599 1.000000 0.000000 0.000000 0.225489 0.157845 0.670631 \n", "2 0.000000 0.000000 1.000000 0.000000 0.000000 0.791821 0.000000 \n", "3 0.000000 0.000000 0.000000 1.000000 0.506866 0.000000 0.000000 \n", "4 0.000000 0.225489 0.000000 0.506866 1.000000 0.000000 0.000000 \n", "5 0.192353 0.157845 0.791821 0.000000 0.000000 1.000000 0.115488 \n", "6 0.817246 0.670631 0.000000 0.000000 0.000000 0.115488 1.000000 \n", "7 0.000000 0.000000 0.850516 0.000000 0.000000 0.930989 0.000000 \n", "\n", " 7 \n", "0 0.000000 \n", "1 0.000000 \n", "2 0.850516 \n", "3 0.000000 \n", "4 0.000000 \n", "5 0.930989 \n", "6 0.000000 \n", "7 1.000000 " ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "\n", "similarity_matrix = cosine_similarity(tv_matrix) \n", "similarity_df = pd.DataFrame(similarity_matrix) \n", "similarity_df" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Document\\Cluster 1Document\\Cluster 2DistanceCluster Size
0270.2530982
1060.3085392
2580.3869523
3190.4898453
4340.7329452
511122.695655
610133.451088
\n", "
" ], "text/plain": [ " Document\\Cluster 1 Document\\Cluster 2 Distance Cluster Size\n", "0 2 7 0.253098 2\n", "1 0 6 0.308539 2\n", "2 5 8 0.386952 3\n", "3 1 9 0.489845 3\n", "4 3 4 0.732945 2\n", "5 11 12 2.69565 5\n", "6 10 13 3.45108 8" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.cluster.hierarchy import dendrogram, linkage\n", "\n", "Z = linkage(similarity_matrix, \"ward\")\n", "pd.DataFrame(\n", " Z,\n", " columns=[\"Document\\Cluster 1\", \"Document\\Cluster 2\", \"Distance\", \"Cluster Size\"],\n", " dtype=\"object\",\n", ")" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfEAAADjCAYAAACVWy1ZAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfGklEQVR4nO3debgcZZn38e+PJJJAWIRECIFwRJYAItsRjA6SEZCAQJgBh0UDYcQwalQQ51WcGQRGX5d3RhwFgSj7viohBgVHwqJsCYaEbBj2hARCWEICBBPu9496TmiaPqf7hK5Tqc7vc111pZanqu6q7py7n6qn6lFEYGZmZuWzTtEBmJmZ2epxEjczMyspJ3EzM7OSchI3MzMrKSdxMzOzknISNzMzKykncVvjSZohafgaEMdoSfd0sfxWScfnuY8G1p8k6cT3EkMzSNpH0pyi42gGScMlzSs6DrNanMStUJKelLR/1bx3JLKI2DkiJvV4cN0UEQdFxKV57kPS+ySdIemvkpal83eRpLYm7uM9/ZAAiIi7I2KHZsVUKf1QeUPSq5KWSJoi6duS1s1jf2ZrMidxa1mSeq/GOr3yiKWJbgAOA44FNgJ2BaYA+xUZVKXVOe+rYWxEbAAMAk4FjgYmSlIP7HuVZh9rD507ayFO4rbGq6ytS1on1boek7RY0nWSNknL2iSFpC9Iehr4Y5p/vaSFkl6RdJeknSu2fYmk8yRNlLQM+HtJW0m6SdKitI9zquL5L0kvSXpC0kEV899xKVvSFyXNSjXGmZL2SPM74u+Y/w8Nnof9gQOAkRHxYESsiIhXIuLciLiwRvkzJF1RMd1xfnqn6dGSHk9xPCHpc5J2BM4HhklaKunlVHbddNxPS3pO0vmS+qVlwyXNk/QtSQuBi6svQafP8JuSpqXP4VpJfSuW/x9JCyQ9K+nEFOe29c5JRCxLV2kOA4YBn0nba+R7cnw6nhck/VtFLP3S9+IlSTOBj1ad1yfTsU4DlknqLekwZbd9Xk7fgx0ryu8h6S/pPF+fjv17XZy790uakL5/L6XxLSu2N0nS9yT9OX1Gt0jaVNKVyq5MPKgmXpmxNZuTuJXNV4HDgX2BLYCXgHOryuwL7AgcmKZvBbYDPgA8BFxZVf5Y4PvABsC9wATgKaANGAxcU1F2b2AOMAD4MXCh9O7an6TPAmcAxwEbkiWZxWnxY8A+ZDXpM4ErJA1q4Nj3Bx6IiGcaKNslSesDPwMOSjXajwNTI2IW8C/AvRHRPyI2Tqv8ENge2A3Yluy8nF6xyc2BTYCtgTGd7PafgBHAB4GPAKNTLCOAb6Tj2xYY3t3jiYingclk5xUa+578HbAD2VWM0ysS73eBD6XhQKBWO4djyH4wbAxsA1wNnAwMBCYCtyi79fE+4NfAJWTn52qg+kdb9blbB7g4TQ8BXgfOqVrnaGAU2efwIbLv7cVpO7PSMdjaICI8eChsAJ4ElgIvVwyvAfdUldk/jc8C9qtYNgj4G9CbLOkGsE0X+9s4ldkoTV8CXFaxfBiwCOhdY93RwNyK6fXStjZP05OAE9P474GvN3gOppLVrjv2cU8n5X4JXFNnW5UxnAFcUbGs4/z0BtZP5/oIoF+N46w8/wKWAR+qOk9PpPHhwJtA34rlw4F5VZ/h5yumfwycn8YvAn5QsWzbFOe29Y6xav41wC+78T3ZsmL5A8DRafxxYETFsjE1juWfK6b/A7iuYnodYH46B59M46pYfg/wvc7OXY3j2g14qer4/61i+r+BWyumDyX7QVb4/28P+Q+uidua4PCI2LhjAL7cRdmtgV+ny5Yvk/2xXglsVlFmVU1VUi9JP0yXVZeQ/QGGrCb9rvLAVsBTEbGik/0v7BiJiNfSaP8a5bYiq3G/i6TjJE2tOIYPV8XTmcVkyeg9i4hlwFFkte4Fkn4raWgnxQeS/WCZUhHz79L8Dosi4o06u11YMf4ab5+3LXjnZ7C6VxoGAy+m8Ua+J43G81SNfVUu36KyTES8lZYPTsvmR0R0si5UnTtJ60m6QNJT6Tt7F7Cx3tle47mK8ddrTNf6TloLchK3snmG7BLwxhVD34iYX1Gm8g/mscBIsku1G5HVwiCrXdYq/wwwRO+9gdEzZJc530HS1mQ16rHApulHyyNV8XTmD8BelfdH61hGlnw7bF65MCJ+HxEHkP0wmJ3igneeD4AXyBLDzhXnfKOIqEwU76U7xAVA5TFt1d0NSNoK2BO4O81q5HvSVTyVMQypUabyeJ8l+9HQEYvS+vPTtgZX3XKpPr7qc3cq2WX+vSNiQ7LaPDT2HbG1jJO4lc35wPdTMkTSQEkjuyi/AbCcrBa7HvB/62z/AbI/vD+UtL6kvpI+sRpx/gr4pqQ9ldk2xbw+2R/tRSn+E8hq4nVFxB+A28lqmHumBlUbSPoXSf9cY5WpwCclDZG0EXBaxwJJm0kame6NLye7pfFWWvwcsGW6n9tRs/wlcLakD6T1B0s6kOa4DjhB0o6S1iO7PN2QVGvdF7iZ7LObmBZ193tSHc9pqYHZlmT31+uV/4yk/ST1IUvCy4E/k92rXgmMTZ/XSGCvOtvbgOxH08vKGuP5/rZ1ykncyuZ/gPHAbZJeBe4ja2zWmcvILnXOB2am8p2KiJVk9xS3BZ4G5pFddu6WiLierLHcVcCrwG+ATSJiJtk9zHvJkuUuwJ+6sekjyRLVtcArZLX4drJaenUMt6dy08geQ5tQsXgdssZkz5Jdgt4X+FJa9kdgBrBQ0gtp3reAucB96RLvH8hqi+9ZRNxK1sjujo59pEXLu1jtnPT5Pwf8FLiR7D52xw+R7n5PKp1J9p15ArgNuLxO/HOAzwM/J7tqcShwaES8GRFvAv8IfIGsDcLnyT6Hro7tp0C/tK37yG5dmNWkd96qMTMrVmol/giwbhdtE0pL0v1kjfouLjoWKz/XxM2scJL+Qdmz6O8HfgTc0ioJXNK+kjZPl9OPJ3u8zrVrawoncTNbE5wEPE/Won8lb1/abwU7AA+TXU4/FTgyIhYUGpG1DF9ONzMzKynXxM3MzErKSdzMzKykStdjzoABA6Ktra3oMMzMzHrMlClTXoiIgdXzS5fE29ramDx5ctFhmJmZ9RhJtV7/m9/l9PSmqwckPaysi74za5QZnbrbm5qGE2tty8zMzN4tz5r4cuBTEbE0vYrwHkm3RkT1G7OujYixOcZhZmbWknJL4qnXnqVpsk8a/DybmZlZk+TaOj11AzmV7CUOt0fE/TWKHSFpmqQbUk9EtbYzRtJkSZMXLVqUZ8hmZmal0SMve5G0MfBr4KsR8UjF/E2BpRGxXNJJwFER8amuttXe3h5u2PZOV93/NDdPbaSHRTMrg5G7DebYvWv1gGprK0lTIqK9en6PPCceES+T9VA0omr+4ojo6M3nV2T9AVs33Tx1PjMXLCk6DDNrgpkLlvhHuTUst3vikgYCf4uIlyX1Aw4g69igssygincIHwbMyiueVrfToA259qRhRYdhZu/RURfcW3QIViJ5tk4fBFwqqRdZjf+6iJgg6SxgckSMB74m6TBgBVmfxqNzjMfMzKyl5Nk6fRqwe435p1eMnwacllcMZmZmrczvTjczMyspJ3EzM7OSchI3MzMrKSdxMzOzknISNzMzKykncTMzs5JyEjczMyspJ3EzM7OSchI3MzMrKSdxMzOzknISNzMzKykncTMzs5JyEjczMyup3JK4pL6SHpD0sKQZks6sUWZdSddKmivpfkltecVjZmbWavKsiS8HPhURuwK7ASMkfayqzBeAlyJiW+Bs4Ec5xmNmZtZSckvikVmaJvukIaqKjQQuTeM3APtJUl4xmZmZtZJc74lL6iVpKvA8cHtE3F9VZDDwDEBErABeATbNMyYzM7NWkWsSj4iVEbEbsCWwl6QPr852JI2RNFnS5EWLFjU1RjMzs7LqkdbpEfEycAcwomrRfGArAEm9gY2AxTXWHxcR7RHRPnDgwJyjNTMzK4c8W6cPlLRxGu8HHADMrio2Hjg+jR8J/DEiqu+bm5mZWQ29c9z2IOBSSb3IfixcFxETJJ0FTI6I8cCFwOWS5gIvAkfnGI+ZmVlLyS2JR8Q0YPca80+vGH8D+GxeMZiZmbUyv7HNzMyspJzEzczMSspJ3MzMrKScxM3MzErKSdzMzKyknMTNzMxKyknczMyspPJ82YuZreWuuv9pbp46v+gwSmXmgiUAHHXBvQVHUi4jdxvMsXsPKTqMHueauJnl5uap81clJWvMToM2ZKdBGxYdRqnMXLBkrf2x6Jq4meVqp0Ebcu1Jw4oOw1rY2nzVwjVxMzOzknISNzMzKykncTMzs5JyEjczMyup3JK4pK0k3SFppqQZkr5eo8xwSa9ImpqG02tty8zMzN4tz9bpK4BTI+IhSRsAUyTdHhEzq8rdHRGH5BiHmZlZS8qtJh4RCyLioTT+KjALGJzX/szMzNY2PXJPXFIbsDtwf43FwyQ9LOlWSTt3sv4YSZMlTV60aFGeoZqZmZVG7klcUn/gRuDkiKh+ddNDwNYRsSvwc+A3tbYREeMioj0i2gcOHJhrvGZmZmWRaxKX1IcsgV8ZETdVL4+IJRGxNI1PBPpIGpBnTGZmZq0iz9bpAi4EZkXETzops3kqh6S9UjyL84rJzMysleTZOv0TwChguqSpad53gCEAEXE+cCTwJUkrgNeBoyMicozJzMysZeSWxCPiHkB1ypwDnJNXDGZmZq3Mb2wzMzMrKSdxMzOzknISNzMzKykncTMzs5JyEjczMyspJ3EzM7OSajiJS9pa0v5pvF/qmczMzMwK0lASl/RF4AbggjRrSzp5z7mZmZn1jEZr4l8hewPbEoCI+CvwgbyCMjMzs/oaTeLLI+LNjglJvQG/HtXMzKxAjSbxOyV9B+gn6QDgeuCW/MIyMzOzehpN4t8GFgHTgZOAicC/5xWUmZmZ1ddoByj9gIsi4pcAknqlea/lFZiZmZl1rdGa+P+SJe0O/YA/dLWCpK0k3SFppqQZkr5eo4wk/UzSXEnTJO3ReOhmZmZrt0Zr4n0jYmnHREQslbRenXVWAKdGxEPpmfIpkm6PiJkVZQ4CtkvD3sB56V8zMzOro9Ga+LLKWrKkPYHXu1ohIhZExENp/FVgFjC4qthI4LLI3AdsLGlQw9GbmZmtxRqtiZ8MXC/pWUDA5sBRje5EUhuwO3B/1aLBwDMV0/PSvAWNbtvMzGxt1VASj4gHJQ0Fdkiz5kTE3xpZV1J/4Ebg5IhYsjpBShoDjAEYMmTI6mzCzMys5TRaEwf4KNCW1tlDEhFxWVcrSOpDlsCvjIibahSZD2xVMb1lmvcOETEOGAfQ3t7ul8yYmZnRYBKXdDnwIWAqsDLNDqDTJC5JwIXArIj4SSfFxgNjJV1D1qDtlYjwpXQzM7MGNFoTbwd2ioju1II/AYwCpkuamuZ9BxgCEBHnk7005mBgLtkz5yd0Y/tmZmZrtUaT+CNkjdkariVHxD1kjeC6KhNknauYmZlZNzWaxAcAMyU9ACzvmBkRh+USlZmZmdXVaBI/I88gzMzMrPsafcTszrwDMTMzs+5p6I1tkj4m6UFJSyW9KWmlpNV65tvMzMyao9HXrp4DHAP8lazzkxOBc/MKyszMzOprNIkTEXOBXhGxMiIuBkbkF5aZmZnV02jDttckvQ+YKunHZI+aNfwDwMzMzJqv0UQ8KpUdCywje1XqP+YVlJmZmdXXaBI/PCLeiIglEXFmRHwDOCTPwMzMzKxrjSbx42vMG93EOMzMzKyburwnLukY4Fjgg5LGVyzaEHgxz8DMzMysa/Uatv2ZrBHbAOC/K+a/CkzLKygzMzOrr8skHhFPAU9J2h94PSLekrQ9MBSY3hMBmpmZWW2N3hO/C+graTBwG1lr9UvyCsrMzMzqazSJKyJeI3us7BcR8Vlg5y5XkC6S9LykRzpZPlzSK5KmpuH07oVuZma2dms4iUsaBnwO+G2a16vOOpdQ/61ud0fEbmk4q8FYzMzMjMaT+MnAacCvI2KGpG2AO7paISLuwi3YzczMctOdrkjvrJh+HPhaE/Y/TNLDwLPANyNiRq1CksYAYwCGDBnShN2amZmVX73nxH8aESdLugWI6uURcdh72PdDwNYRsVTSwcBvgO1qFYyIccA4gPb29nfFYWZmtjaqVxO/PP37X83ecUQsqRifKOkXkgZExAvN3peZmVkrqvec+JT0752SBqbxRc3YsaTNgeciIiTtRXZ/fnEztm1mZrY2qHtPXNIZZL2XrZNNagXw83qtySVdDQwHBkiaB3wX6AMQEecDRwJfStt7HTg6Inyp3MzMrEH17ol/A/gE8NGIeCLN2wY4T9IpEXF2Z+tGxDFdbTsizgHO6X7IZmZmBvUfMRsFHNORwGFVy/TPA8flGZiZmZl1rV4S71OroVm6L94nn5DMzMysEfWS+JuruczMzMxyVq9h266SltSYL6BvDvGYmZlZg+o9Ylbv/ehmZmZWkEbfnW5mZmZrGCdxMzOzknISNzMzKykncTMzs5JyEjczMyspJ3EzM7OSchI3MzMrKSdxMzOzksotiUu6SNLzkh7pZLkk/UzSXEnTJO2RVyxmZmatKM+a+CXAiC6WHwRsl4YxwHk5xmJmZtZyckviEXEX8GIXRUYCl0XmPmBjSYPyisfMzKzVFHlPfDDwTMX0vDTPzMzMGlCvF7M1gqQxZJfc2WyzzTjjjDM44ogjmDRpEosXL2bMmDGMGzeOXXbZhf79+3PvvfdyzDHHMGHCBJYvX86xxx7LJZdcwp577gnAlClTGD16NFdddRXrrrsuhxxyCFdffTXDhg1j6dKlTJ8+fdU2N910U4YPH86NN97I8OHDefbZZ3n00UdXLR80aBDt7e3ccsstfPrTn+bRRx/lySefXLW8ra2N7bffnttuu41DDz2UyZMns2DBglXLt99+e7bYYgsmTZq02sf07LTZvPjUbJ48cFDLHFMrfk5r4zHNe2gKL89/jGcP3bpljqkVP6eyH9PMWy+lV+8+zBm+ScscU/Xn1Gl+jIhm59y3Ny61ARMi4sM1ll0ATIqIq9P0HGB4RCzoapvt7e0xefLkPMItraMuuBeAa08aVnAkZu/k76b1hLXheyZpSkS0V88v8nL6eOC41Er9Y8Ar9RK4mZmZvS23y+mSrgaGAwMkzQO+C/QBiIjzgYnAwcBc4DXghLxiMTMza0W5JfGIOKbO8gC+ktf+zczMWp3f2GZmZlZSTuJmZmYl5SRuZmZWUk7iZmZmJeUkbmZmVlJO4mZmZiVViteumplZebx07XUsmTChx/a3fMDfA/DUqJ7rDHPDQw7h/Uf9U4/trzNO4mZm1lRLJkzgjdmz6Tt0aI/s739euKNH9tPhjdmzAZzEzcysNfUdOpStL7+s6DBy8dSo44oOYRXfEzczMyspJ3EzM7OSchI3MzMrKSdxMzOzknISNzMzK6lck7ikEZLmSJor6ds1lo+WtEjS1DScmGc8ZmZmrSS3R8wk9QLOBQ4A5gEPShofETOril4bEWPzisPMzKxV5VkT3wuYGxGPR8SbwDXAyBz3Z2ZmtlbJ82Uvg4FnKqbnAXvXKHeEpE8CjwKnRMQz1QUkjQHGAAwZMiSHUM3K5/pHr2fi4xOLDqNLc17cF4ATfjeu4EjqO3ibg/ns9p8tOgyzbim6YdstQFtEfAS4Hbi0VqGIGBcR7RHRPnDgwB4N0GxNNfHxicx5cU7RYXRp993vZPfd7yw6jLrmvDhnjf9BZFZLnjXx+cBWFdNbpnmrRMTiislfAT/OMR6zlrPDJjtw8YiLiw6j9E743QlFh2C2WvKsiT8IbCfpg5LeBxwNjK8sIGlQxeRhwKwc4zEzM2spudXEI2KFpLHA74FewEURMUPSWcDkiBgPfE3SYcAK4EVgdF7xrJbJF8P0G4qOor6Fqb3gxd8rNo5G7HIktLvWY2bWDLn2YhYRE4GJVfNOrxg/DTgtzxjek+k3wMLpsPkuRUfSpWuH3Fx0CI1ZOD3710nczKwp3BVpPZvvAif8tugoWsPFnyk6AjOzllJ063QzMzNbTa6Jm9kap6efgZ/94mygZ1up+7l0awbXxM1sjdPTz8AP3WQoQzcZ2mP783Pp1iyuia/Nerr1/cJp2b89eW/creFLq5Wfgfdz6dYsromvzTpa3/eUzT+SDT1l4fRyPCJoZraaXBNf27Vy63u3hjezFueauJmZWUk5iZuZmZWUk7iZmVlJOYmbmZmVlJO4mZlZSTmJm5mZlVSuSVzSCElzJM2V9O0ay9eVdG1afr+ktjzjMTMzayW5JXFJvYBzgYOAnYBjJO1UVewLwEsRsS1wNvCjvOIxMzNrNXnWxPcC5kbE4xHxJnANMLKqzEjg0jR+A7CfJOUYk5mZWcvIM4kPBp6pmJ6X5tUsExErgFeATXOMyczMrGUoIvLZsHQkMCIiTkzTo4C9I2JsRZlHUpl5afqxVOaFqm2NAcakyR2AnuveyMzMrHhbR8TA6pl5vjt9PrBVxfSWaV6tMvMk9QY2AhZXbygixgHjcorTzMyslPK8nP4gsJ2kD0p6H3A0ML6qzHjg+DR+JPDHyOvSgJmZWYvJrSYeESskjQV+D/QCLoqIGZLOAiZHxHjgQuBySXOBF8kSvZmZmTUgt3viZmZmli+/sc3MzKyknMTNzMxKyknczMyspJzEOyFpkqQ3JC1NQ8s8m57eWX+hpKckvSppqqSDio6rWSo+s45hpaSfFx1Xs0gaK2mypOWSLik6nmaTtImkX0talr6jxxYdU7NJOlrSrHSMj0nap+iYmkXSFZIWSFoi6VFJJxYdUx4kbZdyxBVFxpHnc+KtYGxE/KroIHLQm+xNefsCTwMHA9dJ2iUiniwysGaIiP4d45L6AwuB64uLqOmeBb4HHAj0KziWPJwLvAlsBuwG/FbSwxExo9ComkTSAWT9RBwFPAAMKjaipvsB8IWIWC5pKDBJ0l8iYkrRgTXZuWSPUhfKNfG1UEQsi4gzIuLJiHgrIiYATwB7Fh1bDo4AngfuLjqQZomImyLiN9R4MVLZSVqf7DP7j4hYGhH3kL1PYlSxkTXVmcBZEXFf+v83PyKqX4RVWhExIyKWd0ym4UMFhtR0ko4GXgb+t+BQnMTr+IGkFyT9SdLwooPJi6TNgO2BlqjpVDkeuMwvESqN7YEVEfFoxbyHgZ0LiqepUu+O7cDA1AXzPEnnSGqpKyqSfiHpNWA2sACYWHBITSNpQ+As4BtFxwJO4l35FrANWSct44BbJLXUr0kASX2AK4FLI2J20fE0k6StyW4ZXFqvrK0x+gNLqua9AmxQQCx52AzoQ/aGyn3IbhfsDvx7gTE1XUR8mewz2we4CVje9Rql8p/AhR19fhTNSbwTEXF/RLwaEcsj4lLgT2T3jluGpHWAy8nuP46tU7yMRgH3RMQTRQdiDVsKbFg1b0Pg1QJiycPr6d+fR8SC1NnTT2ixvy0AEbEy3Q7ZEvhS0fE0g6TdgP2BswsOZRU3bGtcAC3T13nqt/1CsprBwRHxt4JDysNxwA+LDsK65VGgt6TtIuKvad6utMitnoh4SdI8sr8nq2YXFU8P6U3r3BMfDrQBT2d/QukP9JK0U0TsUURAronXIGljSQdK6iupt6TPAZ8Efld0bE10HrAjcGhEvF6vcNlI+jjZrZBWapUOQPpO9iXrk6BXx/e06LiaISKWkV1+PUvS+pI+AYwku2LUKi4GvirpA5LeD5wCTCg4pqZIx3S0pP6Sekk6EDiGNaABWJOMI/tBslsazgd+S/akSCFa4j9+DvqQPcIzFFhJ1jjj8KrGNqWV7hWfRHafamH6RQlwUkRcWVhgzXU8cFNEtMpl2Er/Dny3YvrzZC2ezygkmub7MnAR2VMFi4EvtcrjZcl/AgPIrjq8AVwHfL/QiJonyC6dn09WSXwKODl1eFV6EfEa8FrHtKSlwBsRsaiomNwBipmZWUn5crqZmVlJOYmbmZmVlJO4mZlZSTmJm5mZlZSTuJmZWUk5iZuZmZWUk7hZC0h9pk+VNEPSw5JOTa/V7Wqdtp7oq1vSryTtVKfM4fXKmNm7OYmbtYbXI2K3iNgZOAA4iHe+EKaWNiD3JB4RJ0bEzDrFDgecxM26yUncrMVExPPAGGCsMm2S7pb0UBo+nor+ENgn1eBP6aLcKqnMbElXSpol6QZJ66Vl+0n6i6Tpki6StG6aP0lSexpfKun76WrBfZI2S/s5DPh/KZZWec+2We6cxM1aUEQ8TvZu9Q+Qvb70gNRBw1HAz1KxbwN3pxr82V2Uq7YD8IuI2JGs29Avp3e5XwIcFRG7kL3SuVbPVesD90XErsBdwBcj4s/AeOBfUyyPvcfDN1trOImbtb4+wC8lTSfrEKazy9aNlnsmIv6Uxq8A/o4ssT9R0b/ApWSdBlV7k7c7+5hCdknfzFaTO0Axa0GStiHrvOd5snvjz5F16bkOWacbtZzSYLnqDhe60wHD3+LtDhtW4r9BZu+Ja+JmLUbSQLJepM5JCXMjYEFEvAWMIrvMDvAqsEHFqp2VqzZE0rA0fixwDzAHaJO0bZo/CrizG2FXx2JmDXASN2sN/ToeMQP+ANxG1j0pwC+A4yU9TNa97rI0fxqwMjUyO6WLctXmAF+RNAt4P3BeRLwBnABcny7Hv0X2Q6JR1wD/mhrGuWGbWYPcFamZNUxSGzAhIj5cdCxm5pq4mZlZabkmbmZmVlKuiZuZmZWUk7iZmVlJOYmbmZmVlJO4mZlZSTmJm5mZlZSTuJmZWUn9f/BTACg9IrqLAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(8, 3)) \n", "plt.title('Hierarchical Clustering Dendrogram') \n", "plt.xlabel('Data point')\n", "plt.ylabel('Distance')\n", "dendrogram(Z)\n", "plt.axhline(y=1.0, c='k', ls='--', lw=0.5)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DocumentLabelsNormalized_DocumentClusterLabel
0The sky is blue and beautiful.weathersky blue beautiful2
1Love this blue and beautiful sky!weatherlove blue beautiful sky2
2The quick brown fox jumps over the lazy dog.animalsquick brown fox jumps lazy dog1
3A king's breakfast has sausages, ham, bacon, e...foodkings breakfast sausages ham bacon eggs toast ...3
4I love green eggs, ham, sausages and bacon!foodlove green eggs ham sausages bacon3
5The brown fox is quick and the blue dog is lazy!animalsbrown fox quick blue dog lazy1
6The sky is very blue and the sky is very beaut...weathersky blue sky beautiful today2
7The dog is lazy but the brown fox is quick!animalsdog lazy brown fox quick1
\n", "
" ], "text/plain": [ " Document Labels \\\n", "0 The sky is blue and beautiful. weather \n", "1 Love this blue and beautiful sky! weather \n", "2 The quick brown fox jumps over the lazy dog. animals \n", "3 A king's breakfast has sausages, ham, bacon, e... food \n", "4 I love green eggs, ham, sausages and bacon! food \n", "5 The brown fox is quick and the blue dog is lazy! animals \n", "6 The sky is very blue and the sky is very beaut... weather \n", "7 The dog is lazy but the brown fox is quick! animals \n", "\n", " Normalized_Document ClusterLabel \n", "0 sky blue beautiful 2 \n", "1 love blue beautiful sky 2 \n", "2 quick brown fox jumps lazy dog 1 \n", "3 kings breakfast sausages ham bacon eggs toast ... 3 \n", "4 love green eggs ham sausages bacon 3 \n", "5 brown fox quick blue dog lazy 1 \n", "6 sky blue sky beautiful today 2 \n", "7 dog lazy brown fox quick 1 " ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.cluster.hierarchy import fcluster \n", "\n", "max_dist = 1.0\n", "cluster_labels = fcluster(Z, max_dist, criterion='distance') \n", "cluster_labels = pd.DataFrame(cluster_labels, columns=['ClusterLabel']) \n", "pd.concat([corpus, cluster_labels], axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Topic Modeling --- Latent Dirichlet Allocation" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
T1T2T3
00.8321910.0834800.084329
10.8635540.0691000.067346
20.0477940.0477760.904430
30.0372430.9255590.037198
40.0491210.9030760.047802
50.0549020.0477780.897321
60.8882870.0556970.056016
70.0557040.0556890.888607
\n", "
" ], "text/plain": [ " T1 T2 T3\n", "0 0.832191 0.083480 0.084329\n", "1 0.863554 0.069100 0.067346\n", "2 0.047794 0.047776 0.904430\n", "3 0.037243 0.925559 0.037198\n", "4 0.049121 0.903076 0.047802\n", "5 0.054902 0.047778 0.897321\n", "6 0.888287 0.055697 0.056016\n", "7 0.055704 0.055689 0.888607" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.decomposition import LatentDirichletAllocation\n", "\n", "lda = LatentDirichletAllocation(n_components=3, max_iter=10000, random_state=0)\n", "dt_matrix = lda.fit_transform(cv_matrix)\n", "features = pd.DataFrame(dt_matrix, columns=['T1', 'T2', 'T3'])\n", "features" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('sky', 4.332439442470133), ('blue', 3.373774254787669), ('beautiful', 3.3323650509884386), ('today', 1.3325579855138987), ('love', 1.330415818217548)]\n", "\n", "[('bacon', 2.33269586574902), ('eggs', 2.33269586574902), ('ham', 2.33269586574902), ('sausages', 2.33269586574902), ('love', 1.3354610533796558), ('beans', 1.3327735190105536), ('breakfast', 1.3327735190105536), ('kings', 1.3327735190105536), ('toast', 1.3327735190105536), ('green', 1.3325431515674175)]\n", "\n", "[('brown', 3.3323473548404405), ('dog', 3.3323473548404405), ('fox', 3.3323473548404405), ('lazy', 3.3323473548404405), ('quick', 3.3323473548404405), ('jumps', 1.3324193772908193), ('blue', 1.2919423137963386)]\n", "\n" ] } ], "source": [ "tt_matrix = lda.components_\n", "for topic_weights in tt_matrix:\n", " topic = [(token, weight) for token, weight in zip(vocab, topic_weights)] \n", " topic = sorted(topic, key=lambda x: -x[1])\n", " topic = [item for item in topic if item[1] > 0.6]\n", " print(topic)\n", " print()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }