diff --git a/Untitled31.ipynb b/Untitled31.ipynb
new file mode 100644
index 0000000..39e2991
--- /dev/null
+++ b/Untitled31.ipynb
@@ -0,0 +1,852 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import nltk\n",
+ "import re\n",
+ "import os\n",
+ "from sklearn import feature_extraction\n",
+ "import pickle\n",
+ "import scipy\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "wnl = WordNetLemmatizer()\n",
+ "from nltk import word_tokenize\n",
+ "from nltk.corpus import stopwords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "100 100 100\n"
+ ]
+ }
+ ],
+ "source": [
+ "def _load_pickle(path):\n",
+ "    \"\"\"Return the object pickled at `path`, closing the file handle afterwards.\"\"\"\n",
+ "    # NOTE: unpickling can execute arbitrary code -- only load files you trust.\n",
+ "    with open(path, 'rb') as handle:\n",
+ "        return pickle.load(handle)\n",
+ "\n",
+ "titles = _load_pickle('data/titles.pkl')\n",
+ "genres = _load_pickle('data/genres.pkl')\n",
+ "synopsis = _load_pickle('data/synopses.pkl')\n",
+ "print(len(titles), len(genres), len(synopsis))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Plot [edit] [ [ edit edit ] ] \n",
+ " On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfather, the Don of a New York crime family. Vito's youngest son, Michael, in a Marine Corps uniform, introduces his girlfriend, Kay Adams, to his family at the sprawling reception. Vito's godson Johnny Fontane, a popular singer, pleads for help in securing a coveted movie role, so Vito dispatches his consigliere, Tom Hagen, to Los Angeles to influence the abrasive studio head, Jack Woltz. Woltz is unmoved until the morning he wakes up in bed with the severed head of his prized stallion. On the day of his only daughter's wedding, Vito Corleone Vito Corleone hears requests in his role as the Godfather, the Don Don of a New York crime family. Vito's youngest son, Michael Michael , in a Marine Corps Marine Corps uniform, introduces his girlfriend, Kay Adams Kay Adams , to his family at the sprawling reception. Vito's godson Johnny\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Peek at the first 1000 characters of the first synopsis, before any cleaning.\n",
+ "first_synopsis = synopsis[0]\n",
+ "print(first_synopsis[:1000])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Built once rather than on every call; compared against lowercased tokens below.\n",
+ "_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))\n",
+ "\n",
+ "def preprocess_data(text):\n",
+ "    \"\"\"Normalise raw synopsis text into a space-separated string of lemmatised tokens.\n",
+ "\n",
+ "    Steps, in order:\n",
+ "      1. POS-tag the whitespace-split text and drop proper nouns (tags NNP / NNPS).\n",
+ "      2. Keep only runs of word characters, which strips punctuation.\n",
+ "      3. Lowercase, then remove NLTK English stop words.\n",
+ "      4. Lemmatise every remaining token as a verb (pos='v').\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    text : str\n",
+ "        Raw document text.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    str\n",
+ "        Cleaned lowercase tokens joined by single spaces.\n",
+ "    \"\"\"\n",
+ "    # Tag before lowercasing so the tagger still sees the original capitalisation.\n",
+ "    tagged_sentence = nltk.tag.pos_tag(text.split())\n",
+ "    without_names = ' '.join(word for word, tag in tagged_sentence if tag not in ('NNP', 'NNPS'))\n",
+ "\n",
+ "    tokens = nltk.RegexpTokenizer(r'\\w+').tokenize(without_names)\n",
+ "\n",
+ "    # Lowercase BEFORE the stop-word test: filtering first let capitalised stop words\n",
+ "    # such as 'On' or 'The' slip through and reappear as 'on' / 'the' in the result.\n",
+ "    lowered = (token.lower() for token in tokens)\n",
+ "    content_words = [token for token in lowered if token not in _ENGLISH_STOPWORDS]\n",
+ "\n",
+ "    return ' '.join(wnl.lemmatize(token, pos='v') for token in content_words)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'edit edit on day daughter wed hear request role crime family youngest son uniform introduce girlfriend family sprawl reception godson popular singer plead help secure covet movie role dispatch consigliere influence abrasive studio head unmoved morning wake bed sever head prize stallion on day daughter wed hear request role crime family youngest son uniform introduce girlfriend family sprawl reception godson'"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Sanity-check the cleaning function on a 1000-character excerpt of the first synopsis.\n",
+ "preprocess_data(synopsis[0][:1000])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Wall time: 19.3 s\n",
+ "(100, 453)\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "\n",
+ "# Float min_df / max_df are document proportions: keep terms found in 20%-80% of synopses.\n",
+ "# `preprocess_data` replaces the vectorizer's built-in preprocessing step.\n",
+ "tfidf_vectorizer = TfidfVectorizer(\n",
+ "    max_df=0.8, max_features=20000, min_df=0.2,\n",
+ "    stop_words='english', preprocessor=preprocess_data,\n",
+ "    use_idf=True, ngram_range=(1, 1)\n",
+ ")\n",
+ "\n",
+ "%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopsis)\n",
+ "\n",
+ "print(tfidf_matrix.shape)\n",
+ "# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available.\n",
+ "if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n",
+ "    terms = list(tfidf_vectorizer.get_feature_names_out())\n",
+ "else:\n",
+ "    terms = tfidf_vectorizer.get_feature_names()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.cluster import KMeans"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# .toarray() yields a plain ndarray; .todense() returns np.matrix, which newer scikit-learn rejects.\n",
+ "kmeans = KMeans(n_clusters=3, random_state=0).fit(tfidf_matrix.toarray())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " cluster | \n",
+ " genre | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " The Godfather | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Shawshank Redemption | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Schindler's List | \n",
+ " 2 | \n",
+ " [u' Biography', u' Drama', u' History'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Raging Bull | \n",
+ " 1 | \n",
+ " [u' Biography', u' Drama', u' Sport'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Casablanca | \n",
+ " 2 | \n",
+ " [u' Drama', u' Romance', u' War'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " One Flew Over the Cuckoo's Nest | \n",
+ " 2 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Gone with the Wind | \n",
+ " 0 | \n",
+ " [u' Drama', u' Romance', u' War'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Citizen Kane | \n",
+ " 0 | \n",
+ " [u' Drama', u' Mystery'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " The Wizard of Oz | \n",
+ " 2 | \n",
+ " [u' Adventure', u' Family', u' Fantasy', u' Mu... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Titanic | \n",
+ " 2 | \n",
+ " [u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Lawrence of Arabia | \n",
+ " 2 | \n",
+ " [u' Adventure', u' Biography', u' Drama', u' H... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Godfather: Part II | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Psycho | \n",
+ " 1 | \n",
+ " [u' Horror', u' Mystery', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Sunset Blvd. | \n",
+ " 1 | \n",
+ " [u' Drama', u' Film-Noir'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Vertigo | \n",
+ " 1 | \n",
+ " [u' Mystery', u' Romance', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " On the Waterfront | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Forrest Gump | \n",
+ " 0 | \n",
+ " [u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " The Sound of Music | \n",
+ " 0 | \n",
+ " [u' Biography', u' Drama', u' Family', u' Musi... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " West Side Story | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Musical', u' Romance... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Star Wars | \n",
+ " 2 | \n",
+ " [u' Action', u' Adventure', u' Fantasy', u' Sc... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " E.T. the Extra-Terrestrial | \n",
+ " 1 | \n",
+ " [u' Adventure', u' Family', u' Sci-Fi'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2001: A Space Odyssey | \n",
+ " 2 | \n",
+ " [u' Mystery', u' Sci-Fi'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Silence of the Lambs | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Chinatown | \n",
+ " 1 | \n",
+ " [u' Drama', u' Mystery', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " The Bridge on the River Kwai | \n",
+ " 2 | \n",
+ " [u' Adventure', u' Drama', u' War'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Singin' in the Rain | \n",
+ " 0 | \n",
+ " [u' Comedy', u' Musical', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " It's a Wonderful Life | \n",
+ " 1 | \n",
+ " [u' Drama', u' Family', u' Fantasy'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Some Like It Hot | \n",
+ " 1 | \n",
+ " [u' Comedy'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 12 Angry Men | \n",
+ " 1 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Dr. Strangelove or: How I Learned to Stop Worr... | \n",
+ " 2 | \n",
+ " [u' Comedy', u' War'] | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Rain Man | \n",
+ " 1 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Annie Hall | \n",
+ " 0 | \n",
+ " [u' Comedy', u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Out of Africa | \n",
+ " 0 | \n",
+ " [u' Biography', u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Good Will Hunting | \n",
+ " 0 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Terms of Endearment | \n",
+ " 0 | \n",
+ " [u' Comedy', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Tootsie | \n",
+ " 1 | \n",
+ " [u' Comedy', u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Fargo | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Giant | \n",
+ " 0 | \n",
+ " [u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " The Grapes of Wrath | \n",
+ " 2 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Shane | \n",
+ " 1 | \n",
+ " [u' Drama', u' Romance', u' Western'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Green Mile | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Fantasy', u' Mystery'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Close Encounters of the Third Kind | \n",
+ " 2 | \n",
+ " [u' Drama', u' Sci-Fi'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Network | \n",
+ " 0 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Nashville | \n",
+ " 0 | \n",
+ " [u' Drama', u' Music'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " The Graduate | \n",
+ " 0 | \n",
+ " [u' Comedy', u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " American Graffiti | \n",
+ " 1 | \n",
+ " [u' Comedy', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Pulp Fiction | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " The African Queen | \n",
+ " 2 | \n",
+ " [u' Adventure', u' Romance', u' War'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Stagecoach | \n",
+ " 2 | \n",
+ " [u' Adventure', u' Western'] | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Mutiny on the Bounty | \n",
+ " 2 | \n",
+ " [u' Adventure', u' Drama', u' History'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Maltese Falcon | \n",
+ " 1 | \n",
+ " [u' Drama', u' Film-Noir', u' Mystery'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " A Clockwork Orange | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Sci-Fi'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Taxi Driver | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Wuthering Heights | \n",
+ " 0 | \n",
+ " [u' Drama', u' Romance'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Double Indemnity | \n",
+ " 1 | \n",
+ " [u' Crime', u' Drama', u' Film-Noir', u' Thril... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Rebel Without a Cause | \n",
+ " 1 | \n",
+ " [u' Drama'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Rear Window | \n",
+ " 1 | \n",
+ " [u' Mystery', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " The Third Man | \n",
+ " 1 | \n",
+ " [u' Film-Noir', u' Mystery', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " North by Northwest | \n",
+ " 1 | \n",
+ " [u' Mystery', u' Thriller'] | \n",
+ "
\n",
+ " \n",
+ " | 0 | \n",
+ " Yankee Doodle Dandy | \n",
+ " 0 | \n",
+ " [u' Biography', u' Drama', u' Musical'] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " title cluster \\\n",
+ "1 The Godfather 1 \n",
+ "1 The Shawshank Redemption 1 \n",
+ "2 Schindler's List 2 \n",
+ "1 Raging Bull 1 \n",
+ "2 Casablanca 2 \n",
+ "2 One Flew Over the Cuckoo's Nest 2 \n",
+ "0 Gone with the Wind 0 \n",
+ "0 Citizen Kane 0 \n",
+ "2 The Wizard of Oz 2 \n",
+ "2 Titanic 2 \n",
+ "2 Lawrence of Arabia 2 \n",
+ "1 The Godfather: Part II 1 \n",
+ "1 Psycho 1 \n",
+ "1 Sunset Blvd. 1 \n",
+ "1 Vertigo 1 \n",
+ "1 On the Waterfront 1 \n",
+ "0 Forrest Gump 0 \n",
+ "0 The Sound of Music 0 \n",
+ "1 West Side Story 1 \n",
+ "2 Star Wars 2 \n",
+ "1 E.T. the Extra-Terrestrial 1 \n",
+ "2 2001: A Space Odyssey 2 \n",
+ "1 The Silence of the Lambs 1 \n",
+ "1 Chinatown 1 \n",
+ "2 The Bridge on the River Kwai 2 \n",
+ "0 Singin' in the Rain 0 \n",
+ "1 It's a Wonderful Life 1 \n",
+ "1 Some Like It Hot 1 \n",
+ "1 12 Angry Men 1 \n",
+ "2 Dr. Strangelove or: How I Learned to Stop Worr... 2 \n",
+ ".. ... ... \n",
+ "1 Rain Man 1 \n",
+ "0 Annie Hall 0 \n",
+ "0 Out of Africa 0 \n",
+ "0 Good Will Hunting 0 \n",
+ "0 Terms of Endearment 0 \n",
+ "1 Tootsie 1 \n",
+ "1 Fargo 1 \n",
+ "0 Giant 0 \n",
+ "2 The Grapes of Wrath 2 \n",
+ "1 Shane 1 \n",
+ "1 The Green Mile 1 \n",
+ "2 Close Encounters of the Third Kind 2 \n",
+ "0 Network 0 \n",
+ "0 Nashville 0 \n",
+ "0 The Graduate 0 \n",
+ "1 American Graffiti 1 \n",
+ "1 Pulp Fiction 1 \n",
+ "2 The African Queen 2 \n",
+ "2 Stagecoach 2 \n",
+ "2 Mutiny on the Bounty 2 \n",
+ "1 The Maltese Falcon 1 \n",
+ "1 A Clockwork Orange 1 \n",
+ "1 Taxi Driver 1 \n",
+ "0 Wuthering Heights 0 \n",
+ "1 Double Indemnity 1 \n",
+ "1 Rebel Without a Cause 1 \n",
+ "1 Rear Window 1 \n",
+ "1 The Third Man 1 \n",
+ "1 North by Northwest 1 \n",
+ "0 Yankee Doodle Dandy 0 \n",
+ "\n",
+ " genre \n",
+ "1 [u' Crime', u' Drama'] \n",
+ "1 [u' Crime', u' Drama'] \n",
+ "2 [u' Biography', u' Drama', u' History'] \n",
+ "1 [u' Biography', u' Drama', u' Sport'] \n",
+ "2 [u' Drama', u' Romance', u' War'] \n",
+ "2 [u' Drama'] \n",
+ "0 [u' Drama', u' Romance', u' War'] \n",
+ "0 [u' Drama', u' Mystery'] \n",
+ "2 [u' Adventure', u' Family', u' Fantasy', u' Mu... \n",
+ "2 [u' Drama', u' Romance'] \n",
+ "2 [u' Adventure', u' Biography', u' Drama', u' H... \n",
+ "1 [u' Crime', u' Drama'] \n",
+ "1 [u' Horror', u' Mystery', u' Thriller'] \n",
+ "1 [u' Drama', u' Film-Noir'] \n",
+ "1 [u' Mystery', u' Romance', u' Thriller'] \n",
+ "1 [u' Crime', u' Drama'] \n",
+ "0 [u' Drama', u' Romance'] \n",
+ "0 [u' Biography', u' Drama', u' Family', u' Musi... \n",
+ "1 [u' Crime', u' Drama', u' Musical', u' Romance... \n",
+ "2 [u' Action', u' Adventure', u' Fantasy', u' Sc... \n",
+ "1 [u' Adventure', u' Family', u' Sci-Fi'] \n",
+ "2 [u' Mystery', u' Sci-Fi'] \n",
+ "1 [u' Crime', u' Drama', u' Thriller'] \n",
+ "1 [u' Drama', u' Mystery', u' Thriller'] \n",
+ "2 [u' Adventure', u' Drama', u' War'] \n",
+ "0 [u' Comedy', u' Musical', u' Romance'] \n",
+ "1 [u' Drama', u' Family', u' Fantasy'] \n",
+ "1 [u' Comedy'] \n",
+ "1 [u' Drama'] \n",
+ "2 [u' Comedy', u' War'] \n",
+ ".. ... \n",
+ "1 [u' Drama'] \n",
+ "0 [u' Comedy', u' Drama', u' Romance'] \n",
+ "0 [u' Biography', u' Drama', u' Romance'] \n",
+ "0 [u' Drama'] \n",
+ "0 [u' Comedy', u' Drama'] \n",
+ "1 [u' Comedy', u' Drama', u' Romance'] \n",
+ "1 [u' Crime', u' Drama', u' Thriller'] \n",
+ "0 [u' Drama', u' Romance'] \n",
+ "2 [u' Drama'] \n",
+ "1 [u' Drama', u' Romance', u' Western'] \n",
+ "1 [u' Crime', u' Drama', u' Fantasy', u' Mystery'] \n",
+ "2 [u' Drama', u' Sci-Fi'] \n",
+ "0 [u' Drama'] \n",
+ "0 [u' Drama', u' Music'] \n",
+ "0 [u' Comedy', u' Drama', u' Romance'] \n",
+ "1 [u' Comedy', u' Drama'] \n",
+ "1 [u' Crime', u' Drama', u' Thriller'] \n",
+ "2 [u' Adventure', u' Romance', u' War'] \n",
+ "2 [u' Adventure', u' Western'] \n",
+ "2 [u' Adventure', u' Drama', u' History'] \n",
+ "1 [u' Drama', u' Film-Noir', u' Mystery'] \n",
+ "1 [u' Crime', u' Drama', u' Sci-Fi'] \n",
+ "1 [u' Crime', u' Drama'] \n",
+ "0 [u' Drama', u' Romance'] \n",
+ "1 [u' Crime', u' Drama', u' Film-Noir', u' Thril... \n",
+ "1 [u' Drama'] \n",
+ "1 [u' Mystery', u' Thriller'] \n",
+ "1 [u' Film-Noir', u' Mystery', u' Thriller'] \n",
+ "1 [u' Mystery', u' Thriller'] \n",
+ "0 [u' Biography', u' Drama', u' Musical'] \n",
+ "\n",
+ "[100 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# One row per film, indexed by its k-means label so `frame.loc[k]` selects cluster k.\n",
+ "cluster_labels = kmeans.labels_\n",
+ "films = {\n",
+ "    'title': titles,\n",
+ "    'synopsis': synopsis,\n",
+ "    'cluster': cluster_labels,\n",
+ "    'genre': genres\n",
+ "}\n",
+ "# `columns` keeps only these three keys, so the synopsis text is left out of the frame.\n",
+ "frame = pd.DataFrame(films, index=[cluster_labels], columns=['title', 'cluster', 'genre'])\n",
+ "frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1 38\n",
+ "2 34\n",
+ "0 28\n",
+ "Name: cluster, dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of films assigned to each cluster, largest cluster first.\n",
+ "cluster_sizes = frame['cluster'].value_counts()\n",
+ "cluster_sizes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top terms per cluster:\n",
+ "Cluster 0 words: marry, love, father, family, home, marriage,\n",
+ "Cluster 0 titles: Gone with the Wind, Citizen Kane, Forrest Gump, The Sound of Music, Singin' in the Rain, Amadeus, A Streetcar Named Desire, The Philadelphia Story, An American in Paris, The Best Years of Our Lives, My Fair Lady, Doctor Zhivago, Braveheart, City Lights, The King's Speech, It Happened One Night, A Place in the Sun, Mr. Smith Goes to Washington, Annie Hall, Out of Africa, Good Will Hunting, Terms of Endearment, Giant, Network, Nashville, The Graduate, Wuthering Heights, Yankee Doodle Dandy,\n",
+ "Cluster 1 words: police, car, kill, apartment, say, murder,\n",
+ "Cluster 1 titles: The Godfather, The Shawshank Redemption, Raging Bull, The Godfather: Part II, Psycho, Sunset Blvd., Vertigo, On the Waterfront, West Side Story, E.T. the Extra-Terrestrial, The Silence of the Lambs, Chinatown, It's a Wonderful Life, Some Like It Hot, 12 Angry Men, Unforgiven, Rocky, To Kill a Mockingbird, The Apartment, Goodfellas, The Exorcist, The French Connection, Midnight Cowboy, Rain Man, Tootsie, Fargo, Shane, The Green Mile, American Graffiti, Pulp Fiction, The Maltese Falcon, A Clockwork Orange, Taxi Driver, Double Indemnity, Rebel Without a Cause, Rear Window, The Third Man, North by Northwest,\n",
+ "Cluster 2 words: kill, soldier, water, order, camp, officer,\n",
+ "Cluster 2 titles: Schindler's List, Casablanca, One Flew Over the Cuckoo's Nest, The Wizard of Oz, Titanic, Lawrence of Arabia, Star Wars, 2001: A Space Odyssey, The Bridge on the River Kwai, Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb, Apocalypse Now, Gandhi, The Lord of the Rings: The Return of the King, Gladiator, From Here to Eternity, Saving Private Ryan, Raiders of the Lost Ark, Ben-Hur, Patton, Jaws, The Good, the Bad and the Ugly, Butch Cassidy and the Sundance Kid, The Treasure of the Sierra Madre, Platoon, High Noon, Dances with Wolves, The Pianist, The Deer Hunter, All Quiet on the Western Front, The Grapes of Wrath, Close Encounters of the Third Kind, The African Queen, Stagecoach, Mutiny on the Bounty,\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Top terms per cluster:\")\n",
+ "# For every centroid, term indices ordered from highest to lowest centroid weight.\n",
+ "order_centroids = np.asarray(kmeans.cluster_centers_).argsort()[:, ::-1]\n",
+ "cluster_names = []\n",
+ "for i, ranked_terms in enumerate(order_centroids):\n",
+ "    top_words = [terms[ind] for ind in ranked_terms[:6]]\n",
+ "    # The six top words, each followed by a space, double as the cluster's name.\n",
+ "    cluster_names.append(''.join(str(word) + \" \" for word in top_words))\n",
+ "    print((\"Cluster %d words:\" % i) + ''.join(' %s,' % word for word in top_words))\n",
+ "    cluster_titles = frame.loc[i]['title'].values.tolist()\n",
+ "    print((\"Cluster %d titles:\" % i) + ''.join(' %s,' % title for title in cluster_titles))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.decomposition import PCA as sklearnPCA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "# Project the TF-IDF vectors onto their first two principal components for plotting.\n",
+ "pca = sklearnPCA(n_components=2)\n",
+ "# .toarray() gives a plain ndarray; the np.matrix from .todense() is rejected by newer scikit-learn.\n",
+ "transformed = pd.DataFrame(pca.fit_transform(tfidf_matrix.toarray()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAG81JREFUeJzt3X+MZWV5B/Dvd5bFMAFW9gdIgbmX\ntksTyhrNTjY1aavNYoom7KrxB/Rql/pjQilpNlQrZkxRzE2spmFJqtFpjVmZaRBMLItiKG61P6xr\nGKJduhBkwTvLCpEB6oS66MrO0z/OneXeO+fcufee95zzvud8P8lk7tw5c++7Z2bPc973ed73pZlB\nRESqbazoBoiISPEUDERERMFAREQUDEREBAoGIiICBQMREYGCgYiIQMFARESgYCAiIgDOKLoBSTZv\n3mz1er3oZoiIBOWhhx56zsy2DPtz3gaDer2O+fn5opshIhIUkguj/JyGiURERMFAREQUDEREBAoG\nIiICBQMREYGCgYiIwFEwIHkVycdIHiV5c5/j3knSSE66eF8REXEjdTAguQ7A5wC8BcDlAK4leXnM\ncecA+EsAP0j7npLO3BxQrwNjY9HnubmiWyQiRXPRM9gB4KiZPWlmJwHcCWB3zHGfAvAZAL908J6V\nkMVFe24OmJoCFhYAs+jz1JQCgkjVuQgGFwF4quPr4+3nTiP5egCXmNk3HLxfJWR10Z6eBk6c6H7u\nxInoeRGpLhfBgDHP2elvkmMAbgPwV2u+EDlFcp7k/OLiooOmhSuri/axY8M9LyLV4CIYHAdwScfX\nFwN4uuPrcwBcAeC7JFsAfg/AgbgkspnNmNmkmU1u2TL0OkulktVFe2JiuOfjKOcgUj4ugsGDALaS\nvJTkmQCuAXBg5ZtmtmRmm82sbmZ1AIcA7DIzrULXh4uLdpxmExgf735ufDx6fhDKOYiUU+pgYGYv\nA7gRwP0AHgVwl5kdIXkryV1pX7+q0l60kzQawMwMUKsBZPR5ZiZ6fhDKOYiUE81s7aMKMDk5aVVf\nwnpuLrrIHjsW9QiazcEv2lkZG4t6BL1IYHk5//aISDeSD5nZ0HO5vN3PQKILf9EX/14TE9HQUNzz\nIhIuLUchQ8lq+EpEiqVgIENJm3MQET9pmEiG5uPwlYiko56BpDbqvAPNVxDxh3oGksrKvIOVctOV\neQdA/97DqD8nItlQaamkUq/HVxfVakCr5f7nRKS/UUtLNUwkqYy6bIbWSBLxi4KBJ0IdPx912Yys\nltsQkdEoGHgg5PV+Rp13oPkKIn5RMPBAyOv9jDrvQPMVRPyiBLIHtN6PiLiiBHLANH4uIkVTMPCA\nxs9FpGgKBh7Q+LmIFE3BwBONRjTZank56hFMT4dXZioi4dJyFJ7RMg0iUgT1DDwTcpmpiIRLwcAz\nWqZBRIqgYOAZlZmKSBEUDDxT5TLTUNdnEikDBQPPVLXMNOT1mUTKQMHAQ51lpq1W+QMBMFriXD0J\nEXdUWipeGDZxrhJcEbfUMxAvDJs4VwmuiFsKBuKFYRPnKsEVcUvBQFJzMXY/bOJcJbgibikYSCou\nq4CGSZxXuQRXJAsKBpJKUWP3VS3BFcmKgoFHQiyVLHLsvooluCJZUTDwRKiTrjR2L1IOCgaeCLVU\nUmP3IuWgYOCJUEslNXYvUg6ageyJiYloaCjued81Grr4i4ROPQNPaLhFRIqkYOAJDbeISJE0TOQR\nDbeISFHUMxAREQUDERFRMJCUQpw1LSKrOQkGJK8i+RjJoyRvjvn+TSQfIXmY5EGSNRfvK8UKdda0\niKyWOhiQXAfgcwDeAuByANeSvLznsB8CmDSz1wL4GoDPpH3fqvPhjjzUWdMispqLnsEOAEfN7Ekz\nOwngTgC7Ow8ws++Y2cpl4xCAix28b2X5ck
ce6qxpEVnNRTC4CMBTHV8fbz+X5AMAvhX3DZJTJOdJ\nzi8uLjpoWjn5ckeuRepEysNFMGDMcxZ7IPleAJMAPhv3fTObMbNJM5vcsmWLg6aVky935Jo1LVIe\nLoLBcQCXdHx9MYCnew8ieSWAaQC7zOxXDt63sny5I9esaZHycBEMHgSwleSlJM8EcA2AA50HkHw9\ngC8iCgTPOnjPSkt7R+4y+awNZkTKIXUwMLOXAdwI4H4AjwK4y8yOkLyV5K72YZ8FcDaAu0n+iOSB\nhJeTAaS5I/cl+SwifqFZ7PB+4SYnJ21+fr7oZpROvR6/VHatFt3Zi0jYSD5kZpPD/pxmIFeML8ln\nEfGLgkHF+JJ8FhG/KBhUTB7loD7MjhaR4SgYVEzW5aBKUIuESQlkcUoJaj/NPTyH6YPTOLZ0DBMb\nJtDc2URjm+qAy2jUBLJ2OhOnlKD2z9zDc5i6dwonfh2tYbKwtICpe6cAQAFBTtMwkTilBLV/pg9O\nnw4EK078+gSmDw62mNXcw3Oo76tj7JNjqO+rY+5hjfmVkYKBOKX1ivxzbCm+W5b0fKeVXsXC0gIM\ndrpXoYBQPgoGBShztY3WK/LPxIb4blnS853S9iokHAoGOatCtc1a6xWVORj6qLmzifH13d218fXj\naO5cu7uWplchYVEwyJkvexEUpQrB0DeNbQ3MXD2D2oYaCKK2oYaZq2cGSh6n6VVIWBQMclb1apu8\ng2HVkp9Jva7GtgZae1tYvmUZrb2tgauI0vQqJCwKBjnzqdqmiOGaPINhmZOfcUEui15Xml6FhEWT\nznK28h+28+6YBK6/Hvj857N7z+np6II7MfFKZU9vO8bHs0/2Jk1K2/SmOZy92+2kqPq+OhaWVr/Z\npjNqeG66leq1i9Q7bwCI7tbPemAGz3939TnThL9qGXXSmYJBAW64AfjCF6K7txVZXYjjgs/4OHDW\nWcDzz68+PusLR1x71m+fA3dN4aR1X9zS3oGOfXIMFrcDqxGzW5eDrXBKCnL4eQ3Y11r1NBkl86Ua\ntIR1QO67rzsQANmNmyeN0ccFAiD73EVc6em5b5/uCgSAm/LFxCTn0kTQCfvESp4N8c9rwp8MQsGg\nAHmOmw/7mnlcOHpLT194OZvyxebOJnCyZwbcyXHgYDPohH1SkNu0fkIT/mRkCgYFyDOJnPSamzb5\nM1M4q/LFxrYGNv3XTDR8Yow+3zsDPNwI+m45qcLn9l3N070ubJvDug/XceKvxzC92F1FVbUKKxmM\nFqorQLMZP46fxYU46b1uvz163JtYLmIcvbmzGZsQHbZ8MW5lzts/2MDUVCOXc52XlTxK7Cqk2wC8\nNnlhOgBatE5iKYFckLgKn6wuxHm+16jSLrGcVGEzc/UMcLjh/b/fpaQEc21DDQASv9fa28q6aZID\nVRNJpfW7AFblIrcS9BeuGwO4+v81QQCIrbAiiOVbVHJUBqomkkqr+ho6nRPOsJScg9HyEpJEwUBK\noeoXua4S4oOrq6hWcjBaXkKSKBhIKVT9ItdVKvtwI6qaaldRdS4hEeryEqqAyp5yBhUWQmJ5GIMm\nocu4H3CZ957uVxwQ+u8tC0ogy1CSlqko+0Y0Zb2wlPn3qeKA4SiBLEOp6r4KZd25q8w7zFW9OCAv\nCgYVFcq+Cq7Hist8YVlrh7lQVb04IC8KBp7Las+BLJbEcN3WLPYj0IUlPFUvDsiLgoHHstwistl0\nuzZRXFvf/35g8+bRg0MWQzq6sIQn1Aqo0CiB7LGsK0RcVhMltbXTsAnNpP0I0s6WLWM1kcgKVROV\n0NjY6n0PAD83K0lqa69hApmqSESGp2qiEvJpv+S1DNqmhYXB8woa0hHJj4KBx1yP62cprq1xyMFz\nIBorjmj2reTCzLz82L59uwVldtasVjMjo8+zsz6/bCY627ppk9n69WbRZT/6ILu/Xvmo1Ypuub9m\nD8/aeH
Pc8Amc/hhvjtvsYY//EHrMHp612m014ydotdtqQbU9Ly7PEYB5G+Gaq5zBqDqzrxs3Ai++\nCJw8+cr3yzL9M4XeBHVSgtnHHIgvQs+blHXGt0uuz5ESyHmKm/sfpwwLwzhU5vVzspJVRVVeQg9m\neXB9jpRAzlPcWg5xfJvO69AoE8xCyoEAfozVhz5Jrswzvl3x5RwpGIyiz0V+Dteijp9gDKdQHzvm\nbMawT0adDBfS+jlZzH4eRegVVUlBy2BKhrf5EvCdBAOSV5F8jORRkjfHfP9VJL/a/v4PSNZdvG9h\nEuoo53AtpvAPWEAdhjEsnLrY2Yxhn6RZ5C6U9XN8WdAu9IqquGC2oqgA6xtfAn7qnAHJdQB+DODN\nAI4DeBDAtWb2SMcxNwB4rZldT/IaAG83s/f0e93gcgbr16N+6gksLF+y6vCyjYmHNBluVKGP1ftk\nZcZ33Lg4oPwB4HZWfJE5gx0AjprZk2Z2EsCdAHb3HLMbwP72468B2EmSDt67GHHjHV/+Mo7Z6kAA\nlC91ENJkuFH50nUvg8a2Blp7WyDi/8srf/DKOVq+ZRmtva1Cen4ugsFFAJ7q+Pp4+7nYY8zsZQBL\nADY5eO/+slryE4gd73Bxkcyyya74lAjOKsnrS9fdFy7OswKs31wEg7hw39u/HuQYkJwiOU9yfnFx\nMV2rslzyM0Hai2QBTR6JL4ngLJO8Po3VF13V5Oo8K8B6bpSZap0fAN4A4P6Orz8G4GM9x9wP4A3t\nx2cAeA7tfEXSR+oZyLVaIdNd08wYTttk32cru56JWrut1jUzd+WjdlvNTYM94MMMZJfnObTZyKG1\n16zAGcgkz0CUQN4J4KeIEsh/YmZHOo75CwDb7JUE8jvM7N39Xjd1AjnALGeaJo+0B67LNazXkMVM\n1CokeX2YtFWF8xwn1NnThSWQLcoB3Ijo7v9RAHeZ2RGSt5Lc1T7sSwA2kTwK4CYAq8pPnQswy5mm\nyUOXe+Y8JpVFqWYVxqB9mJCUdD43nrWx8El5WfKlvDgvTuYZmNl9ZnaZmf2WmTXbz/2NmR1oP/6l\nmb3LzH7bzHaY2ZMu3rcvn7KcA0rT5KH3NE4zWWAEWVzUhh2DLnrsfRQ+BLy487x+bD1ePPli4ZPy\ngOx+rz4E4jyVdwayL1nOIaRp8tC9iqGjx2rDVD5lcVEbJsnry4ziYfmQdI07z+e+6lycPHWy67gi\n7pqz/L36EIjzpIXqQtUz3j/31llM7f/9wXMGKVeNGzZHUfT4qw9j78PonIS08ayNAIAXXnrBm206\nfckjrPV7TTOZq+i/2VFpoboqiRnvb+z/Y8zs+c/BexUph9GGHWUqulQzpC5/793u8y89j5defgl3\nvOOOwiYk9fLlrrnf7zVtr6Hov9m8qWcQIldrQaeoJgqtWCvrnoHL5QRC6MX4ctfc71wB8P48ZkE9\ngypxMN4PINWqcaEVa2U59u563DqEXowvd839fq8hnEefKBiEyIMrcWjFWllevFyXIPoyBLMWH9bT\n6fd7DeU8+kLBIESDXIkzXuQowGKtzC5eru9AfaggCknS71XncTgKBiFa60qc04SyUPYmyJrrO9A0\nvZgQ51JkxZehrFAogVxG2mw4V74kU31phxRLCWR5hasEswzElzvQqi2fIG4pGIRk0DyABwnmqvEh\nmarqGUlDwSAUw+QBQiv1ESdUPSNpKBiEYpgpvz6U+oSwZVvJqHpG0lACORRJW0b7OOV3pM0VxAWX\nM6ElTKMmkBUMQjA3B7zvffHrP/hYIaRqJpHCqJqozKankxcC8jEPoGomkeAoGIQg6SJq5uewi6qZ\nRIKjYBCCpItorZZvOwalaiaR4CgYhGDIi2vhhTxpq5kK/weIVJCZefmxfft2kw6zs2a1mhkZfZ6d\nTTxsfNwsGkOKPsbHEw/3T/D/AJFiAZi3Ea65qibyQYpNZnoFX8gT/D9A
pFiqJgqV4xVGXRXyFDZS\no0okkUIoGBRt2M2E1zBMIU/SBT+nFbAHb2i/50XECQWDoiXd8S4sjHRLPmiuud8F33F8Go4qkUQK\noWBQtH53vCPckg9ayNPvgn9sIT6PlPS8Uz6sqyRSQQoGRYu7E+40wi35IDuQ9Ruan1j309jv9T6f\nWV5BW6gNRbubiQsKBkXrvBNOMuKQUT/9huabpz6Kcfyi6/lx/ALNUx89/XWheQU5bWV3s4WlBRgM\nC0sLmLp3SgFBhqZg4IOVO+G1AoLDq22/oflG7XuYwYdQQwvEMmpoYQYfQqP2vdPHFppXkNOK3t1M\nvZLyUDDwSQZDRkn6Ds03m2iM34MWLsUy1qGFS9EYv6criVvVClDfLn5F7m6mXkm5KBj4ZNAhI0eD\n9IlD8wMkcatYAerjxa/I3c2K7pWIWwoGvhlkyCiPQfo1krhVrAD18eJX5O5m2nO5XBQMfLXWkBFQ\n6CB9FStAfbz4NbY1MHP1DGobaiCI2oYaZq6eyWV3szx6Jb4Ny5WZ1ibyWeeaRUm/Jx+3vSyp+r46\nFpZWr5tU21BDa28r/wYVbGXYrLO3NL5+3Fkwyvr1y0prE5VR51BN0rBRmQfpPaMN57tl3SvxcViu\nzM4ougEyoGYzfpP5Mg/Se2blIqcN51/R2NbI7N/v47BcmalnEIpRB+nzWH60QpvRNLY10NrbwvIt\ny2jtbVU6EGStyEqpKlIwCMmwyzTkMU1YU5ElIxqWy5cSyGWWx0Yx2oxGMjT38JyG5YY0agJZwaDM\nyOTvufq9j43Fv9YaVU4ON3cTkQ6FVBOR3EjyAZKPtz+fF3PM60h+n+QRkodJvifNe8oQ1q0b7vlR\njDAV2eXIkurQRdxImzO4GcBBM9sK4GD7614nAPypmf0ugKsA7CP56pTvK4M4dSr5eVcJ3xGmIrta\n5M7H5SFEQpU2GOwGsL/9eD+At/UeYGY/NrPH24+fBvAsgC0p31cGkTQ3gXSX8B2hysnVIneqQxdx\nJ20wuMDMngGA9ufz+x1McgeAMwE8kfJ9ZRBxd+3k6jH+tMtaDFnl5GqRO9Whi7izZjAg+W2S/xPz\nsXuYNyJ5IYA7APyZmcVmFklOkZwnOb+4uDjMy0ucuLv2pMRxjmtPu1rkTnXoIu6sGQzM7EozuyLm\n4x4AP2tf5Fcu9s/GvQbJcwF8E8DHzexQn/eaMbNJM5vcskUjSanFlex4sKyFq0XuVIcu4k7aYaID\nAPa0H+8BcE/vASTPBPB1AF8xs7tTvp8MKqlk561v9WLtaRfbHBe5YqdI2aSaZ0ByE4C7AEwAOAbg\nXWb2AslJANeb2QdJvhfAlwEc6fjR68zsR/1eW/MMUuo3GazZVJG/SElp0pl0G3EymJc0Q01kYFrC\nWrqVZV9KrX0kkgsFg7Iqy76UrmaoiUhfCgZlVZZ9KV3NUBORvrS5TZk1GuFd/HtNTMQnwkMb7hLx\nnHoG4jePhru0KJ6UmYKB+M2T4S4tiidlp9JSkQHU99WxsLR6uKq2oYbW3lb+DRJJoNJSkQxpUTwp\nOwUDkQFoUTwpOwUDkQFoUTwpOwUDkQFoUTwpOyWQQ6Y1e0Skx6gJZE06C9XKmj0rSzWsrNkDKCCI\nyNA0TBSqrNfsmZuLlsEeG4s+a2E4kVJTzyBUWa7Zo16HSOWoZxCqLJeo1kqhIpWjYBCqLNfs0Uqh\n2dHwm3hKwSBUWa7ZU5aNcXyjjXrEYyotldV6cwZA1OsIcT8En/Tbl7rVyrs1UlJam0jijTIs4clK\noaWj4TfxmIJBmaUZlmg0orvV5eXo87CBQGPjq2n4TTymYFBmRVUFaWw8nkcb9Yj0UjAos6KGJVSa\nGk/Db+IxTTors6L2D9bYeLIy7EstpaSeQVo+j40XNSyhsXGR4CgYpOH72HhRwxIaGxcJjuYZpKG6\n8WRaXlukEKPOM1AwSGNsLOoR9CKj
kkwRkZxp0lkRNDYuIiWhYJCGxsZFpCQUDNJQ3biIlITmGaSl\nunERKQH1DERERMFAREQUDEREBAoGIiICBQMREYGCgYiIQMFARESgYCAiIkgZDEhuJPkAycfbn8/r\nc+y5JH9K8u/TvKek4PPeCyJSqLQ9g5sBHDSzrQAOtr9O8ikA/5by/WRUvu+9ICKFShsMdgPY3368\nH8Db4g4iuR3ABQD+JeX7yai0L7GI9JE2GFxgZs8AQPvz+b0HkBwD8HcAPrLWi5GcIjlPcn5xcTFl\n06SL9iUWkT7WXKiO5LcBvCbmW4PeUt4A4D4ze4pk3wPNbAbADBBtbjPg68sgJibid2XT3gsiggGC\ngZldmfQ9kj8jeaGZPUPyQgDPxhz2BgB/QPIGAGcDOJPk/5lZv/yCuNZsRjmCzqEi7b0gIm1ph4kO\nANjTfrwHwD29B5hZw8wmzKwO4MMAvqJAUADtvSAifaTdz+DTAO4i+QEAxwC8CwBITgK43sw+mPL1\nxSXtvSAiCWhxG7p7YHJy0ubn54tuhohIUEg+ZGaTw/6cZiCLiIiCgYiIKBiIiAgUDEREBAoGIiIC\nBQMREYGCgYiIwON5BiQXAcQsplMJmwE8V3QjPKLz0U3no5vOR7ffMbNzhv2htDOQM2NmW4puQ1FI\nzo8yaaSsdD666Xx00/noRnKk2boaJhIREQUDERFRMPDVTNEN8IzORzedj246H91GOh/eJpBFRCQ/\n6hmIiIiCgQ9IbiT5AMnH25/PiznmdSS/T/IIycMk31NEW7NC8iqSj5E8SnLV5kckX0Xyq+3v/4Bk\nPf9W5meA83ETyUfafwsHSdaKaGde1jofHce9k6S191QprUHOB8l3t/9GjpD8pzVf1Mz0UfAHgM8A\nuLn9+GYAfxtzzGUAtrYf/waAZwC8uui2O/r3rwPwBIDfBHAmgP8GcHnPMTcA+EL78TUAvlp0uws+\nH38EYLz9+M+rfj7ax50D4N8BHAIwWXS7C/772ArghwDOa399/lqvq56BH3YD2N9+vB/A23oPMLMf\nm9nj7cdPI9pvuixzMXYAOGpmT5rZSQB3IjonnTrP0dcA7CTJHNuYpzXPh5l9x8xWNrQ+BODinNuY\np0H+PgDgU4hurH6ZZ+MKMMj5+BCAz5nZ/wKAmcXtT99FwcAPF5jZMwDQ/nx+v4NJ7kB0R/BEDm3L\nw0UAnur4+nj7udhjzOxlAEsANuXSuvwNcj46fQDAtzJtUbHWPB8kXw/gEjP7Rp4NK8ggfx+XAbiM\n5PdIHiJ51Vov6u0M5LIh+W0Ar4n51vSQr3MhgDsA7DGzZRdt80DcHX5vmdsgx5TFwP9Wku8FMAng\njZm2qFh9zwfJMQC3AbgurwYVbJC/jzMQDRW9CVGv8T9IXmFmP096UQWDnJjZlUnfI/kzkhea2TPt\ni31sl47kuQC+CeDjZnYoo6YW4TiASzq+vhjA0wnHHCd5BoANAF7Ip3m5G+R8gOSViG4m3mhmv8qp\nbUVY63ycA+AKAN9tjxy+BsABkrvMrIwbqQ/6/+WQmf0awE9IPoYoODyY9KIaJvLDAQB72o/3ALin\n9wCSZwL4OoCvmNndObYtDw8C2Ery0va/8xpE56RT5zl6J4B/tXZmrITWPB/tYZEvAtg1yHhw4Pqe\nDzNbMrPNZlY3szqiHEpZAwEw2P+Xf0ZUZACSmxENGz3Z70UVDPzwaQBvJvk4gDe3vwbJSZL/2D7m\n3QD+EMB1JH/U/nhdMc11q50DuBHA/QAeBXCXmR0heSvJXe3DvgRgE8mjAG5CVHVVSgOej88COBvA\n3e2/hd6LQWkMeD4qY8DzcT+A50k+AuA7AD5iZs/3e13NQBYREfUMREREwUBERKBgICIiUDAQEREo\nGIiICBQMREQECgYiIgIFAxERAfD/fu31SOJHH2oAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "colors = ['red', 'blue', 'green', 'yellow', 'black', 'gray', 'orange', 'brown']\n",
+ "fig, ax = plt.subplots()\n",
+ "for i, cluster_name in enumerate(cluster_names):\n",
+ "    members = transformed[kmeans.labels_ == i]\n",
+ "    # Cycle through the palette so having more clusters than colours cannot raise IndexError.\n",
+ "    ax.scatter(members[0], members[1], label=cluster_name, c=colors[i % len(colors)])\n",
+ "\n",
+ "ax.set_xlabel('PC 1')\n",
+ "ax.set_ylabel('PC 2')\n",
+ "ax.set_title('Film synopses by k-means cluster (2-D PCA projection)')\n",
+ "# Without a legend the `label=` values passed to scatter are never shown.\n",
+ "ax.legend(title='Top terms per cluster')\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}