diff --git a/Untitled31.ipynb b/Untitled31.ipynb new file mode 100644 index 0000000..39e2991 --- /dev/null +++ b/Untitled31.ipynb @@ -0,0 +1,852 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import nltk\n", + "import re\n", + "import os\n", + "from sklearn import feature_extraction\n", + "import pickle\n", + "import scipy\n", + "from nltk.stem import WordNetLemmatizer\n", + "wnl = WordNetLemmatizer()\n", + "from nltk import word_tokenize\n", + "from nltk.corpus import stopwords" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "100 100 100\n" + ] + } + ], + "source": [ + "def load_pickle(path):\n", + "    \"\"\"Return the object unpickled from `path`, closing the file afterwards.\"\"\"\n", + "    # pickle can run arbitrary code while loading: only open trusted files.\n", + "    with open(path, 'rb') as handle:\n", + "        return pickle.load(handle)\n", + "\n", + "titles = load_pickle('data/titles.pkl')\n", + "genres = load_pickle('data/genres.pkl')\n", + "synopsis = load_pickle('data/synopses.pkl')\n", + "print(len(titles), len(genres), len(synopsis))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Plot [edit] [ [ edit edit ] ] \n", + " On the day of his only daughter's wedding, Vito Corleone hears requests in his role as the Godfather, the Don of a New York crime family. Vito's youngest son, Michael, in a Marine Corps uniform, introduces his girlfriend, Kay Adams, to his family at the sprawling reception. Vito's godson Johnny Fontane, a popular singer, pleads for help in securing a coveted movie role, so Vito dispatches his consigliere, Tom Hagen, to Los Angeles to influence the abrasive studio head, Jack Woltz. Woltz is unmoved until the morning he wakes up in bed with the severed head of his prized stallion. 
On the day of his only daughter's wedding, Vito Corleone Vito Corleone hears requests in his role as the Godfather, the Don Don of a New York crime family. Vito's youngest son, Michael Michael , in a Marine Corps Marine Corps uniform, introduces his girlfriend, Kay Adams Kay Adams , to his family at the sprawling reception. Vito's godson Johnny\n" + ] + } + ], + "source": [ + "print(synopsis[0][:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def preprocess_data(text):\n", + "    \"\"\"Normalise one synopsis before TF-IDF vectorisation.\n", + "\n", + "    Drops words tagged as proper nouns (NNP/NNPS), keeps only runs of word\n", + "    characters, lowercases, removes NLTK English stop words and lemmatises\n", + "    every remaining token as a verb.\n", + "\n", + "    Returns the surviving tokens joined by single spaces.\n", + "    \"\"\"\n", + "    # Tag the words in their original case (before any lowercasing).\n", + "    tagged_sentence = nltk.tag.pos_tag(text.split())\n", + "    kept = ' '.join(word for word, tag in tagged_sentence\n", + "                    if tag not in ('NNP', 'NNPS'))\n", + "\n", + "    tokens = nltk.RegexpTokenizer(r'\\w+').tokenize(kept)\n", + "\n", + "    # Lowercase BEFORE the stop-word test: the stop list is lowercase, so a\n", + "    # capitalised word such as 'On' would otherwise slip through the filter.\n", + "    stop = set(stopwords.words('english'))\n", + "    tokens = [token.lower() for token in tokens]\n", + "    tokens = [token for token in tokens if token not in stop]\n", + "\n", + "    return ' '.join(wnl.lemmatize(token, pos='v') for token in tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'edit edit on day daughter wed hear request role crime family youngest son uniform introduce girlfriend family sprawl reception godson popular singer plead help secure covet movie role dispatch consigliere influence abrasive studio head unmoved morning wake bed sever head prize stallion on day daughter wed hear request role crime family youngest son uniform introduce girlfriend family sprawl reception godson'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + " preprocess_data(synopsis[0][:1000])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ 
+ { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 19.3 s\n", + "(100, 453)\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=20000, min_df=0.2,\n", + "                                   stop_words='english', preprocessor=preprocess_data,\n", + "                                   use_idf=True, ngram_range=(1, 1))\n", + "\n", + "%time tfidf_matrix = tfidf_vectorizer.fit_transform(synopsis)\n", + "\n", + "print(tfidf_matrix.shape)\n", + "# get_feature_names() is gone from recent scikit-learn releases; use its\n", + "# replacement when available and fall back to the old name otherwise.\n", + "if hasattr(tfidf_vectorizer, 'get_feature_names_out'):\n", + "    terms = tfidf_vectorizer.get_feature_names_out()\n", + "else:\n", + "    terms = tfidf_vectorizer.get_feature_names()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# .toarray() gives a plain ndarray; .todense() returns np.matrix, which recent\n", + "# scikit-learn releases refuse as estimator input.\n", + "kmeans = KMeans(n_clusters=3, random_state=0).fit(tfidf_matrix.toarray())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleclustergenre
1The Godfather1[u' Crime', u' Drama']
1The Shawshank Redemption1[u' Crime', u' Drama']
2Schindler's List2[u' Biography', u' Drama', u' History']
1Raging Bull1[u' Biography', u' Drama', u' Sport']
2Casablanca2[u' Drama', u' Romance', u' War']
2One Flew Over the Cuckoo's Nest2[u' Drama']
0Gone with the Wind0[u' Drama', u' Romance', u' War']
0Citizen Kane0[u' Drama', u' Mystery']
2The Wizard of Oz2[u' Adventure', u' Family', u' Fantasy', u' Mu...
2Titanic2[u' Drama', u' Romance']
2Lawrence of Arabia2[u' Adventure', u' Biography', u' Drama', u' H...
1The Godfather: Part II1[u' Crime', u' Drama']
1Psycho1[u' Horror', u' Mystery', u' Thriller']
1Sunset Blvd.1[u' Drama', u' Film-Noir']
1Vertigo1[u' Mystery', u' Romance', u' Thriller']
1On the Waterfront1[u' Crime', u' Drama']
0Forrest Gump0[u' Drama', u' Romance']
0The Sound of Music0[u' Biography', u' Drama', u' Family', u' Musi...
1West Side Story1[u' Crime', u' Drama', u' Musical', u' Romance...
2Star Wars2[u' Action', u' Adventure', u' Fantasy', u' Sc...
1E.T. the Extra-Terrestrial1[u' Adventure', u' Family', u' Sci-Fi']
22001: A Space Odyssey2[u' Mystery', u' Sci-Fi']
1The Silence of the Lambs1[u' Crime', u' Drama', u' Thriller']
1Chinatown1[u' Drama', u' Mystery', u' Thriller']
2The Bridge on the River Kwai2[u' Adventure', u' Drama', u' War']
0Singin' in the Rain0[u' Comedy', u' Musical', u' Romance']
1It's a Wonderful Life1[u' Drama', u' Family', u' Fantasy']
1Some Like It Hot1[u' Comedy']
112 Angry Men1[u' Drama']
2Dr. Strangelove or: How I Learned to Stop Worr...2[u' Comedy', u' War']
............
1Rain Man1[u' Drama']
0Annie Hall0[u' Comedy', u' Drama', u' Romance']
0Out of Africa0[u' Biography', u' Drama', u' Romance']
0Good Will Hunting0[u' Drama']
0Terms of Endearment0[u' Comedy', u' Drama']
1Tootsie1[u' Comedy', u' Drama', u' Romance']
1Fargo1[u' Crime', u' Drama', u' Thriller']
0Giant0[u' Drama', u' Romance']
2The Grapes of Wrath2[u' Drama']
1Shane1[u' Drama', u' Romance', u' Western']
1The Green Mile1[u' Crime', u' Drama', u' Fantasy', u' Mystery']
2Close Encounters of the Third Kind2[u' Drama', u' Sci-Fi']
0Network0[u' Drama']
0Nashville0[u' Drama', u' Music']
0The Graduate0[u' Comedy', u' Drama', u' Romance']
1American Graffiti1[u' Comedy', u' Drama']
1Pulp Fiction1[u' Crime', u' Drama', u' Thriller']
2The African Queen2[u' Adventure', u' Romance', u' War']
2Stagecoach2[u' Adventure', u' Western']
2Mutiny on the Bounty2[u' Adventure', u' Drama', u' History']
1The Maltese Falcon1[u' Drama', u' Film-Noir', u' Mystery']
1A Clockwork Orange1[u' Crime', u' Drama', u' Sci-Fi']
1Taxi Driver1[u' Crime', u' Drama']
0Wuthering Heights0[u' Drama', u' Romance']
1Double Indemnity1[u' Crime', u' Drama', u' Film-Noir', u' Thril...
1Rebel Without a Cause1[u' Drama']
1Rear Window1[u' Mystery', u' Thriller']
1The Third Man1[u' Film-Noir', u' Mystery', u' Thriller']
1North by Northwest1[u' Mystery', u' Thriller']
0Yankee Doodle Dandy0[u' Biography', u' Drama', u' Musical']
\n", + "

100 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " title cluster \\\n", + "1 The Godfather 1 \n", + "1 The Shawshank Redemption 1 \n", + "2 Schindler's List 2 \n", + "1 Raging Bull 1 \n", + "2 Casablanca 2 \n", + "2 One Flew Over the Cuckoo's Nest 2 \n", + "0 Gone with the Wind 0 \n", + "0 Citizen Kane 0 \n", + "2 The Wizard of Oz 2 \n", + "2 Titanic 2 \n", + "2 Lawrence of Arabia 2 \n", + "1 The Godfather: Part II 1 \n", + "1 Psycho 1 \n", + "1 Sunset Blvd. 1 \n", + "1 Vertigo 1 \n", + "1 On the Waterfront 1 \n", + "0 Forrest Gump 0 \n", + "0 The Sound of Music 0 \n", + "1 West Side Story 1 \n", + "2 Star Wars 2 \n", + "1 E.T. the Extra-Terrestrial 1 \n", + "2 2001: A Space Odyssey 2 \n", + "1 The Silence of the Lambs 1 \n", + "1 Chinatown 1 \n", + "2 The Bridge on the River Kwai 2 \n", + "0 Singin' in the Rain 0 \n", + "1 It's a Wonderful Life 1 \n", + "1 Some Like It Hot 1 \n", + "1 12 Angry Men 1 \n", + "2 Dr. Strangelove or: How I Learned to Stop Worr... 2 \n", + ".. ... ... \n", + "1 Rain Man 1 \n", + "0 Annie Hall 0 \n", + "0 Out of Africa 0 \n", + "0 Good Will Hunting 0 \n", + "0 Terms of Endearment 0 \n", + "1 Tootsie 1 \n", + "1 Fargo 1 \n", + "0 Giant 0 \n", + "2 The Grapes of Wrath 2 \n", + "1 Shane 1 \n", + "1 The Green Mile 1 \n", + "2 Close Encounters of the Third Kind 2 \n", + "0 Network 0 \n", + "0 Nashville 0 \n", + "0 The Graduate 0 \n", + "1 American Graffiti 1 \n", + "1 Pulp Fiction 1 \n", + "2 The African Queen 2 \n", + "2 Stagecoach 2 \n", + "2 Mutiny on the Bounty 2 \n", + "1 The Maltese Falcon 1 \n", + "1 A Clockwork Orange 1 \n", + "1 Taxi Driver 1 \n", + "0 Wuthering Heights 0 \n", + "1 Double Indemnity 1 \n", + "1 Rebel Without a Cause 1 \n", + "1 Rear Window 1 \n", + "1 The Third Man 1 \n", + "1 North by Northwest 1 \n", + "0 Yankee Doodle Dandy 0 \n", + "\n", + " genre \n", + "1 [u' Crime', u' Drama'] \n", + "1 [u' Crime', u' Drama'] \n", + "2 [u' Biography', u' Drama', u' History'] \n", + "1 [u' Biography', u' Drama', u' Sport'] \n", + "2 [u' Drama', u' 
Romance', u' War'] \n", + "2 [u' Drama'] \n", + "0 [u' Drama', u' Romance', u' War'] \n", + "0 [u' Drama', u' Mystery'] \n", + "2 [u' Adventure', u' Family', u' Fantasy', u' Mu... \n", + "2 [u' Drama', u' Romance'] \n", + "2 [u' Adventure', u' Biography', u' Drama', u' H... \n", + "1 [u' Crime', u' Drama'] \n", + "1 [u' Horror', u' Mystery', u' Thriller'] \n", + "1 [u' Drama', u' Film-Noir'] \n", + "1 [u' Mystery', u' Romance', u' Thriller'] \n", + "1 [u' Crime', u' Drama'] \n", + "0 [u' Drama', u' Romance'] \n", + "0 [u' Biography', u' Drama', u' Family', u' Musi... \n", + "1 [u' Crime', u' Drama', u' Musical', u' Romance... \n", + "2 [u' Action', u' Adventure', u' Fantasy', u' Sc... \n", + "1 [u' Adventure', u' Family', u' Sci-Fi'] \n", + "2 [u' Mystery', u' Sci-Fi'] \n", + "1 [u' Crime', u' Drama', u' Thriller'] \n", + "1 [u' Drama', u' Mystery', u' Thriller'] \n", + "2 [u' Adventure', u' Drama', u' War'] \n", + "0 [u' Comedy', u' Musical', u' Romance'] \n", + "1 [u' Drama', u' Family', u' Fantasy'] \n", + "1 [u' Comedy'] \n", + "1 [u' Drama'] \n", + "2 [u' Comedy', u' War'] \n", + ".. ... 
\n", + "1 [u' Drama'] \n", + "0 [u' Comedy', u' Drama', u' Romance'] \n", + "0 [u' Biography', u' Drama', u' Romance'] \n", + "0 [u' Drama'] \n", + "0 [u' Comedy', u' Drama'] \n", + "1 [u' Comedy', u' Drama', u' Romance'] \n", + "1 [u' Crime', u' Drama', u' Thriller'] \n", + "0 [u' Drama', u' Romance'] \n", + "2 [u' Drama'] \n", + "1 [u' Drama', u' Romance', u' Western'] \n", + "1 [u' Crime', u' Drama', u' Fantasy', u' Mystery'] \n", + "2 [u' Drama', u' Sci-Fi'] \n", + "0 [u' Drama'] \n", + "0 [u' Drama', u' Music'] \n", + "0 [u' Comedy', u' Drama', u' Romance'] \n", + "1 [u' Comedy', u' Drama'] \n", + "1 [u' Crime', u' Drama', u' Thriller'] \n", + "2 [u' Adventure', u' Romance', u' War'] \n", + "2 [u' Adventure', u' Western'] \n", + "2 [u' Adventure', u' Drama', u' History'] \n", + "1 [u' Drama', u' Film-Noir', u' Mystery'] \n", + "1 [u' Crime', u' Drama', u' Sci-Fi'] \n", + "1 [u' Crime', u' Drama'] \n", + "0 [u' Drama', u' Romance'] \n", + "1 [u' Crime', u' Drama', u' Film-Noir', u' Thril... 
\n", + "1 [u' Drama'] \n", + "1 [u' Mystery', u' Thriller'] \n", + "1 [u' Film-Noir', u' Mystery', u' Thriller'] \n", + "1 [u' Mystery', u' Thriller'] \n", + "0 [u' Biography', u' Drama', u' Musical'] \n", + "\n", + "[100 rows x 3 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "films = {'title': titles, 'synopsis': synopsis, 'cluster': kmeans.labels_, 'genre': genres}\n", + "# Index the rows by cluster label so that frame.loc[i] selects the films of cluster i.\n", + "frame = pd.DataFrame(films, index = [kmeans.labels_] , columns = ['title', 'cluster', 'genre'])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1 38\n", + "2 34\n", + "0 28\n", + "Name: cluster, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frame['cluster'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top terms per cluster:\n", + "Cluster 0 words: marry, love, father, family, home, marriage,\n", + "Cluster 0 titles: Gone with the Wind, Citizen Kane, Forrest Gump, The Sound of Music, Singin' in the Rain, Amadeus, A Streetcar Named Desire, The Philadelphia Story, An American in Paris, The Best Years of Our Lives, My Fair Lady, Doctor Zhivago, Braveheart, City Lights, The King's Speech, It Happened One Night, A Place in the Sun, Mr. Smith Goes to Washington, Annie Hall, Out of Africa, Good Will Hunting, Terms of Endearment, Giant, Network, Nashville, The Graduate, Wuthering Heights, Yankee Doodle Dandy,\n", + "Cluster 1 words: police, car, kill, apartment, say, murder,\n", + "Cluster 1 titles: The Godfather, The Shawshank Redemption, Raging Bull, The Godfather: Part II, Psycho, Sunset Blvd., Vertigo, On the Waterfront, West Side Story, E.T. 
the Extra-Terrestrial, The Silence of the Lambs, Chinatown, It's a Wonderful Life, Some Like It Hot, 12 Angry Men, Unforgiven, Rocky, To Kill a Mockingbird, The Apartment, Goodfellas, The Exorcist, The French Connection, Midnight Cowboy, Rain Man, Tootsie, Fargo, Shane, The Green Mile, American Graffiti, Pulp Fiction, The Maltese Falcon, A Clockwork Orange, Taxi Driver, Double Indemnity, Rebel Without a Cause, Rear Window, The Third Man, North by Northwest,\n", + "Cluster 2 words: kill, soldier, water, order, camp, officer,\n", + "Cluster 2 titles: Schindler's List, Casablanca, One Flew Over the Cuckoo's Nest, The Wizard of Oz, Titanic, Lawrence of Arabia, Star Wars, 2001: A Space Odyssey, The Bridge on the River Kwai, Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb, Apocalypse Now, Gandhi, The Lord of the Rings: The Return of the King, Gladiator, From Here to Eternity, Saving Private Ryan, Raiders of the Lost Ark, Ben-Hur, Patton, Jaws, The Good, the Bad and the Ugly, Butch Cassidy and the Sundance Kid, The Treasure of the Sierra Madre, Platoon, High Noon, Dances with Wolves, The Pianist, The Deer Hunter, All Quiet on the Western Front, The Grapes of Wrath, Close Encounters of the Third Kind, The African Queen, Stagecoach, Mutiny on the Bounty,\n" + ] + } + ], + "source": [ + "print(\"Top terms per cluster:\")\n", + "# Feature indices of every centroid, sorted by descending weight.\n", + "order_centroids = np.asarray(kmeans.cluster_centers_).argsort()[:, ::-1]\n", + "cluster_names = []\n", + "for cluster_id in range(order_centroids.shape[0]):\n", + "    top_terms = [str(terms[ind]) for ind in order_centroids[cluster_id, :6]]\n", + "    print(\"Cluster %d words:\" % cluster_id, end='')\n", + "    print(''.join(' %s,' % term for term in top_terms))\n", + "    # Space-terminated label; the plotting cell passes it as label= to scatter.\n", + "    cluster_names.append(''.join(term + \" \" for term in top_terms))\n", + "    cluster_titles = frame.loc[cluster_id]['title'].values.tolist()\n", + "    print(\"Cluster %d titles:\" % cluster_id, end='')\n", + "    print(''.join(' %s,' % title for title in cluster_titles))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + 
"metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.decomposition import PCA as sklearnPCA" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "pca = sklearnPCA(n_components=2) #2-dimensional PCA\n", + "transformed = pd.DataFrame(pca.fit_transform(tfidf_matrix.todense()))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD8CAYAAACVZ8iyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAG81JREFUeJzt3X+MZWV5B/Dvd5bFMAFW9gdIgbmX\ntksTyhrNTjY1aavNYoom7KrxB/Rql/pjQilpNlQrZkxRzE2spmFJqtFpjVmZaRBMLItiKG61P6xr\nGKJduhBkwTvLCpEB6oS66MrO0z/OneXeO+fcufee95zzvud8P8lk7tw5c++7Z2bPc973ed73pZlB\nRESqbazoBoiISPEUDERERMFAREQUDEREBAoGIiICBQMREYGCgYiIQMFARESgYCAiIgDOKLoBSTZv\n3mz1er3oZoiIBOWhhx56zsy2DPtz3gaDer2O+fn5opshIhIUkguj/JyGiURERMFAREQUDEREBAoG\nIiICBQMREYGCgYiIwFEwIHkVycdIHiV5c5/j3knSSE66eF8REXEjdTAguQ7A5wC8BcDlAK4leXnM\ncecA+EsAP0j7npLO3BxQrwNjY9HnubmiWyQiRXPRM9gB4KiZPWlmJwHcCWB3zHGfAvAZAL908J6V\nkMVFe24OmJoCFhYAs+jz1JQCgkjVuQgGFwF4quPr4+3nTiP5egCXmNk3HLxfJWR10Z6eBk6c6H7u\nxInoeRGpLhfBgDHP2elvkmMAbgPwV2u+EDlFcp7k/OLiooOmhSuri/axY8M9LyLV4CIYHAdwScfX\nFwN4uuPrcwBcAeC7JFsAfg/AgbgkspnNmNmkmU1u2TL0OkulktVFe2JiuOfjKOcgUj4ugsGDALaS\nvJTkmQCuAXBg5ZtmtmRmm82sbmZ1AIcA7DIzrULXh4uLdpxmExgf735ufDx6fhDKOYiUU+pgYGYv\nA7gRwP0AHgVwl5kdIXkryV1pX7+q0l60kzQawMwMUKsBZPR5ZiZ6fhDKOYiUE81s7aMKMDk5aVVf\nwnpuLrrIHjsW9QiazcEv2lkZG4t6BL1IYHk5//aISDeSD5nZ0HO5vN3PQKILf9EX/14TE9HQUNzz\nIhIuLUchQ8lq+EpEiqVgIENJm3MQET9pmEiG5uPwlYiko56BpDbqvAPNVxDxh3oGksrKvIOVctOV\neQdA/97DqD8nItlQaamkUq/HVxfVakCr5f7nRKS/UUtLNUwkqYy6bIbWSBLxi4KBJ0IdPx912Yys\nltsQkdEoGHgg5PV+Rp13oPkKIn5RMPBAyOv9jDrvQPMVRPyiBLIHtN6PiLiiBHLANH4uIkVTMPCA\nxs9FpGgKBh7Q+LmIFE3BwBONRjTZank56hFMT4d
XZioi4dJyFJ7RMg0iUgT1DDwTcpmpiIRLwcAz\nWqZBRIqgYOAZlZmKSBEUDDxT5TLTUNdnEikDBQPPVLXMNOT1mUTKQMHAQ51lpq1W+QMBMFriXD0J\nEXdUWipeGDZxrhJcEbfUMxAvDJs4VwmuiFsKBuKFYRPnKsEVcUvBQFJzMXY/bOJcJbgibikYSCou\nq4CGSZxXuQRXJAsKBpJKUWP3VS3BFcmKgoFHQiyVLHLsvooluCJZUTDwRKiTrjR2L1IOCgaeCLVU\nUmP3IuWgYOCJUEslNXYvUg6ageyJiYloaCjued81Grr4i4ROPQNPaLhFRIqkYOAJDbeISJE0TOQR\nDbeISFHUMxAREQUDERFRMJCUQpw1LSKrOQkGJK8i+RjJoyRvjvn+TSQfIXmY5EGSNRfvK8UKdda0\niKyWOhiQXAfgcwDeAuByANeSvLznsB8CmDSz1wL4GoDPpH3fqvPhjjzUWdMispqLnsEOAEfN7Ekz\nOwngTgC7Ow8ws++Y2cpl4xCAix28b2X5ckce6qxpEVnNRTC4CMBTHV8fbz+X5AMAvhX3DZJTJOdJ\nzi8uLjpoWjn5ckeuRepEysNFMGDMcxZ7IPleAJMAPhv3fTObMbNJM5vcsmWLg6aVky935Jo1LVIe\nLoLBcQCXdHx9MYCnew8ieSWAaQC7zOxXDt63sny5I9esaZHycBEMHgSwleSlJM8EcA2AA50HkHw9\ngC8iCgTPOnjPSkt7R+4y+awNZkTKIXUwMLOXAdwI4H4AjwK4y8yOkLyV5K72YZ8FcDaAu0n+iOSB\nhJeTAaS5I/cl+SwifqFZ7PB+4SYnJ21+fr7oZpROvR6/VHatFt3Zi0jYSD5kZpPD/pxmIFeML8ln\nEfGLgkHF+JJ8FhG/KBhUTB7loD7MjhaR4SgYVEzW5aBKUIuESQlkcUoJaj/NPTyH6YPTOLZ0DBMb\nJtDc2URjm+qAy2jUBLJ2OhOnlKD2z9zDc5i6dwonfh2tYbKwtICpe6cAQAFBTtMwkTilBLV/pg9O\nnw4EK078+gSmDw62mNXcw3Oo76tj7JNjqO+rY+5hjfmVkYKBOKX1ivxzbCm+W5b0fKeVXsXC0gIM\ndrpXoYBQPgoGBShztY3WK/LPxIb4blnS853S9iokHAoGOatCtc1a6xWVORj6qLmzifH13d218fXj\naO5cu7uWplchYVEwyJkvexEUpQrB0DeNbQ3MXD2D2oYaCKK2oYaZq2cGSh6n6VVIWBQMclb1apu8\ng2HVkp9Jva7GtgZae1tYvmUZrb2tgauI0vQqJCwKBjnzqdqmiOGaPINhmZOfcUEui15Xml6FhEWT\nznK28h+28+6YBK6/Hvj857N7z+np6II7MfFKZU9vO8bHs0/2Jk1K2/SmOZy92+2kqPq+OhaWVr/Z\npjNqeG66leq1i9Q7bwCI7tbPemAGz3939TnThL9qGXXSmYJBAW64AfjCF6K7txVZXYjjgs/4OHDW\nWcDzz68+PusLR1x71m+fA3dN4aR1X9zS3oGOfXIMFrcDqxGzW5eDrXBKCnL4eQ3Y11r1NBkl86Ua\ntIR1QO67rzsQANmNmyeN0ccFAiD73EVc6em5b5/uCgSAm/LFxCTn0kTQCfvESp4N8c9rwp8MQsGg\nAHmOmw/7mnlcOHpLT194OZvyxebOJnCyZwbcyXHgYDPohH1SkNu0fkIT/mRkCgYFyDOJnPSamzb5\nM1M4q/LFxrYGNv3XTDR8Yow+3zsDPNwI+m45qcLn9l3N070ubJvDug/XceKvxzC92F1FVbUKKxmM\nFqorQLMZP46fxYU46b1uvz163JtYLmIcvbmzGZsQHbZ8MW5lzts/2MDUVCOXc52XlTxK7Cqk2wC8\nNnlhOgBatE5iKYFckLgKn6wuxHm+16jSLrGcVGEzc/UMcLjh/b/fpaQEc21DDQASv9fa28q6aZID\nVRNJpfW7AFb
lIrcS9BeuGwO4+v81QQCIrbAiiOVbVHJUBqomkkqr+ho6nRPOsJScg9HyEpJEwUBK\noeoXua4S4oOrq6hWcjBaXkKSKBhIKVT9ItdVKvtwI6qaaldRdS4hEeryEqqAyp5yBhUWQmJ5GIMm\nocu4H3CZ957uVxwQ+u8tC0ogy1CSlqko+0Y0Zb2wlPn3qeKA4SiBLEOp6r4KZd25q8w7zFW9OCAv\nCgYVFcq+Cq7Hist8YVlrh7lQVb04IC8KBp7Las+BLJbEcN3WLPYj0IUlPFUvDsiLgoHHstwistl0\nuzZRXFvf/35g8+bRg0MWQzq6sIQn1Aqo0CiB7LGsK0RcVhMltbXTsAnNpP0I0s6WLWM1kcgKVROV\n0NjY6n0PAD83K0lqa69hApmqSESGp2qiEvJpv+S1DNqmhYXB8woa0hHJj4KBx1yP62cprq1xyMFz\nIBorjmj2reTCzLz82L59uwVldtasVjMjo8+zsz6/bCY627ppk9n69WbRZT/6ILu/Xvmo1Ypuub9m\nD8/aeHPc8Amc/hhvjtvsYY//EHrMHp612m014ydotdtqQbU9Ly7PEYB5G+Gaq5zBqDqzrxs3Ai++\nCJw8+cr3yzL9M4XeBHVSgtnHHIgvQs+blHXGt0uuz5ESyHmKm/sfpwwLwzhU5vVzspJVRVVeQg9m\neXB9jpRAzlPcWg5xfJvO69AoE8xCyoEAfozVhz5Jrswzvl3x5RwpGIyiz0V+Dteijp9gDKdQHzvm\nbMawT0adDBfS+jlZzH4eRegVVUlBy2BKhrf5EvCdBAOSV5F8jORRkjfHfP9VJL/a/v4PSNZdvG9h\nEuoo53AtpvAPWEAdhjEsnLrY2Yxhn6RZ5C6U9XN8WdAu9IqquGC2oqgA6xtfAn7qnAHJdQB+DODN\nAI4DeBDAtWb2SMcxNwB4rZldT/IaAG83s/f0e93gcgbr16N+6gksLF+y6vCyjYmHNBluVKGP1ftk\nZcZ33Lg4oPwB4HZWfJE5gx0AjprZk2Z2EsCdAHb3HLMbwP72468B2EmSDt67GHHjHV/+Mo7Z6kAA\nlC91ENJkuFH50nUvg8a2Blp7WyDi/8srf/DKOVq+ZRmtva1Cen4ugsFFAJ7q+Pp4+7nYY8zsZQBL\nADY5eO/+slryE4gd73Bxkcyyya74lAjOKsnrS9fdFy7OswKs31wEg7hw39u/HuQYkJwiOU9yfnFx\nMV2rslzyM0Hai2QBTR6JL4ngLJO8Po3VF13V5Oo8K8B6bpSZap0fAN4A4P6Orz8G4GM9x9wP4A3t\nx2cAeA7tfEXSR+oZyLVaIdNd08wYTttk32cru56JWrut1jUzd+WjdlvNTYM94MMMZJfnObTZyKG1\n16zAGcgkz0CUQN4J4KeIEsh/YmZHOo75CwDb7JUE8jvM7N39Xjd1AjnALGeaJo+0B67LNazXkMVM\n1CokeX2YtFWF8xwn1NnThSWQLcoB3Ijo7v9RAHeZ2RGSt5Lc1T7sSwA2kTwK4CYAq8pPnQswy5mm\nyUOXe+Y8JpVFqWYVxqB9mJCUdD43nrWx8El5WfKlvDgvTuYZmNl9ZnaZmf2WmTXbz/2NmR1oP/6l\nmb3LzH7bzHaY2ZMu3rcvn7KcA0rT5KH3NE4zWWAEWVzUhh2DLnrsfRQ+BLy487x+bD1ePPli4ZPy\ngOx+rz4E4jyVdwayL1nOIaRp8tC9iqGjx2rDVD5lcVEbJsnry4ziYfmQdI07z+e+6lycPHWy67gi\n7pqz/L36EIjzpIXqQtUz3j/31llM7f/9wXMGKVeNGzZHUfT4qw9j78PonIS08ayNAIAXXnrBm206\nfckjrPV7TTOZq+i/2VFpoboqiRnvb+z/Y8zs+c/BexUph9GGHWUqulQzpC5/793u8y89j5defgl3\nvOOOwiYk9fLlrrnf7zVtr6Hov9m8qWcQIldrQaeoJgqtWCvrnoHL5QRC6MX4c
tfc71wB8P48ZkE9\ngypxMN4PINWqcaEVa2U59u563DqEXowvd839fq8hnEefKBiEyIMrcWjFWllevFyXIPoyBLMWH9bT\n6fd7DeU8+kLBIESDXIkzXuQowGKtzC5eru9AfaggCknS71XncTgKBiFa60qc04SyUPYmyJrrO9A0\nvZgQ51JkxZehrFAogVxG2mw4V74kU31phxRLCWR5hasEswzElzvQqi2fIG4pGIRk0DyABwnmqvEh\nmarqGUlDwSAUw+QBQiv1ESdUPSNpKBiEYpgpvz6U+oSwZVvJqHpG0lACORRJW0b7OOV3pM0VxAWX\nM6ElTKMmkBUMQjA3B7zvffHrP/hYIaRqJpHCqJqozKankxcC8jEPoGomkeAoGIQg6SJq5uewi6qZ\nRIKjYBCCpItorZZvOwalaiaR4CgYhGDIi2vhhTxpq5kK/weIVJCZefmxfft2kw6zs2a1mhkZfZ6d\nTTxsfNwsGkOKPsbHEw/3T/D/AJFiAZi3Ea65qibyQYpNZnoFX8gT/D9ApFiqJgqV4xVGXRXyFDZS\no0okkUIoGBRt2M2E1zBMIU/SBT+nFbAHb2i/50XECQWDoiXd8S4sjHRLPmiuud8F33F8Go4qkUQK\noWBQtH53vCPckg9ayNPvgn9sIT6PlPS8Uz6sqyRSQQoGRYu7E+40wi35IDuQ9Ruan1j309jv9T6f\nWV5BW6gNRbubiQsKBkXrvBNOMuKQUT/9huabpz6Kcfyi6/lx/ALNUx89/XWheQU5bWV3s4WlBRgM\nC0sLmLp3SgFBhqZg4IOVO+G1AoLDq22/oflG7XuYwYdQQwvEMmpoYQYfQqP2vdPHFppXkNOK3t1M\nvZLyUDDwSQZDRkn6Ds03m2iM34MWLsUy1qGFS9EYv6criVvVClDfLn5F7m6mXkm5KBj4ZNAhI0eD\n9IlD8wMkcatYAerjxa/I3c2K7pWIWwoGvhlkyCiPQfo1krhVrAD18eJX5O5m2nO5XBQMfLXWkBFQ\n6CB9FStAfbz4NbY1MHP1DGobaiCI2oYaZq6eyWV3szx6Jb4Ny5WZ1ibyWeeaRUm/Jx+3vSyp+r46\nFpZWr5tU21BDa28r/wYVbGXYrLO3NL5+3Fkwyvr1y0prE5VR51BN0rBRmQfpPaMN57tl3SvxcViu\nzM4ougEyoGYzfpP5Mg/Se2blIqcN51/R2NbI7N/v47BcmalnEIpRB+nzWH60QpvRNLY10NrbwvIt\ny2jtbVU6EGStyEqpKlIwCMmwyzTkMU1YU5ElIxqWy5cSyGWWx0Yx2oxGMjT38JyG5YY0agJZwaDM\nyOTvufq9j43Fv9YaVU4ON3cTkQ6FVBOR3EjyAZKPtz+fF3PM60h+n+QRkodJvifNe8oQ1q0b7vlR\njDAV2eXIkurQRdxImzO4GcBBM9sK4GD7614nAPypmf0ugKsA7CP56pTvK4M4dSr5eVcJ3xGmIrta\n5M7H5SFEQpU2GOwGsL/9eD+At/UeYGY/NrPH24+fBvAsgC0p31cGkTQ3gXSX8B2hysnVIneqQxdx\nJ20wuMDMngGA9ufz+x1McgeAMwE8kfJ9ZRBxd+3k6jH+tMtaDFnl5GqRO9Whi7izZjAg+W2S/xPz\nsXuYNyJ5IYA7APyZmcVmFklOkZwnOb+4uDjMy0ucuLv2pMRxjmtPu1rkTnXoIu6sGQzM7EozuyLm\n4x4AP2tf5Fcu9s/GvQbJcwF8E8DHzexQn/eaMbNJM5vcskUjSanFlex4sKyFq0XuVIcu4k7aYaID\nAPa0H+8BcE/vASTPBPB1AF8xs7tTvp8MKqlk561v9WLtaRfbHBe5YqdI2aSaZ0ByE4C7AEwAOAbg\nXWb2AslJANeb2QdJvhfAlwEc6fjR68zsR/1eW/MMUuo3GazZVJG/SElp0pl0G3EymJc0Q01kYFrC\nWrqVZV9KrX0kkgsFg7Iqy76UrmaoiUhfC
gZlVZZ9KV3NUBORvrS5TZk1GuFd/HtNTMQnwkMb7hLx\nnHoG4jePhru0KJ6UmYKB+M2T4S4tiidlp9JSkQHU99WxsLR6uKq2oYbW3lb+DRJJoNJSkQxpUTwp\nOwUDkQFoUTwpOwUDkQFoUTwpOwUDkQFoUTwpOyWQQ6Y1e0Skx6gJZE06C9XKmj0rSzWsrNkDKCCI\nyNA0TBSqrNfsmZuLlsEeG4s+a2E4kVJTzyBUWa7Zo16HSOWoZxCqLJeo1kqhIpWjYBCqLNfs0Uqh\n2dHwm3hKwSBUWa7ZU5aNcXyjjXrEYyotldV6cwZA1OsIcT8En/Tbl7rVyrs1UlJam0jijTIs4clK\noaWj4TfxmIJBmaUZlmg0orvV5eXo87CBQGPjq2n4TTymYFBmRVUFaWw8nkcb9Yj0UjAos6KGJVSa\nGk/Db+IxTTors6L2D9bYeLIy7EstpaSeQVo+j40XNSyhsXGR4CgYpOH72HhRwxIaGxcJjuYZpKG6\n8WRaXlukEKPOM1AwSGNsLOoR9CKjkkwRkZxp0lkRNDYuIiWhYJCGxsZFpCQUDNJQ3biIlITmGaSl\nunERKQH1DERERMFAREQUDEREBAoGIiICBQMREYGCgYiIQMFARESgYCAiIkgZDEhuJPkAycfbn8/r\nc+y5JH9K8u/TvKek4PPeCyJSqLQ9g5sBHDSzrQAOtr9O8ikA/5by/WRUvu+9ICKFShsMdgPY3368\nH8Db4g4iuR3ABQD+JeX7yai0L7GI9JE2GFxgZs8AQPvz+b0HkBwD8HcAPrLWi5GcIjlPcn5xcTFl\n06SL9iUWkT7WXKiO5LcBvCbmW4PeUt4A4D4ze4pk3wPNbAbADBBtbjPg68sgJibid2XT3gsiggGC\ngZldmfQ9kj8jeaGZPUPyQgDPxhz2BgB/QPIGAGcDOJPk/5lZv/yCuNZsRjmCzqEi7b0gIm1ph4kO\nANjTfrwHwD29B5hZw8wmzKwO4MMAvqJAUADtvSAifaTdz+DTAO4i+QEAxwC8CwBITgK43sw+mPL1\nxSXtvSAiCWhxG7p7YHJy0ubn54tuhohIUEg+ZGaTw/6cZiCLiIiCgYiIKBiIiAgUDEREBAoGIiIC\nBQMREYGCgYiIwON5BiQXAcQsplMJmwE8V3QjPKLz0U3no5vOR7ffMbNzhv2htDOQM2NmW4puQ1FI\nzo8yaaSsdD666Xx00/noRnKk2boaJhIREQUDERFRMPDVTNEN8IzORzedj246H91GOh/eJpBFRCQ/\n6hmIiIiCgQ9IbiT5AMnH25/PiznmdSS/T/IIycMk31NEW7NC8iqSj5E8SnLV5kckX0Xyq+3v/4Bk\nPf9W5meA83ETyUfafwsHSdaKaGde1jofHce9k6S191QprUHOB8l3t/9GjpD8pzVf1Mz0UfAHgM8A\nuLn9+GYAfxtzzGUAtrYf/waAZwC8uui2O/r3rwPwBIDfBHAmgP8GcHnPMTcA+EL78TUAvlp0uws+\nH38EYLz9+M+rfj7ax50D4N8BHAIwWXS7C/772ArghwDOa399/lqvq56BH3YD2N9+vB/A23oPMLMf\nm9nj7cdPI9pvuixzMXYAOGpmT5rZSQB3IjonnTrP0dcA7CTJHNuYpzXPh5l9x8xWNrQ+BODinNuY\np0H+PgDgU4hurH6ZZ+MKMMj5+BCAz5nZ/wKAmcXtT99FwcAPF5jZMwDQ/nx+v4NJ7kB0R/BEDm3L\nw0UAnur4+nj7udhjzOxlAEsANuXSuvwNcj46fQDAtzJtUbHWPB8kXw/gEjP7Rp4NK8ggfx+XAbiM\n5PdIHiJ51Vov6u0M5LIh+W0Ar4n51vSQr3MhgDsA7DGzZRdt80DcHX5vmdsgx5TFwP9Wku8FMAng\njZm2qFh9zwfJMQC3AbgurwYVbJC/jzMQDRW9CVGv8T9IXmFmP096UQWDnJjZlUnfI/kzkhea2TPt\ni31sl
47kuQC+CeDjZnYoo6YW4TiASzq+vhjA0wnHHCd5BoANAF7Ip3m5G+R8gOSViG4m3mhmv8qp\nbUVY63ycA+AKAN9tjxy+BsABkrvMrIwbqQ/6/+WQmf0awE9IPoYoODyY9KIaJvLDAQB72o/3ALin\n9wCSZwL4OoCvmNndObYtDw8C2Ery0va/8xpE56RT5zl6J4B/tXZmrITWPB/tYZEvAtg1yHhw4Pqe\nDzNbMrPNZlY3szqiHEpZAwEw2P+Xf0ZUZACSmxENGz3Z70UVDPzwaQBvJvk4gDe3vwbJSZL/2D7m\n3QD+EMB1JH/U/nhdMc11q50DuBHA/QAeBXCXmR0heSvJXe3DvgRgE8mjAG5CVHVVSgOej88COBvA\n3e2/hd6LQWkMeD4qY8DzcT+A50k+AuA7AD5iZs/3e13NQBYREfUMREREwUBERKBgICIiUDAQEREo\nGIiICBQMREQECgYiIgIFAxERAfD/fu31SOJHH2oAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "colors = ['red', 'blue', 'green', 'yellow', 'black', 'gray', 'orange', 'brown']\n", + "for i in range(len(cluster_names)):\n", + " plt.scatter(transformed[kmeans.labels_ == i][0], transformed[kmeans.labels_ == i][1], label=cluster_names[i], c=colors[i])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}