-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathyelp_recommendation_engine.py
More file actions
194 lines (157 loc) · 7.3 KB
/
yelp_recommendation_engine.py
File metadata and controls
194 lines (157 loc) · 7.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""
Simple item-item collaborative filtering restaurant recommendation engine built using the yelp dataset.
ver 1.0 - Works for states AZ, NV, NC.
For more information on the dataset used please visit http://yelp.com/dataset_challenge
@author Marco Mera
"""
import pandas as pd
import numpy as np
import sys
import random
from math import sqrt
def square_rooted(x):
'''
Helper function for cosine similarity
'''
return sqrt(sum([a*a for a in x]))
def cosine_similarity(x,y):
'''
The distance metric used to determine similarity between item vectors
'''
numerator = sum(a*b for a,b in zip(x,y))
denominator = square_rooted(x) * square_rooted(y)
return 1 - numerator/float(denominator)
def process_business_data(data):
'''
The yelp dataset consists of millions of businesses not limited to
restaurants. With this function I process the data by eliminating
the businesses that aren't restaurants as well as restaurants with a
rating less than 3.5 and a total review count less than 50.
'''
business = pd.read_csv(data,usecols=(15,20,23,36,38,61))
size = business.shape[0]
drop_index = []
#If a row in the DF is not a restaurant its index is added to a list to use later for dropping the rows
for i in range(size):
if "Restaurants" not in business["categories"][i]:
drop_index.append(i)
business = business.drop(business.index[[drop_index]])#Use the indexes to drop rows
business = business[business['review_count'] >= 50]
business = business[business['stars'] >= 3.5]
return business
def get_state_df(businessData,state):
'''
This function narrows down the DF to a particular state.
'''
states = [s for s in set(business.state.values) if business[business['state']==s].shape[0]>200] # A list of states who have over 200 restaurants
if state in states:
return businessData[businessData['state']== state]
else:
print "Sorry that state is not supported yet! Will use the default: NC "
return businessData[businessData['state']=='NC'] #default
def get_table(stateDF):
'''
Function to cross reference the reviews DF with the state DF in order to produce a pivot
table that contains the (ratings X users), with the NA values set to zero.
'''
reviews = stars[stars['business_id'].isin(stateDF['business_id'].values)]
table = reviews.pivot_table(index='business_id', columns='user_id', values='stars').fillna(0)
return table
def get_matrix(table):
'''
Function to convert the Pandas table to a matrix (Numpy array). It also serves to remove the users that
have reviewed less than '5%' of the businesses.
'''
matrix = table.values.T
size = matrix.shape[0]
n = matrix.shape[1]
index = []
for i in range(size):
if np.count_nonzero(matrix[i]) < int(n*.05): #If that user has reviewed less than 5% of n restaurants then we're going to drop that user
index.append(i)
matrix = np.delete(matrix,index,axis=0)
return matrix
def get_restaurant_names(table,state_df):
'''
This function grabs the restaurant names from a table and puts them in a list.
'''
business_ids = table.T.columns
restaurant_names = []
for i in range(len(business_ids)):
#This row cross references the business_ids in the pivot table generated with the state DF to get the actual name of the business.
restaurant_names.append((state_df.loc[state_df['business_id'] == business_ids[i]]).name.values[0])
return restaurant_names
def list_restaurant_options(matrix,restaurant_names):
'''
Function that searches the matrix for the 5 restaurants with the most reviews and prints those.
It also prints 5 random restaurants.
'''
restaurant_ratings = matrix.T
size = restaurant_ratings.shape[0]
maxList =[np.count_nonzero(row) for row in restaurant_ratings] #List of counts of reviews for each row(restaurant)
indexes =[]
rands = random.sample(range(0,size-1),5) #A list of five random ints to be used for printing random restaurants
#This loop appends the index of the row with the most reviews to a list of indexes
for i in range(5):
temp = maxList.index(max(maxList))
indexes.append(temp)
maxList[temp]=0 # We reset the to review count (to zero) so it won't be counted in the next iteration
print "Here are the top 5 most popular restaurants in the area:"
print "-------------------------------------------------------"
for restaurant in indexes:
print restaurant_names[restaurant]
print"**"
print "\nFor more options, here is a list of five random restaurants:"
print "-----------------------------------------------------------"
for restaurant in rands:
print restaurant_names[restaurant]
print"**"
def print_top_k_restaurants(matrix,restaurant_names,query_restaurant,k):
"""
"""
restaurant_ratings = matrix.T
if query_restaurant == "random":
print "\nChose random I see...I too like to live dangerously!"
rest_index = random.randint(0,restaurant_ratings.shape[0]) #We get an index for a random restaurant in the matrix
else:
try:
rest_index = restaurant_names.index(query_restaurant) #We attempt to find the restaurant the user is querying
except ValueError: #Otherwise we just use a random index
print "\nSorry that restaurant was not found! Will use a random restaurant instead :)"
rest_index = random.randint(0,restaurant_ratings.shape[0])
query_restaurant_rating = restaurant_ratings[rest_index] #We get the vector of user ratings for that restaurant
distances = [cosine_similarity(query_restaurant_rating,restaurant) for restaurant in restaurant_ratings] # We compute the distances between that restaurant and all the other restaurants in the matrix
indexes = []
#Compiles a list of the k restaurants(indexes) with the smallest distance to the query restaurant
for i in range(k+1):
temp = distances.index(min(distances))
indexes.append(temp)
distances[temp]=1 # Reset the distance (to one) so that it won't be counted in the next iteration
print "\nGreat! Based on the selected restaurant: %s" % restaurant_names[rest_index]
print "\nHere are your top %d recommended restaurants:" % k
print "---------------------------------------------"
for restaurant in indexes[1:]: #We don't want to count the smallest distance since it will be the query restaurant itself
print restaurant_names[restaurant]
print"**"
#Read files from command line
reviews = sys.argv[1]
businesses = sys.argv[2]
print "\n**************************************"
print "* Welcome to Restaurant Recommender! *"
print "**************************************"
state = raw_input("\nWhat state will you be traveling to?\nThese are your options: ['NC', 'WI', 'QC', 'PA', 'AZ', 'NV'] ")
print "\nOne moment please while we fire up some recommendations for you!\n"
stars = pd.read_csv(reviews, usecols=(0,4,6))
business = process_business_data(businesses)
state_df = get_state_df(business, state)
table = get_table(state_df)
matrix = get_matrix(table)
restaurant_names = get_restaurant_names(table, state_df)
run = 'y' #flag to keep running program, anything else terminates program
while run == 'y':
list_restaurant_options(matrix, restaurant_names)
selected_restaurant = raw_input("\nWhat restaurant are you curious about? (You can also type "'"random"'"): ")
k = raw_input("\nHow many restaurants do you want recommended to you? ")
print_top_k_restaurants(matrix,restaurant_names,selected_restaurant,int(k))
run = raw_input("\nWould you like to search for a different restaurant? [y/n] ")
print