Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions README.CHS.md

Large diffs are not rendered by default.

125 changes: 69 additions & 56 deletions README.md

Large diffs are not rendered by default.

282 changes: 164 additions & 118 deletions TFRecModel/src/com/sparrowrecsys/offline/tensorflow/DIEN.py

Large diffs are not rendered by default.

58 changes: 38 additions & 20 deletions TFRecModel/src/com/sparrowrecsys/offline/tensorflow/DIN.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
import tensorflow as tf

import pathlib
current_working_directory = pathlib.Path().absolute()
train_abs_path = current_working_directory / \
"src/main/resources/webroot/sampledata/trainingSamples.csv"
test_abs_path = current_working_directory / \
"src/main/resources/webroot/sampledata/testSamples.csv"
print(train_abs_path)
print(test_abs_path)

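# Training samples path, resolved against the current working directory; run from the directory that contains src/main/resources, or change to your local path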
training_samples_file_path = tf.keras.utils.get_file("trainingSamples.csv",
"file:///Users/zhewang/Workspace/SparrowRecSys/src/main"
"/resources/webroot/sampledata/trainingSamples.csv")
"file://" + str(train_abs_path))
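# Test samples path, resolved against the current working directory; run from the directory that contains src/main/resources, or change to your local path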
test_samples_file_path = tf.keras.utils.get_file("testSamples.csv",
"file:///Users/zhewang/Workspace/SparrowRecSys/src/main"
"/resources/webroot/sampledata/testSamples.csv")

"file://" + str(test_abs_path))

# load sample as tf dataset


def get_dataset(file_path):
dataset = tf.data.experimental.make_csv_dataset(
file_path,
Expand Down Expand Up @@ -63,7 +71,8 @@ def get_dataset(file_path):
#movie_emb_col = tf.feature_column.embedding_column(movie_col, EMBEDDING_SIZE)

# user id embedding feature
user_col = tf.feature_column.categorical_column_with_identity(key='userId', num_buckets=30001)
user_col = tf.feature_column.categorical_column_with_identity(
key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, EMBEDDING_SIZE)

# genre features vocabulary
Expand All @@ -73,11 +82,13 @@ def get_dataset(file_path):
# user genre embedding feature
user_genre_col = tf.feature_column.categorical_column_with_vocabulary_list(key="userGenre1",
vocabulary_list=genre_vocab)
user_genre_emb_col = tf.feature_column.embedding_column(user_genre_col, EMBEDDING_SIZE)
user_genre_emb_col = tf.feature_column.embedding_column(
user_genre_col, EMBEDDING_SIZE)
# item genre embedding feature
item_genre_col = tf.feature_column.categorical_column_with_vocabulary_list(key="movieGenre1",
vocabulary_list=genre_vocab)
item_genre_emb_col = tf.feature_column.embedding_column(item_genre_col, EMBEDDING_SIZE)
item_genre_emb_col = tf.feature_column.embedding_column(
item_genre_col, EMBEDDING_SIZE)


'''
Expand All @@ -92,7 +103,8 @@ def get_dataset(file_path):
'''


candidate_movie_col = [ tf.feature_column.numeric_column(key='movieId', default_value=0), ]
candidate_movie_col = [tf.feature_column.numeric_column(
key='movieId', default_value=0), ]

recent_rate_col = [
tf.feature_column.numeric_column(key='userRatedMovie1', default_value=0),
Expand All @@ -103,7 +115,6 @@ def get_dataset(file_path):
]



# user profile
user_profile = [
user_emb_col,
Expand All @@ -125,18 +136,21 @@ def get_dataset(file_path):
candidate_layer = tf.keras.layers.DenseFeatures(candidate_movie_col)(inputs)
user_behaviors_layer = tf.keras.layers.DenseFeatures(recent_rate_col)(inputs)
user_profile_layer = tf.keras.layers.DenseFeatures(user_profile)(inputs)
context_features_layer = tf.keras.layers.DenseFeatures(context_features)(inputs)
context_features_layer = tf.keras.layers.DenseFeatures(
context_features)(inputs)

# Activation Unit

movie_emb_layer = tf.keras.layers.Embedding(input_dim=1001,output_dim=EMBEDDING_SIZE,mask_zero=True)# mask zero
movie_emb_layer = tf.keras.layers.Embedding(
input_dim=1001, output_dim=EMBEDDING_SIZE, mask_zero=True) # mask zero

user_behaviors_emb_layer = movie_emb_layer(user_behaviors_layer)
user_behaviors_emb_layer = movie_emb_layer(user_behaviors_layer)

candidate_emb_layer = movie_emb_layer(candidate_layer)
candidate_emb_layer = tf.squeeze(candidate_emb_layer,axis=1)
candidate_emb_layer = movie_emb_layer(candidate_layer)
candidate_emb_layer = tf.squeeze(candidate_emb_layer, axis=1)

repeated_candidate_emb_layer = tf.keras.layers.RepeatVector(RECENT_MOVIES)(candidate_emb_layer)
repeated_candidate_emb_layer = tf.keras.layers.RepeatVector(
RECENT_MOVIES)(candidate_emb_layer)

activation_sub_layer = tf.keras.layers.Subtract()([user_behaviors_emb_layer,
repeated_candidate_emb_layer]) # element-wise sub
Expand All @@ -148,14 +162,17 @@ def get_dataset(file_path):

activation_unit = tf.keras.layers.Dense(32)(activation_all)
activation_unit = tf.keras.layers.PReLU()(activation_unit)
activation_unit = tf.keras.layers.Dense(1, activation='sigmoid')(activation_unit)
activation_unit = tf.keras.layers.Dense(
1, activation='sigmoid')(activation_unit)
activation_unit = tf.keras.layers.Flatten()(activation_unit)
activation_unit = tf.keras.layers.RepeatVector(EMBEDDING_SIZE)(activation_unit)
activation_unit = tf.keras.layers.Permute((2, 1))(activation_unit)
activation_unit = tf.keras.layers.Multiply()([user_behaviors_emb_layer, activation_unit])
activation_unit = tf.keras.layers.Multiply()(
[user_behaviors_emb_layer, activation_unit])

# sum pooling
user_behaviors_pooled_layers = tf.keras.layers.Lambda(lambda x: tf.keras.backend.sum(x, axis=1))(activation_unit)
user_behaviors_pooled_layers = tf.keras.layers.Lambda(
lambda x: tf.keras.backend.sum(x, axis=1))(activation_unit)

# fc layer
concat_layer = tf.keras.layers.concatenate([user_profile_layer, user_behaviors_pooled_layers,
Expand All @@ -177,7 +194,8 @@ def get_dataset(file_path):
model.fit(train_dataset, epochs=5)

# evaluate the model
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_dataset)
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(
test_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,
test_roc_auc, test_pr_auc))

Expand Down
65 changes: 44 additions & 21 deletions TFRecModel/src/com/sparrowrecsys/offline/tensorflow/DeepFM.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
import tensorflow as tf

import pathlib
current_working_directory = pathlib.Path().absolute()
train_abs_path = current_working_directory / \
"src/main/resources/webroot/sampledata/trainingSamples.csv"
test_abs_path = current_working_directory / \
"src/main/resources/webroot/sampledata/testSamples.csv"
print(train_abs_path)
print(test_abs_path)

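# Training samples path, resolved against the current working directory; run from the directory that contains src/main/resources, or change to your local path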
training_samples_file_path = tf.keras.utils.get_file("trainingSamples.csv",
"file:///Users/zhewang/Workspace/SparrowRecSys/src/main"
"/resources/webroot/sampledata/trainingSamples.csv")
"file://" + str(train_abs_path))
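# Test samples path, resolved against the current working directory; run from the directory that contains src/main/resources, or change to your local path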
test_samples_file_path = tf.keras.utils.get_file("testSamples.csv",
"file:///Users/zhewang/Workspace/SparrowRecSys/src/main"
"/resources/webroot/sampledata/testSamples.csv")

"file://" + str(test_abs_path))

# load sample as tf dataset


def get_dataset(file_path):
dataset = tf.data.experimental.make_csv_dataset(
file_path,
Expand Down Expand Up @@ -51,14 +59,18 @@ def get_dataset(file_path):
}

# movie id embedding feature
movie_col = tf.feature_column.categorical_column_with_identity(key='movieId', num_buckets=1001)
movie_col = tf.feature_column.categorical_column_with_identity(
key='movieId', num_buckets=1001)
movie_emb_col = tf.feature_column.embedding_column(movie_col, 10)
movie_ind_col = tf.feature_column.indicator_column(movie_col) # movie id indicator columns
movie_ind_col = tf.feature_column.indicator_column(
movie_col) # movie id indicator columns

# user id embedding feature
user_col = tf.feature_column.categorical_column_with_identity(key='userId', num_buckets=30001)
user_col = tf.feature_column.categorical_column_with_identity(
key='userId', num_buckets=30001)
user_emb_col = tf.feature_column.embedding_column(user_col, 10)
user_ind_col = tf.feature_column.indicator_column(user_col) # user id indicator columns
user_ind_col = tf.feature_column.indicator_column(
user_col) # user id indicator columns

# genre features vocabulary
genre_vocab = ['Film-Noir', 'Action', 'Adventure', 'Horror', 'Romance', 'War', 'Comedy', 'Western', 'Documentary',
Expand All @@ -68,15 +80,18 @@ def get_dataset(file_path):
user_genre_col = tf.feature_column.categorical_column_with_vocabulary_list(key="userGenre1",
vocabulary_list=genre_vocab)
user_genre_emb_col = tf.feature_column.embedding_column(user_genre_col, 10)
user_genre_ind_col = tf.feature_column.indicator_column(user_genre_col) # user genre indicator columns
user_genre_ind_col = tf.feature_column.indicator_column(
user_genre_col) # user genre indicator columns
# item genre embedding feature
item_genre_col = tf.feature_column.categorical_column_with_vocabulary_list(key="movieGenre1",
vocabulary_list=genre_vocab)
item_genre_emb_col = tf.feature_column.embedding_column(item_genre_col, 10)
item_genre_ind_col = tf.feature_column.indicator_column(item_genre_col) # item genre indicator columns
item_genre_ind_col = tf.feature_column.indicator_column(
item_genre_col) # item genre indicator columns

# fm first-order term columns: without embedding and concatenate to the output layer directly
fm_first_order_columns = [movie_ind_col, user_ind_col, user_genre_ind_col, item_genre_ind_col]
fm_first_order_columns = [movie_ind_col, user_ind_col,
user_genre_ind_col, item_genre_ind_col]

deep_feature_columns = [tf.feature_column.numeric_column('releaseYear'),
tf.feature_column.numeric_column('movieRatingCount'),
Expand All @@ -90,17 +105,24 @@ def get_dataset(file_path):

item_emb_layer = tf.keras.layers.DenseFeatures([movie_emb_col])(inputs)
user_emb_layer = tf.keras.layers.DenseFeatures([user_emb_col])(inputs)
item_genre_emb_layer = tf.keras.layers.DenseFeatures([item_genre_emb_col])(inputs)
user_genre_emb_layer = tf.keras.layers.DenseFeatures([user_genre_emb_col])(inputs)
item_genre_emb_layer = tf.keras.layers.DenseFeatures(
[item_genre_emb_col])(inputs)
user_genre_emb_layer = tf.keras.layers.DenseFeatures(
[user_genre_emb_col])(inputs)

# The first-order term in the FM layer
fm_first_order_layer = tf.keras.layers.DenseFeatures(fm_first_order_columns)(inputs)
fm_first_order_layer = tf.keras.layers.DenseFeatures(
fm_first_order_columns)(inputs)

# FM part, cross different categorical feature embeddings
product_layer_item_user = tf.keras.layers.Dot(axes=1)([item_emb_layer, user_emb_layer])
product_layer_item_genre_user_genre = tf.keras.layers.Dot(axes=1)([item_genre_emb_layer, user_genre_emb_layer])
product_layer_item_genre_user = tf.keras.layers.Dot(axes=1)([item_genre_emb_layer, user_emb_layer])
product_layer_user_genre_item = tf.keras.layers.Dot(axes=1)([item_emb_layer, user_genre_emb_layer])
product_layer_item_user = tf.keras.layers.Dot(
axes=1)([item_emb_layer, user_emb_layer])
product_layer_item_genre_user_genre = tf.keras.layers.Dot(
axes=1)([item_genre_emb_layer, user_genre_emb_layer])
product_layer_item_genre_user = tf.keras.layers.Dot(
axes=1)([item_genre_emb_layer, user_emb_layer])
product_layer_user_genre_item = tf.keras.layers.Dot(
axes=1)([item_emb_layer, user_genre_emb_layer])

# deep part, MLP to generalize all input features
deep = tf.keras.layers.DenseFeatures(deep_feature_columns)(inputs)
Expand All @@ -123,7 +145,8 @@ def get_dataset(file_path):
model.fit(train_dataset, epochs=5)

# evaluate the model
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(test_dataset)
test_loss, test_accuracy, test_roc_auc, test_pr_auc = model.evaluate(
test_dataset)
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(test_loss, test_accuracy,
test_roc_auc, test_pr_auc))

Expand All @@ -132,4 +155,4 @@ def get_dataset(file_path):
for prediction, goodRating in zip(predictions[:12], list(test_dataset)[0][1][:12]):
print("Predicted good rating: {:.2%}".format(prediction[0]),
" | Actual rating label: ",
("Good Rating" if bool(goodRating) else "Bad Rating"))
("Good Rating" if bool(goodRating) else "Bad Rating"))
Loading