Cosine-Similarity-with-Databricks-and-Python/Cos_Sim_DB_&_Py.py at main · ProjectWorst/Cosine-Similarity-with-Databricks-and-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datetime import datetime, timedelta
from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql.types import StructType
import string
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Set up the path to the stopwords file
stop_words_path = '/filelocation/stopwords.txt'
with open(stop_words_path, 'r') as file:
    stopwords_list = file.read().splitlines()

# Drop existing table if it exists
%sql
DROP TABLE IF EXISTS data_base_location.table_name

# Database and sheet configuration
database_name = 'data_base_name'
sheets = ['sheet1', 'sheet2', 'sheet3', 'sheet4', 'sheet5', 'sheet6', 'sheet7', 'sheet8', 'sheet9', 'sheet10', 'sheet11', 'sheet12']

# Preprocessing function to clean and filter text
def preprocess_text(text):
    """Lowercase the text, remove leading/trailing spaces, and filter out stopwords."""
    text = text.translate(str.maketrans('', '', string.punctuation)
    text = text.strip().lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords_list]
    return ' '.join(tokens)

# Load and preprocess data from each sheet
data_frames = {}
for sheet in sheets:
    # Assuming data_frames is a dictionary containing DataFrames for each sheet
    data_frames[sheet] = pd.read_excel(f"{sheet}.xlsx")  # Example of loading data
    data_frames[sheet]['processed_column_header'] = data_frames[sheet]['column_header'].apply(preprocess_text)

# Initialize and fit the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
all_text = []
for df in data_frames.values():
    all_text.extend(df['processed_column_header'].tolist())
tfidf_vectorizer.fit(all_text)

# Transform text data into TF-IDF matrices
tfidf_matrices = {sheet: tfidf_vectorizer.transform(data_frames[sheet]['processed_column_header']) for sheet in sheets}

# Mapping sheet names to custom labels
sheet_change = {'sheet1': 'DATA1', 'sheet2': 'DATA2', 'sheet3': 'DATA3', 'sheet4': 'DATA4', 'sheet5': 'DATA5', 'sheet6': 'DATA6', 'sheet7': 'DATA7', 'sheet8': 'DATA8', 'sheet9': 'DATA9', 'sheet10': 'DATA10', 'sheet11': 'DATA11', 'sheet12': 'DATA12'}

# Calculate cosine similarity and format the data
formatted_data = []
for idx1, row1 in data_frames[sheets[0]].iterrows():
    first_sheet_text = row1['column_header']
    first_tfidf_matrix = tfidf_matrices[sheets[0]][idx1]

    for sheet in sheets[1:]:
        for idx2, row2 in data_frames[sheet].iterrows():
            other_sheet_text = row2['column_header']
            other_tfidf_matrix = tfidf_matrices[sheet][idx2]

            similarity = cosine_similarity(first_tfidf_matrix, other_tfidf_matrix.reshape(1, -1))[0][0]
            formatted_data.append({
                'Column1': similarity,
                'Column2': idx1,
                'Column3': 'custom',
                'column_header': first_sheet_text,
            })

            formatted_data.append({
                'Column1': similarity,
                'Column2': idx2,
                'Column3': sheet_change[sheet],
                'column_header': other_sheet_text,
            })

# Enable Delta Lake preview feature
spark.sql("SET spark.databricks.delta.preview.enabled = true")

# Convert formatted data to Spark DataFrame and sort by similarity score
formatted_df = pd.DataFrame(formatted_data)
formatted_spark_df = spark.createDataFrame(formatted_df)
sorted_formatted_spark_df = formatted_spark_df.orderBy("Column1", col("Column3").asc())

# Save the results to a Delta table
full_similarity_table_name = f'data_base_location.table_name'
sorted_formatted_spark_df.write.format('delta').mode('overwrite').saveAsTable(full_similarity_table_name)

# Print the schema and show a sample of the stored data
print(f'Similarity scores for {full_similarity_table_name} have been calculated and stored in the database.')
print(formatted_spark_df.printSchema())
ver_df = spark.table(full_similarity_table_name)
ver_df.show(5)