# utils/recommender.py
# Contains all recommendation logic: scoring and filtering projects.
# Skill matching uses vector similarity (TF-IDF weighting + cosine similarity).
# Kept separate from routing so it can be tested and extended independently.
45
56from utils .data_loader import load_all_projects
7+ from sklearn .feature_extraction .text import TfidfVectorizer
8+ from sklearn .metrics .pairwise import cosine_similarity
69
# Maximum number of recommendations returned to the user
MAX_RESULTS = 3

# Scale factor that converts a cosine similarity (0.0-1.0) into a 0-10 range
# so the skill component is comparable to the bonus points below
# (level + interest + time = 5 points at most).
SIMILARITY_SCALE = 10

# Scoring weights, kept as a dict for backward compatibility and reference.
# "level", "interest" and "time" are awarded as fixed bonus points.
# NOTE: "skill" is retained only so existing lookups keep working; the
# similarity-based scorer does not read it.
SCORING_WEIGHTS = {
    "skill": 3,
    "level": 2,
    "interest": 2,
    "time": 1,
}

# Individual weight constants for clarity inside the scoring function
WEIGHT_LEVEL = SCORING_WEIGHTS["level"]
WEIGHT_INTEREST = SCORING_WEIGHTS["interest"]
WEIGHT_TIME = SCORING_WEIGHTS["time"]
2030
2131# Common aliases and abbreviations for skills
2232# This improves recommendation accuracy by normalizing user input
@@ -38,7 +48,6 @@ def parse_skills(skills_string):
3848 Example:
3949 "JS, HTML5, CSS3" -> ["javascript", "html", "css"]
4050 """
41-
4251 raw_skills = [
4352 s .strip ().lower ()
4453 for s in skills_string .split ("," )
@@ -53,42 +62,76 @@ def parse_skills(skills_string):
5362 return normalized_skills
5463
5564
def compute_skill_similarity(user_skills, project_skills):
    """
    Compute the cosine similarity between a user's skills and a project's
    skills using TF-IDF vectorization.

    Steps:
        1. Join each skill list into a single whitespace-separated string.
        2. Fit a TF-IDF vectorizer on the two strings.
        3. Compute the cosine similarity between the two vectors.

    Args:
        user_skills: list of skill strings supplied by the user.
        project_skills: list of skill strings required by the project.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when either list is empty
        or when no usable token can be extracted from the input.

    Example:
        user_skills    = ["python", "html"]
        project_skills = ["python", "css", "html"]
        returns ~0.71 (two of the three project skills are shared)
    """
    # Nothing to compare against: treat as no similarity.
    if not user_skills or not project_skills:
        return 0.0

    # TF-IDF works on text documents, so flatten each list into one string.
    user_text = " ".join(user_skills)
    project_text = " ".join(project_skills)

    # Tokenize on whitespace only. The vectorizer's default token pattern
    # keeps only word tokens of two or more characters, which silently drops
    # skills such as "c" or "r" and mangles "c++", "c#" or "node.js".
    # lowercase=True makes the comparison case-insensitive on both sides.
    vectorizer = TfidfVectorizer(lowercase=True, token_pattern=r"(?u)\S+")
    try:
        tfidf_matrix = vectorizer.fit_transform([user_text, project_text])
    except ValueError:
        # Raised when the vocabulary is empty (e.g. only blank strings).
        return 0.0

    # Row 0 is the user vector, row 1 is the project vector.
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

    # Clamp to guard against floating-point drift just outside [0.0, 1.0].
    return min(1.0, max(0.0, float(similarity[0][0])))
101+
102+
def _same_option(project_value, user_value):
    """Return True when two option values match case-insensitively.

    None is treated as an empty string so a missing/null field cannot raise.
    """
    left = "" if project_value is None else str(project_value)
    right = "" if user_value is None else str(user_value)
    return left.lower() == right.lower()


def score_single_project(
        project, user_skills,
        level, interest, time_availability):
    """
    Calculate a relevance score for one project.

    The score combines:
      - TF-IDF cosine similarity for skill matching (0.0 to 1.0), multiplied
        by SIMILARITY_SCALE so it spans 0-10.
      - Fixed bonus points for a matching level, interest and time
        availability (WEIGHT_LEVEL, WEIGHT_INTEREST, WEIGHT_TIME).

    Args:
        project: dict describing the project; the keys read here are
            "skills", "level", "interest" and "time".
        user_skills: list of the user's skill strings.
        level: the user's selected level.
        interest: the user's selected interest.
        time_availability: the user's selected time availability.

    Returns:
        A float score; higher means a better match.
    """
    # Vector similarity-based skill score (between 0.0 and 1.0).
    # `or []` also covers a project whose "skills" value is explicitly null.
    project_skills = project.get("skills") or []
    skill_score = compute_skill_similarity(user_skills, project_skills)

    # Fixed points for the remaining criteria.
    bonus_score = 0

    if _same_option(project.get("level", ""), level):
        bonus_score += WEIGHT_LEVEL

    if _same_option(project.get("interest", ""), interest):
        bonus_score += WEIGHT_INTEREST

    if _same_option(project.get("time", ""), time_availability):
        bonus_score += WEIGHT_TIME

    # Combine: skill similarity (scaled) + bonus points.
    return (skill_score * SIMILARITY_SCALE) + bonus_score
92135
93136
94137def get_recommendations (skills_string , level , interest , time_availability ):
@@ -97,7 +140,7 @@ def get_recommendations(skills_string, level, interest, time_availability):
97140
98141 Steps:
99142 1. Parse the raw skills input into a list.
100- 2. Score every project in the dataset .
143+ 2. Compute cosine similarity score for every project .
101144 3. Drop projects with a score of zero (no overlap at all).
102145 4. Sort by score descending.
103146 5. Return the top MAX_RESULTS projects.
@@ -143,4 +186,4 @@ def validate_recommendation_inputs(skills, level, interest, time_availability):
143186 if not time_availability or not time_availability .strip ():
144187 errors .append ("Please select your time availability." )
145188
146- return errors
189+ return errors
0 commit comments