Fix .rank() method for multiple models (#615)

hieuddo · web-flow · commit ae7ba8601aa3 · 2024-04-25T04:00:06.000+08:00
The new Recommender.rank() function adds k as a required argument, which breaks some models whose own .rank() implementations do not take k for ranking evaluation (e.g., ComparER, EFM, LRPPM).

This commit updates .rank() for the mentioned models to accept k and support the top-k option.
diff --git a/cornac/models/comparer/recom_comparer_obj.pyx b/cornac/models/comparer/recom_comparer_obj.pyx
@@ -663,39 +663,51 @@ class ComparERObj(Recommender):
             item_score = self.U2[item_id, :].dot(self.U1[user_id, :]) + self.H2[item_id, :].dot(self.H1[user_id, :])
             return item_score
 
-    def rank(self, user_id, item_ids=None):
+    def rank(self, user_idx, item_indices=None, k=-1):
         """Rank all test items for a given user.
 
         Parameters
         ----------
-        user_id: int, required
+        user_idx: int, required
             The index of the user for whom to perform item raking.
 
-        item_ids: 1d array, optional, default: None
+        item_indices: 1d array, optional, default: None
             A list of candidate item indices to be ranked by the user.
             If `None`, list of ranked known item indices and their scores will be returned
 
+        k: int, optional, default: -1
+            Cut-off length for recommendations, k=-1 will return ranked list of all items.
+            This is more important for ANN to know the limit to avoid exhaustive ranking.
+
         Returns
         -------
-        Tuple of `item_rank`, and `item_scores`. The order of values
-        in item_scores are corresponding to the order of their ids in item_ids
+        (ranked_items, item_scores): tuple
+            `ranked_items` contains item indices being ranked by their scores.
+            `item_scores` contains scores of items corresponding to index in `item_indices` input.
 
         """
-        X_ = self.U1[user_id, :].dot(self.V.T)
+        X_ = self.U1[user_idx, :].dot(self.V.T)
         most_cared_aspects_indices = (-X_).argsort()[:self.num_most_cared_aspects]
         most_cared_X_ = X_[most_cared_aspects_indices]
         most_cared_Y_ = self.U2.dot(self.V[most_cared_aspects_indices, :].T)
         explicit_scores = most_cared_X_.dot(most_cared_Y_.T) / (self.num_most_cared_aspects * self.rating_scale)
-        item_scores = self.alpha * explicit_scores + (1 - self.alpha) * self.score(user_id)
-
-        if item_ids is None:
-            item_scores = item_scores
-            item_rank = item_scores.argsort()[::-1]
-        else:
-            num_items = max(self.num_items, max(item_ids) + 1)
-            item_scores = np.ones(num_items) * np.min(item_scores)
-            item_scores[:self.num_items] = item_scores
-            item_rank = item_scores.argsort()[::-1]
-            item_rank = intersects(item_rank, item_ids, assume_unique=True)
-            item_scores = item_scores[item_ids]
-        return item_rank, item_scores
+        all_item_scores = self.alpha * explicit_scores + (1 - self.alpha) * self.score(user_idx)
+
+        # rank items based on their scores
+        item_indices = (
+            np.arange(self.num_items)
+            if item_indices is None
+            else np.asarray(item_indices)
+        )
+        item_scores = all_item_scores[item_indices]
+
+        if k != -1:  # O(n + k log k), faster for small k which is usually the case
+            partitioned_idx = np.argpartition(item_scores, -k)
+            top_k_idx = partitioned_idx[-k:]
+            sorted_top_k_idx = top_k_idx[np.argsort(item_scores[top_k_idx])]
+            partitioned_idx[-k:] = sorted_top_k_idx
+            ranked_items = item_indices[partitioned_idx[::-1]]
+        else:  # O(n log n)
+            ranked_items = item_indices[item_scores.argsort()[::-1]]
+
+        return ranked_items, item_scores
diff --git a/cornac/models/comparer/recom_comparer_sub.pyx b/cornac/models/comparer/recom_comparer_sub.pyx
@@ -759,7 +759,7 @@ class ComparERSub(MTER):
 
         return correct, skipped, loss, bpr_loss
 
-    def rank(self, user_idx, item_indices=None):
+    def rank(self, user_idx, item_indices=None, k=-1):
         if self.alpha > 0 and self.n_top_aspects > 0:
             n_top_aspects = min(self.n_top_aspects, self.num_aspects)
             ts1 = np.einsum("abc,a->bc", self.G1, self.U[user_idx])
@@ -786,12 +786,21 @@ class ComparERSub(MTER):
                 all_item_scores[: self.num_items] = known_item_scores
 
             # rank items based on their scores
-            if item_indices is None:
-                item_scores = all_item_scores[: self.num_items]
-                item_rank = item_scores.argsort()[::-1]
-            else:
-                item_scores = all_item_scores[item_indices]
-                item_rank = np.array(item_indices)[item_scores.argsort()[::-1]]
-
-            return item_rank, item_scores
-        return super().rank(user_idx, item_indices)
+            item_indices = (
+                np.arange(self.num_items)
+                if item_indices is None
+                else np.asarray(item_indices)
+            )
+            item_scores = all_item_scores[item_indices]
+                
+            if k != -1:  # O(n + k log k), faster for small k which is usually the case
+                partitioned_idx = np.argpartition(item_scores, -k)
+                top_k_idx = partitioned_idx[-k:]
+                sorted_top_k_idx = top_k_idx[np.argsort(item_scores[top_k_idx])]
+                partitioned_idx[-k:] = sorted_top_k_idx
+                ranked_items = item_indices[partitioned_idx[::-1]]
+            else:  # O(n log n)
+                ranked_items = item_indices[item_scores.argsort()[::-1]]
+
+            return ranked_items, item_scores
+        return super().rank(user_idx, item_indices, k)
diff --git a/cornac/models/efm/recom_efm.pyx b/cornac/models/efm/recom_efm.pyx
@@ -468,7 +468,7 @@ class EFM(Recommender):
             item_score = self.U2[item_idx, :].dot(self.U1[user_idx, :]) + self.H2[item_idx, :].dot(self.H1[user_idx, :])
             return item_score
 
-    def rank(self, user_idx, item_indices=None):
+    def rank(self, user_idx, item_indices=None, k=-1):
         """Rank all test items for a given user.
 
         Parameters
@@ -480,10 +480,15 @@ class EFM(Recommender):
             A list of candidate item indices to be ranked by the user.
             If `None`, list of ranked known item indices and their scores will be returned
 
+        k: int, optional, default: -1
+            Cut-off length for recommendations, k=-1 will return ranked list of all items.
+            This is more important for ANN to know the limit to avoid exhaustive ranking.
+
         Returns
         -------
-        Tuple of `item_rank`, and `item_scores`. The order of values
-        in item_scores are corresponding to the order of their ids in item_ids
+        (ranked_items, item_scores): tuple
+            `ranked_items` contains item indices being ranked by their scores.
+            `item_scores` contains scores of items corresponding to index in `item_indices` input.
 
         """
         X_ = self.U1[user_idx, :].dot(self.V.T)
@@ -504,11 +509,20 @@ class EFM(Recommender):
             all_item_scores[: self.num_items] = known_item_scores
 
         # rank items based on their scores
-        if item_indices is None:
-            item_scores = all_item_scores[: self.num_items]
-            item_rank = item_scores.argsort()[::-1]
-        else:
-            item_scores = all_item_scores[item_indices]
-            item_rank = np.array(item_indices)[item_scores.argsort()[::-1]]
-
-        return item_rank, item_scores
+        item_indices = (
+            np.arange(self.num_items)
+            if item_indices is None
+            else np.asarray(item_indices)
+        )
+        item_scores = all_item_scores[item_indices]
+
+        if k != -1:  # O(n + k log k), faster for small k which is usually the case
+            partitioned_idx = np.argpartition(item_scores, -k)
+            top_k_idx = partitioned_idx[-k:]
+            sorted_top_k_idx = top_k_idx[np.argsort(item_scores[top_k_idx])]
+            partitioned_idx[-k:] = sorted_top_k_idx
+            ranked_items = item_indices[partitioned_idx[::-1]]
+        else:  # O(n log n)
+            ranked_items = item_indices[item_scores.argsort()[::-1]]
+
+        return ranked_items, item_scores
diff --git a/cornac/models/lrppm/recom_lrppm.pyx b/cornac/models/lrppm/recom_lrppm.pyx
@@ -516,7 +516,7 @@ class LRPPM(Recommender):
             item_score = self.I[i_idx].dot(self.U[u_idx])
             return item_score
 
-    def rank(self, user_idx, item_indices=None):
+    def rank(self, user_idx, item_indices=None, k=-1):
         if self.alpha > 0 and self.num_top_aspects > 0:
             n_items = self.num_items
             num_top_aspects = min(self.num_top_aspects, self.num_aspects)
@@ -540,12 +540,21 @@ class LRPPM(Recommender):
                 all_item_scores[: self.num_items] = known_item_scores
 
             # rank items based on their scores
-            if item_indices is None:
-                item_scores = all_item_scores[: self.num_items]
-                item_rank = item_scores.argsort()[::-1]
-            else:
-                item_scores = all_item_scores[item_indices]
-                item_rank = np.array(item_indices)[item_scores.argsort()[::-1]]
-
-            return item_rank, item_scores
-        return super().rank(user_idx, item_indices)
+            item_indices = (
+                np.arange(self.num_items)
+                if item_indices is None
+                else np.asarray(item_indices)
+            )
+            item_scores = all_item_scores[item_indices]
+
+            if k != -1:  # O(n + k log k), faster for small k which is usually the case
+                partitioned_idx = np.argpartition(item_scores, -k)
+                top_k_idx = partitioned_idx[-k:]
+                sorted_top_k_idx = top_k_idx[np.argsort(item_scores[top_k_idx])]
+                partitioned_idx[-k:] = sorted_top_k_idx
+                ranked_items = item_indices[partitioned_idx[::-1]]
+            else:  # O(n log n)
+                ranked_items = item_indices[item_scores.argsort()[::-1]]
+
+            return ranked_items, item_scores
+        return super().rank(user_idx, item_indices, k)