CTLab-ITMO · peterochek · Jun 22, 2025 · Jun 22, 2025 · peterochek · Jun 22, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,14 @@
+# IDE
 .idea
+
+# Python
 __pycache__
-data/*
+.venv
+
+# Data
 tensorboard_logs/*
+data/*
 checkpoints/*
+
+# Dev
+uv.lock
diff --git a/configs/train/letter.json b/configs/train/letter.json
@@ -0,0 +1,173 @@
+{
+  "experiment_name": "letter_data",
+  "best_metric": "validation/ndcg@20",
+  "train_epochs_num": 100,
+  "dataset": {
+    "type": "letter_full",
+    "path_to_data_dir": "../data",
+    "name": "Beauty_letter",
+    "max_sequence_length": 50,
+    "samplers": {
+      "type": "last_item_prediction",
+      "negative_sampler_type": "random"
+    },
+    "beauty_inter_json": "../../LETTER/data/Beauty/Beauty.inter.json"
+  },
+  "dataloader": {
+    "train": {
+      "type": "torch",
+      "batch_size": 256,
+      "batch_processor": {
+        "type": "letter",
+        "beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json",
+        "semantic_length": 4
+      },
+      "drop_last": true,
+      "shuffle": true
+    },
+    "validation": {
+      "type": "torch",
+      "batch_size": 256,
+      "batch_processor": {
+        "type": "letter",
+        "beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json",
+        "semantic_length": 4
+      },
+      "drop_last": false,
+      "shuffle": false
+    }
+  },
+  "model": {
+    "type": "sasrec",
+    "sequence_prefix": "item",
+    "positive_prefix": "positive",
+    "negative_prefix": "negative",
+    "candidate_prefix": "candidates",
+    "embedding_dim": 64,
+    "num_heads": 2,
+    "num_layers": 2,
+    "dim_feedforward": 256,
+    "dropout": 0.3,
+    "activation": "gelu",
+    "layer_norm_eps": 1e-9,
+    "initializer_range": 0.02
+  },
+  "optimizer": {
+    "type": "basic",
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    "clip_grad_threshold": 5.0
+  },
+  "loss": {
+    "type": "composite",
+    "losses": [
+      {
+        "type": "sasrec",
+        "positive_prefix": "positive_scores",
+        "negative_prefix": "negative_scores",
+        "output_prefix": "downstream_loss"
+      }
+    ],
+    "output_prefix": "loss"
+  },
+  "callback": {
+    "type": "composite",
+    "callbacks": [
+      {
+        "type": "metric",
+        "on_step": 1,
+        "loss_prefix": "loss"
+      },
+      {
+        "type": "validation",
+        "on_step": 64,
+        "pred_prefix": "logits",
+        "labels_prefix": "labels",
+        "metrics": {
+          "ndcg@5": {
+            "type": "ndcg",
+            "k": 5
+          },
+          "ndcg@10": {
+            "type": "ndcg",
+            "k": 10
+          },
+          "ndcg@20": {
+            "type": "ndcg",
+            "k": 20
+          },
+          "recall@5": {
+            "type": "recall",
+            "k": 5
+          },
+          "recall@10": {
+            "type": "recall",
+            "k": 10
+          },
+          "recall@20": {
+            "type": "recall",
+            "k": 20
+          },
+          "coverage@5": {
+            "type": "coverage",
+            "k": 5
+          },
+          "coverage@10": {
+            "type": "coverage",
+            "k": 10
+          },
+          "coverage@20": {
+            "type": "coverage",
+            "k": 20
+          }
+        }
+      },
+      {
+        "type": "eval",
+        "on_step": 256,
+        "pred_prefix": "logits",
+        "labels_prefix": "labels",
+        "metrics": {
+          "ndcg@5": {
+            "type": "ndcg",
+            "k": 5
+          },
+          "ndcg@10": {
+            "type": "ndcg",
+            "k": 10
+          },
+          "ndcg@20": {
+            "type": "ndcg",
+            "k": 20
+          },
+          "recall@5": {
+            "type": "recall",
+            "k": 5
+          },
+          "recall@10": {
+            "type": "recall",
+            "k": 10
+          },
+          "recall@20": {
+            "type": "recall",
+            "k": 20
+          },
+          "coverage@5": {
+            "type": "coverage",
+            "k": 5
+          },
+          "coverage@10": {
+            "type": "coverage",
+            "k": 10
+          },
+          "coverage@20": {
+            "type": "coverage",
+            "k": 20
+          }
+        }
+      }
+    ]
+  }
+}
diff --git a/modeling/dataloader/batch_processors.py b/modeling/dataloader/batch_processors.py
@@ -1,3 +1,5 @@
+import json
+import re
 import torch
 from utils import MetaParent
 
@@ -16,7 +18,7 @@ def __call__(self, batch):
 
 class BasicBatchProcessor(BaseBatchProcessor, config_name='basic'):
 
-    def __call__(self, batch):
+    def __call__(self, batch, convert_to_tensor=True):
         processed_batch = {}
 
         for key in batch[0].keys():
@@ -31,7 +33,60 @@ def __call__(self, batch):
                     processed_batch[f'{prefix}.ids'].extend(sample[f'{prefix}.ids'])
                     processed_batch[f'{prefix}.length'].append(sample[f'{prefix}.length'])
 
+        if convert_to_tensor:
+            for part, values in processed_batch.items():
+                processed_batch[part] = torch.tensor(values, dtype=torch.long)
+
+        return processed_batch
+
+
+class LetterBatchProcessor(BasicBatchProcessor, config_name='letter'):
+    """Batch processor that adds flattened semantic ids next to the plain item ids."""
+
+    def __init__(self, mapping: dict[int, list[int]], semantic_length: int):
+        """Keep `mapping` (item id -> semantic ids) as a dict and as a dense tensor.
+
+        Raises ValueError if the item ids are not exactly 0..len(mapping)-1.
+        """
+        self._prefixes = ['item', 'labels', 'positive', 'negative']
+        self._semantic_length = semantic_length
+        self._mapping = mapping
+        if sorted(mapping) != list(range(len(mapping))):  # not an assert: must survive `python -O`
+            raise ValueError("Item ids must be consecutive")
+        self._mapping_tensor = torch.zeros((len(mapping), semantic_length), dtype=torch.long)
+        for item_id, semantic_ids in mapping.items():
+            self._mapping_tensor[item_id] = torch.tensor(semantic_ids, dtype=torch.long)
+
+    @classmethod
+    def create_from_config(cls, config, **kwargs):
+        """Build the processor from `beauty_index_json` and `semantic_length` in `config`.
+
+        Raises ValueError if a token holds no digits or an item has the wrong token count.
+        """
+        semantic_length = config["semantic_length"]
+        with open(config["beauty_index_json"], "r", encoding="utf-8") as f:
+            raw_mapping = json.load(f)
+        parsed = {}
+        for key, tokens in raw_mapping.items():
+            matches = [re.search(r'\d+', token) for token in tokens]
+            if None in matches:  # re.search found no digits in at least one token
+                raise ValueError(f"Semantic id without digits for item {key!r}: {tokens!r}")
+            if len(matches) != semantic_length:
+                raise ValueError("All semantic ids must have the same length")
+            parsed[int(key)] = [int(match.group()) for match in matches]
+        return cls(mapping=parsed, semantic_length=semantic_length)
+
+    def __call__(self, batch):
+        """Collate `batch`, add `semantic_<prefix>.ids/.length` and `all_semantic_ids`."""
+        processed_batch = super().__call__(batch, convert_to_tensor=False)
+        for prefix in self._prefixes:
+            if f"{prefix}.ids" in processed_batch:
+                ids, lengths = processed_batch[f"{prefix}.ids"], processed_batch[f"{prefix}.length"]
+                processed_batch[f"semantic_{prefix}.ids"] = [code for _id in ids for code in self._mapping[_id]]
+                processed_batch[f"semantic_{prefix}.length"] = [n * self._semantic_length for n in lengths]
         for part, values in processed_batch.items():
             processed_batch[part] = torch.tensor(values, dtype=torch.long)
+        # Attached after the conversion loop because the lookup table is already a tensor.
+        processed_batch["all_semantic_ids"] = self._mapping_tensor
 
         return processed_batch