diff --git a/.gitignore b/.gitignore
index 5576121c..9041dd9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,14 @@
+# IDE
 .idea
+
+# Python
 __pycache__
-data/*
+.venv
+
+# Data
 tensorboard_logs/*
+data/*
 checkpoints/*
+
+# Dev
+uv.lock
diff --git a/configs/train/letter.json b/configs/train/letter.json
new file mode 100644
index 00000000..390313cc
--- /dev/null
+++ b/configs/train/letter.json
@@ -0,0 +1,173 @@
+{
+  "experiment_name": "letter_data",
+  "best_metric": "validation/ndcg@20",
+  "train_epochs_num": 100,
+  "dataset": {
+    "type": "letter_full",
+    "path_to_data_dir": "../data",
+    "name": "Beauty_letter",
+    "max_sequence_length": 50,
+    "samplers": {
+      "type": "last_item_prediction",
+      "negative_sampler_type": "random"
+    },
+    "beauty_inter_json": "../../LETTER/data/Beauty/Beauty.inter.json"
+  },
+  "dataloader": {
+    "train": {
+      "type": "torch",
+      "batch_size": 256,
+      "batch_processor": {
+        "type": "letter",
+        "beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json",
+        "semantic_length": 4
+      },
+      "drop_last": true,
+      "shuffle": true
+    },
+    "validation": {
+      "type": "torch",
+      "batch_size": 256,
+      "batch_processor": {
+        "type": "letter",
+        "beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json",
+        "semantic_length": 4
+      },
+      "drop_last": false,
+      "shuffle": false
+    }
+  },
+  "model": {
+    "type": "sasrec",
+    "sequence_prefix": "item",
+    "positive_prefix": "positive",
+    "negative_prefix": "negative",
+    "candidate_prefix": "candidates",
+    "embedding_dim": 64,
+    "num_heads": 2,
+    "num_layers": 2,
+    "dim_feedforward": 256,
+    "dropout": 0.3,
+    "activation": "gelu",
+    "layer_norm_eps": 1e-9,
+    "initializer_range": 0.02
+  },
+  "optimizer": {
+    "type": "basic",
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    "clip_grad_threshold": 5.0
+  },
+  "loss": {
+    "type": "composite",
+    "losses": [
+      {
+        "type": "sasrec",
+        "positive_prefix": "positive_scores",
+        "negative_prefix": "negative_scores",
+        "output_prefix": "downstream_loss"
+      }
+    ],
+    "output_prefix": "loss"
+  },
+  "callback": {
+    "type": "composite",
+    "callbacks": [
+      {
+        "type": "metric",
+        "on_step": 1,
+        "loss_prefix": "loss"
+      },
+      {
+        "type": "validation",
+        "on_step": 64,
+        "pred_prefix": "logits",
+        "labels_prefix": "labels",
+        "metrics": {
+          "ndcg@5": {
+            "type": "ndcg",
+            "k": 5
+          },
+          "ndcg@10": {
+            "type": "ndcg",
+            "k": 10
+          },
+          "ndcg@20": {
+            "type": "ndcg",
+            "k": 20
+          },
+          "recall@5": {
+            "type": "recall",
+            "k": 5
+          },
+          "recall@10": {
+            "type": "recall",
+            "k": 10
+          },
+          "recall@20": {
+            "type": "recall",
+            "k": 20
+          },
+          "coverage@5": {
+            "type": "coverage",
+            "k": 5
+          },
+          "coverage@10": {
+            "type": "coverage",
+            "k": 10
+          },
+          "coverage@20": {
+            "type": "coverage",
+            "k": 20
+          }
+        }
+      },
+      {
+        "type": "eval",
+        "on_step": 256,
+        "pred_prefix": "logits",
+        "labels_prefix": "labels",
+        "metrics": {
+          "ndcg@5": {
+            "type": "ndcg",
+            "k": 5
+          },
+          "ndcg@10": {
+            "type": "ndcg",
+            "k": 10
+          },
+          "ndcg@20": {
+            "type": "ndcg",
+            "k": 20
+          },
+          "recall@5": {
+            "type": "recall",
+            "k": 5
+          },
+          "recall@10": {
+            "type": "recall",
+            "k": 10
+          },
+          "recall@20": {
+            "type": "recall",
+            "k": 20
+          },
+          "coverage@5": {
+            "type": "coverage",
+            "k": 5
+          },
+          "coverage@10": {
+            "type": "coverage",
+            "k": 10
+          },
+          "coverage@20": {
+            "type": "coverage",
+            "k": 20
+          }
+        }
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/modeling/dataloader/batch_processors.py b/modeling/dataloader/batch_processors.py
index 436f98fe..307aa53d 100644
--- a/modeling/dataloader/batch_processors.py
+++ b/modeling/dataloader/batch_processors.py
@@ -1,3 +1,5 @@
+import json
+import re
 import torch
 from utils import MetaParent
 
@@ -16,7 +18,7 @@ def __call__(self, batch):
 
 class BasicBatchProcessor(BaseBatchProcessor, config_name='basic'):
 
-    def __call__(self, batch):
+    def __call__(self, batch, convert_to_tensor=True):
         processed_batch = {}
 
         for key in batch[0].keys():
@@ -31,7 +33,60 @@ def __call__(self, batch):
                     processed_batch[f'{prefix}.ids'].extend(sample[f'{prefix}.ids'])
                     processed_batch[f'{prefix}.length'].append(sample[f'{prefix}.length'])
 
+        if convert_to_tensor:
+            for part, values in processed_batch.items():
+                processed_batch[part] = torch.tensor(values, dtype=torch.long)
+
+        return processed_batch
+
+
+class LetterBatchProcessor(BasicBatchProcessor, config_name='letter'):
+    """Collate a batch and add per-item semantic ids taken from a fixed item -> ids mapping."""
+
+    def __init__(self, mapping: dict[int, list[int]], semantic_length: int):
+        self._prefixes = ['item', 'labels', 'positive', 'negative']
+        self._semantic_length = semantic_length
+        self._mapping = mapping
+        # Row i of the lookup tensor describes item i, hence ids must be exactly 0..N-1.
+        assert sorted(mapping.keys()) == list(range(len(mapping))), "Item ids must be consecutive"
+        self._mapping_tensor = torch.zeros((len(mapping), semantic_length), dtype=torch.long)
+        for item_id, semantic_ids in mapping.items():
+            self._mapping_tensor[item_id] = torch.tensor(semantic_ids, dtype=torch.long)
+
+    @classmethod
+    def create_from_config(cls, config, **kwargs):
+        """Build the processor from the JSON file at ``config["beauty_index_json"]``."""
+        with open(config["beauty_index_json"], "r", encoding="utf-8") as f:
+            mapping = json.load(f)
+        semantic_length = config["semantic_length"]
+        number_pattern = re.compile(r'\d+')  # hoisted: reused for every token
+        parsed = {}
+        for key, semantic_ids in mapping.items():
+            numbers = []
+            for token in semantic_ids:
+                match = number_pattern.search(token)
+                if match is None:  # used to surface as AttributeError on None.group()
+                    raise ValueError(f"No numeric id in semantic token {token!r} of item {key!r}")
+                numbers.append(int(match.group()))
+            assert len(numbers) == semantic_length, "All semantic ids must have the same length"
+            parsed[int(key)] = numbers
+        return cls(mapping=parsed, semantic_length=semantic_length)
+
+    def __call__(self, batch):
+        """Collate ``batch`` and attach ``semantic_*`` entries; every value is a tensor."""
+        processed_batch = super().__call__(batch, convert_to_tensor=False)
+        for prefix in self._prefixes:
+            if f"{prefix}.ids" in processed_batch:
+                ids = processed_batch[f"{prefix}.ids"]
+                lengths = processed_batch[f"{prefix}.length"]
+                # Each item id expands into its semantic ids, in order.
+                processed_batch[f"semantic_{prefix}.ids"] = [s for _id in ids for s in self._mapping[_id]]
+                processed_batch[f"semantic_{prefix}.length"] = [length * self._semantic_length for length in lengths]
+
         for part, values in processed_batch.items():
             processed_batch[part] = torch.tensor(values, dtype=torch.long)
+
+        # Already a tensor, so it is attached after the list -> tensor conversion above.
+        processed_batch["all_semantic_ids"] = self._mapping_tensor
 
         return processed_batch
diff --git a/modeling/dataset/base.py b/modeling/dataset/base.py
index 42a1516e..dc0e5178 100644
--- a/modeling/dataset/base.py
+++ b/modeling/dataset/base.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+import json
 
 from tqdm import tqdm
 
@@ -616,3 +617,193 @@ def meta(self):
             'num_items': self.num_items,
             'max_sequence_length': self.max_sequence_length
         }
+
+
+class ScientificFullDataset(ScientificDataset, config_name="scientific_full"):
+    """Sequence dataset whose train split holds every growing prefix of a user history.
+
+    Data is read from ``<path_to_data_dir>/<name>/all_data.txt``; each line is
+    ``user_id item_id item_id ...`` separated by whitespace.
+    """
+
+    def __init__(
+        self,
+        train_sampler,
+        validation_sampler,
+        test_sampler,
+        num_users,
+        num_items,
+        max_sequence_length,
+    ):
+        """Store the three samplers, the user/item counts and the maximum sequence length."""
+        self._train_sampler = train_sampler
+        self._validation_sampler = validation_sampler
+        self._test_sampler = test_sampler
+        self._num_users = num_users
+        self._num_items = num_items
+        self._max_sequence_length = max_sequence_length
+
+    @classmethod
+    def create_from_config(cls, config, **kwargs):
+        """Parse ``all_data.txt`` and build the dataset with its three samplers.
+
+        Example for one history ``[1, 2, ..., 10]``:
+
+        * train: prefixes of length 5..10 without their last two items, i.e.
+          ``[1, 2, 3]``, ``[1, 2, 3, 4]``, ..., ``[1, 2, ..., 8]``;
+        * validation: ``[1, 2, ..., 9]`` (the history without its last item);
+        * test: the full history.
+
+        Every sequence keeps only its last ``max_sequence_length`` items.
+
+        Args:
+            config: mapping with ``path_to_data_dir``, ``name``,
+                ``max_sequence_length`` and ``samplers`` entries.
+            **kwargs: not used by this method.
+
+        Returns:
+            A ``cls`` instance; ``num_users`` / ``num_items`` are the largest
+            user and item ids seen in the file.
+
+        Raises:
+            AssertionError: if a history has fewer than 5 items or a produced
+                sequence contains a repeated item.
+        """
+        data_dir_path = os.path.join(config["path_to_data_dir"], config["name"])
+        dataset_path = os.path.join(data_dir_path, "all_data.txt")
+        max_sequence_length = config["max_sequence_length"]
+        max_user_id, max_item_id = 0, 0
+        train_dataset, validation_dataset, test_dataset = [], [], []
+
+        def make_sample(user_id, sequence):
+            """Return one sample dict for the last ``max_sequence_length`` items."""
+            # Truncate once and reuse the result instead of re-slicing per field.
+            sequence = sequence[-max_sequence_length:]
+            assert len(sequence) == len(set(sequence))
+            return {
+                "user.ids": [user_id],
+                "user.length": 1,
+                "item.ids": sequence,
+                "item.length": len(sequence),
+            }
+
+        with open(dataset_path, "r") as f:
+            for line in f:
+                fields = line.split()
+                if not fields:
+                    # Blank lines (e.g. a trailing newline) used to crash in int('').
+                    continue
+
+                user_id = int(fields[0])
+                item_ids = [int(item_id) for item_id in fields[1:]]
+
+                # Checked before max(): an empty history would otherwise raise
+                # ValueError from max() and never reach this assertion.
+                assert len(item_ids) >= 5
+
+                max_user_id = max(max_user_id, user_id)
+                max_item_id = max(max_item_id, max(item_ids))
+
+                # TODO no sliding window, only incrementing sequence from last 50 items
+                for prefix_length in range(5, len(item_ids) + 1):
+                    # The last two items of each prefix are left out of the train input.
+                    train_dataset.append(make_sample(user_id, item_ids[:prefix_length - 2]))
+
+                # Validation input: the history without its last item.
+                validation_dataset.append(make_sample(user_id, item_ids[:-1]))
+                # Test input: the full history.
+                test_dataset.append(make_sample(user_id, item_ids))
+
+        logger.info("Train dataset size: {}".format(len(train_dataset)))
+        logger.info("Validation dataset size: {}".format(len(validation_dataset)))
+        logger.info("Test dataset size: {}".format(len(test_dataset)))
+        logger.info("Max user id: {}".format(max_user_id))
+        logger.info("Max item id: {}".format(max_item_id))
+        logger.info("Max sequence length: {}".format(max_sequence_length))
+
+        # NOTE(review): divides by the max ids, so this raises ZeroDivisionError
+        # when either is 0 (e.g. an empty data file); the formula is kept as it was.
+        logger.info(
+            "{} dataset sparsity: {}".format(
+                config["name"],
+                (len(train_dataset) + len(test_dataset)) / max_user_id / max_item_id,
+            )
+        )
+
+        # NOTE(review): the largest ids are passed as the user/item counts. If ids
+        # are 0-based this is one short -- confirm against the samplers.
+        train_sampler = TrainSampler.create_from_config(
+            config["samplers"],
+            dataset=train_dataset,
+            num_users=max_user_id,
+            num_items=max_item_id,
+        )
+        validation_sampler = EvalSampler.create_from_config(
+            config["samplers"],
+            dataset=validation_dataset,
+            num_users=max_user_id,
+            num_items=max_item_id,
+        )
+        test_sampler = EvalSampler.create_from_config(
+            config["samplers"],
+            dataset=test_dataset,
+            num_users=max_user_id,
+            num_items=max_item_id,
+        )
+
+        return cls(
+            train_sampler=train_sampler,
+            validation_sampler=validation_sampler,
+            test_sampler=test_sampler,
+            num_users=max_user_id,
+            num_items=max_item_id,
+            max_sequence_length=max_sequence_length,
+        )
+
+
+class LetterFullDataset(ScientificFullDataset, config_name="letter_full"):
+    """``ScientificFullDataset`` fed from a LETTER-style interactions JSON file.
+
+    The JSON object maps each user id to its list of item ids. It is first
+    dumped to ``<path_to_data_dir>/<name>/all_data.txt`` (one
+    ``user_id item_id ...`` line per user) and then loaded by the parent class.
+    The constructor is inherited unchanged from ``ScientificFullDataset``.
+    """
+
+    @classmethod
+    def create_from_config(cls, config, **kwargs):
+        """Convert the interactions JSON to ``all_data.txt`` and build the dataset.
+
+        Args:
+            config: parent config plus ``beauty_inter_json``, the path of the
+                JSON file to convert.
+            **kwargs: forwarded to the parent ``create_from_config``.
+
+        Returns:
+            A ``cls`` instance built by the parent ``create_from_config``.
+
+        Raises:
+            AssertionError: from the parent loader, e.g. when a user has fewer
+                than 5 items.
+        """
+        with open(config["beauty_inter_json"], "r", encoding="utf-8") as f:
+            user_interactions = json.load(f)
+
+        dir_path = os.path.join(config["path_to_data_dir"], config["name"])
+
+        os.makedirs(dir_path, exist_ok=True)
+        dataset_path = os.path.join(dir_path, "all_data.txt")
+
+        logger.info(f"Saving data to {dataset_path}")
+
+        # Map from LETTER format to our format; the file is rewritten on every call.
+        # JSON object keys are strings; the parent loader converts them with int().
+        with open(dataset_path, "w") as f:
+            for user_id, item_ids in user_interactions.items():
+                items_repr = map(str, item_ids)
+                f.write(f"{user_id} {' '.join(items_repr)}\n")
+
+        # The parent classmethod instantiates ``cls`` itself, so there is no need
+        # to build a temporary ScientificFullDataset and copy its private fields;
+        # the constructor is inherited from the parent for the same reason.
+        return super().create_from_config(config, **kwargs)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..f32390d0
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+[project]
+name = "irec"
+version = "0.1.0"
+description = "IRec framework"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = [
+    "faiss-cpu>=1",
+    "pandas>=2",
+    "scipy>=1",
+    "seaborn>=0.13.2",
+    "tensorboard>=2",
+    "torch>=2.7",
+    "transformers>=4.51",
+    "tqdm>=4",
+    "jupyter>=1",
+]
+
+[tool.uv.sources]
+torch = [
+  { index = "pytorch-cu128", marker = "sys_platform != 'darwin'" },
+  { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" },
+]
+
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
+
+[[tool.uv.index]]
+name = "pytorch-cpu"
+url = "https://download.pytorch.org/whl/cpu"
+explicit = true
+
+[dependency-groups]
+dev = [
+    "ruff>=0.11.4",
+]