diff --git a/.gitignore b/.gitignore index 5576121c..9041dd9e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,14 @@ +# IDE .idea + +# Python __pycache__ -data/* +.venv + +# Data tensorboard_logs/* +data/* checkpoints/* + +# Dev +uv.lock diff --git a/configs/train/letter.json b/configs/train/letter.json new file mode 100644 index 00000000..390313cc --- /dev/null +++ b/configs/train/letter.json @@ -0,0 +1,173 @@ +{ + "experiment_name": "letter_data", + "best_metric": "validation/ndcg@20", + "train_epochs_num": 100, + "dataset": { + "type": "letter_full", + "path_to_data_dir": "../data", + "name": "Beauty_letter", + "max_sequence_length": 50, + "samplers": { + "type": "last_item_prediction", + "negative_sampler_type": "random" + }, + "beauty_inter_json": "../../LETTER/data/Beauty/Beauty.inter.json" + }, + "dataloader": { + "train": { + "type": "torch", + "batch_size": 256, + "batch_processor": { + "type": "letter", + "beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json", + "semantic_length": 4 + }, + "drop_last": true, + "shuffle": true + }, + "validation": { + "type": "torch", + "batch_size": 256, + "batch_processor": { + "type": "letter", + "beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json", + "semantic_length": 4 + }, + "drop_last": false, + "shuffle": false + } + }, + "model": { + "type": "sasrec", + "sequence_prefix": "item", + "positive_prefix": "positive", + "negative_prefix": "negative", + "candidate_prefix": "candidates", + "embedding_dim": 64, + "num_heads": 2, + "num_layers": 2, + "dim_feedforward": 256, + "dropout": 0.3, + "activation": "gelu", + "layer_norm_eps": 1e-9, + "initializer_range": 0.02 + }, + "optimizer": { + "type": "basic", + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "clip_grad_threshold": 5.0 + }, + "loss": { + "type": "composite", + "losses": [ + { + "type": "sasrec", + "positive_prefix": "positive_scores", + "negative_prefix": "negative_scores", + "output_prefix": "downstream_loss" + } + ], + "output_prefix": "loss" + }, + "callback": { + "type": "composite", + "callbacks": [ + { + "type": "metric", + "on_step": 1, + "loss_prefix": "loss" + }, + { + "type": "validation", + "on_step": 64, + "pred_prefix": "logits", + "labels_prefix": "labels", + "metrics": { + "ndcg@5": { + "type": "ndcg", + "k": 5 + }, + "ndcg@10": { + "type": "ndcg", + "k": 10 + }, + "ndcg@20": { + "type": "ndcg", + "k": 20 + }, + "recall@5": { + "type": "recall", + "k": 5 + }, + "recall@10": { + "type": "recall", + "k": 10 + }, + "recall@20": { + "type": "recall", + "k": 20 + }, + "coverage@5": { + "type": "coverage", + "k": 5 + }, + "coverage@10": { + "type": "coverage", + "k": 10 + }, + "coverage@20": { + "type": "coverage", + "k": 20 + } + } + }, + { + "type": "eval", + "on_step": 256, + "pred_prefix": "logits", + "labels_prefix": "labels", + "metrics": { + "ndcg@5": { + "type": "ndcg", + "k": 5 + }, + "ndcg@10": { + "type": "ndcg", + "k": 10 + }, + "ndcg@20": { + "type": "ndcg", + "k": 20 + }, + "recall@5": { + "type": "recall", + "k": 5 + }, + "recall@10": { + "type": "recall", + "k": 10 + }, + "recall@20": { + "type": "recall", + "k": 20 + }, + "coverage@5": { + "type": "coverage", + "k": 5 + }, + "coverage@10": { + "type": "coverage", + "k": 10 + }, + "coverage@20": { + "type": "coverage", + "k": 20 + } + } + } + ] + } +} \ No newline at end of file diff --git a/modeling/dataloader/batch_processors.py b/modeling/dataloader/batch_processors.py index 436f98fe..307aa53d 100644 --- a/modeling/dataloader/batch_processors.py +++ b/modeling/dataloader/batch_processors.py @@ -1,3 +1,5 @@ +import json +import re import torch from utils import MetaParent @@ -16,7 +18,7 @@ def __call__(self, batch): class BasicBatchProcessor(BaseBatchProcessor, config_name='basic'): - def __call__(self, batch): + def __call__(self, batch, convert_to_tensor=True): processed_batch = {} for key in batch[0].keys(): @@ -31,7 +33,60 @@ def __call__(self, batch): processed_batch[f'{prefix}.ids'].extend(sample[f'{prefix}.ids']) processed_batch[f'{prefix}.length'].append(sample[f'{prefix}.length']) + if convert_to_tensor: + for part, values in processed_batch.items(): + processed_batch[part] = torch.tensor(values, dtype=torch.long) + + return processed_batch + + +class LetterBatchProcessor(BasicBatchProcessor, config_name='letter'): + def __init__(self, mapping: dict[int, list[int]], semantic_length: int): + self._prefixes = ['item', 'labels', 'positive', 'negative'] + self._semantic_length = semantic_length + self._mapping = mapping + + assert sorted(mapping.keys()) == list(range(len(mapping))), "Item ids must be consecutive" + self._mapping_tensor = torch.zeros((len(mapping), semantic_length), dtype=torch.long) + for item_id, semantic_ids in mapping.items(): + self._mapping_tensor[item_id] = torch.tensor(semantic_ids, dtype=torch.long) + + @classmethod + def create_from_config(cls, config, **kwargs): + mapping_path = config["beauty_index_json"] + with open(mapping_path, "r") as f: + mapping = json.load(f) + + semantic_length = config["semantic_length"] + + parsed = {} + + for key, semantic_ids in mapping.items(): + numbers = [int(re.search(r'\d+', item).group()) for item in semantic_ids] + assert len(numbers) == semantic_length, "All semantic ids must have the same length" + parsed[int(key)] = numbers + + return cls(mapping=parsed, semantic_length=semantic_length) + + def __call__(self, batch): + processed_batch = super().__call__(batch, convert_to_tensor=False) + + for prefix in self._prefixes: + if f"{prefix}.ids" in processed_batch: + ids = processed_batch[f"{prefix}.ids"] + lengths = processed_batch[f"{prefix}.length"] + + flattened_semantic_ids = [] + + for _id in ids: + flattened_semantic_ids.extend(self._mapping[_id]) + + processed_batch[f"semantic_{prefix}.ids"] = flattened_semantic_ids + processed_batch[f"semantic_{prefix}.length"] = [length * self._semantic_length for length in lengths] + for part, values in processed_batch.items(): processed_batch[part] = torch.tensor(values, dtype=torch.long) + + processed_batch["all_semantic_ids"] = self._mapping_tensor return processed_batch diff --git a/modeling/dataset/base.py b/modeling/dataset/base.py index 42a1516e..dc0e5178 100644 --- a/modeling/dataset/base.py +++ b/modeling/dataset/base.py @@ -1,4 +1,5 @@ from collections import defaultdict +import json from tqdm import tqdm @@ -616,3 +617,193 @@ def meta(self): 'num_items': self.num_items, 'max_sequence_length': self.max_sequence_length } + + +class ScientificFullDataset(ScientificDataset, config_name="scientific_full"): + def __init__( + self, + train_sampler, + validation_sampler, + test_sampler, + num_users, + num_items, + max_sequence_length, + ): + self._train_sampler = train_sampler + self._validation_sampler = validation_sampler + self._test_sampler = test_sampler + self._num_users = num_users + self._num_items = num_items + self._max_sequence_length = max_sequence_length + + @classmethod + def create_from_config(cls, config, **kwargs): + data_dir_path = os.path.join(config["path_to_data_dir"], config["name"]) + max_sequence_length = config["max_sequence_length"] + max_user_id, max_item_id = 0, 0 + train_dataset, validation_dataset, test_dataset = [], [], [] + + dataset_path = os.path.join(data_dir_path, "{}.txt".format("all_data")) + with open(dataset_path, "r") as f: + data = f.readlines() + + for sample in data: + sample = sample.strip("\n").split(" ") + user_id = int(sample[0]) + item_ids = [int(item_id) for item_id in sample[1:]] + + max_user_id = max(max_user_id, user_id) + max_item_id = max(max_item_id, max(item_ids)) + + assert len(item_ids) >= 5 + + # item_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + # prefix_length: 5, 6, 7, 8, 9, 10 + for prefix_length in range(5, len(item_ids) + 1): + # prefix = [1, 2, 3, 4, 5] + # prefix = [1, 2, 3, 4, 5, 6] + # prefix = [1, 2, 3, 4, 5, 6, 7] + # prefix = [1, 2, 3, 4, 5, 6, 7, 8] + # prefix = [1, 2, 3, 4, 5, 6, 7, 8, 9] + # prefix = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + + + + prefix = item_ids[ + :prefix_length + ] # TODO no sliding window, only incrmenting sequence from last 50 items + + # prefix[:-2] = [1, 2, 3] + # prefix[:-2] = [1, 2, 3, 4] + # prefix[:-2] = [1, 2, 3, 4, 5] + # prefix[:-2] = [1, 2, 3, 4, 5, 6] + # prefix[:-2] = [1, 2, 3, 4, 5, 6, 7] + # prefix[:-2] = [1, 2, 3, 4, 5, 6, 7, 8] + + train_dataset.append( + { + "user.ids": [user_id], + "user.length": 1, + "item.ids": prefix[:-2][-max_sequence_length:], + "item.length": len(prefix[:-2][-max_sequence_length:]), + } + ) + assert len(prefix[:-2][-max_sequence_length:]) == len( + set(prefix[:-2][-max_sequence_length:]) + ) + + # item_ids[:-1] = [1, 2, 3, 4, 5, 6, 7, 8, 9] + validation_dataset.append( + { + "user.ids": [user_id], + "user.length": 1, + "item.ids": item_ids[:-1][-max_sequence_length:], + "item.length": len(item_ids[:-1][-max_sequence_length:]), + } + ) + assert len(item_ids[:-1][-max_sequence_length:]) == len( + set(item_ids[:-1][-max_sequence_length:]) + ) + + # item_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + test_dataset.append( + { + "user.ids": [user_id], + "user.length": 1, + "item.ids": item_ids[-max_sequence_length:], + "item.length": len(item_ids[-max_sequence_length:]), + } + ) + assert len(item_ids[-max_sequence_length:]) == len( + set(item_ids[-max_sequence_length:]) + ) + + logger.info("Train dataset size: {}".format(len(train_dataset))) + logger.info("Validation dataset size: {}".format(len(validation_dataset))) + logger.info("Test dataset size: {}".format(len(test_dataset))) + logger.info("Max user id: {}".format(max_user_id)) + logger.info("Max item id: {}".format(max_item_id)) + logger.info("Max sequence length: {}".format(max_sequence_length)) + logger.info( + "{} dataset sparsity: {}".format( + config["name"], + (len(train_dataset) + len(test_dataset)) / max_user_id / max_item_id, + ) + ) + + train_sampler = TrainSampler.create_from_config( + config["samplers"], + dataset=train_dataset, + num_users=max_user_id, + num_items=max_item_id, + ) + validation_sampler = EvalSampler.create_from_config( + config["samplers"], + dataset=validation_dataset, + num_users=max_user_id, + num_items=max_item_id, + ) + test_sampler = EvalSampler.create_from_config( + config["samplers"], + dataset=test_dataset, + num_users=max_user_id, + num_items=max_item_id, + ) + + return cls( + train_sampler=train_sampler, + validation_sampler=validation_sampler, + test_sampler=test_sampler, + num_users=max_user_id, + num_items=max_item_id, + max_sequence_length=max_sequence_length, + ) + + +class LetterFullDataset(ScientificFullDataset, config_name="letter_full"): + def __init__( + self, + train_sampler, + validation_sampler, + test_sampler, + num_users, + num_items, + max_sequence_length, + ): + self._train_sampler = train_sampler + self._validation_sampler = validation_sampler + self._test_sampler = test_sampler + self._num_users = num_users + self._num_items = num_items + self._max_sequence_length = max_sequence_length + + @classmethod + def create_from_config(cls, config, **kwargs): + user_interactions_path = os.path.join(config["beauty_inter_json"]) + with open(user_interactions_path, "r") as f: + user_interactions = json.load(f) + + dir_path = os.path.join(config["path_to_data_dir"], config["name"]) + + os.makedirs(dir_path, exist_ok=True) + dataset_path = os.path.join(dir_path, "all_data.txt") + + logger.info(f"Saving data to {dataset_path}") + + # Map from LETTER format to Our format + with open(dataset_path, "w") as f: + for user_id, item_ids in user_interactions.items(): + items_repr = map(str, item_ids) + f.write(f"{user_id} {' '.join(items_repr)}\n") + + dataset = ScientificFullDataset.create_from_config(config, **kwargs) + + return cls( + train_sampler=dataset._train_sampler, + validation_sampler=dataset._validation_sampler, + test_sampler=dataset._test_sampler, + num_users=dataset._num_users, + num_items=dataset._num_items, + max_sequence_length=dataset._max_sequence_length, + ) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..f32390d0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,38 @@ +[project] +name = "irec" +version = "0.1.0" +description = "IRec framework" +readme = "README.md" +requires-python = ">=3.13" +dependencies = [ + "faiss-cpu>=1", + "pandas>=2", + "scipy>=1", + "seaborn>=0.13.2", + "tensorboard>=2", + "torch>=2.7", + "transformers>=4.51", + "tqdm>=4", + "jupyter>=1", +] + +[tool.uv.sources] +torch = [ + { index = "pytorch-cu128", marker = "sys_platform != 'darwin'" }, + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, +] + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true + +[dependency-groups] +dev = [ + "ruff>=0.11.4", +]