Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
# IDE
.idea

# Python
__pycache__
data/*
.venv

# Data
tensorboard_logs/*
data/*
checkpoints/*

# Dev
uv.lock
173 changes: 173 additions & 0 deletions configs/train/letter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
{
"experiment_name": "letter_data",
"best_metric": "validation/ndcg@20",
"train_epochs_num": 100,
"dataset": {
"type": "letter_full",
"path_to_data_dir": "../data",
"name": "Beauty_letter",
"max_sequence_length": 50,
"samplers": {
"type": "last_item_prediction",
"negative_sampler_type": "random"
},
"beauty_inter_json": "../../LETTER/data/Beauty/Beauty.inter.json"

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Place the LETTER repo in the same folder as the irec repo so that this relative path resolves.

},
"dataloader": {
"train": {
"type": "torch",
"batch_size": 256,
"batch_processor": {
"type": "letter",
"beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json",
"semantic_length": 4
},
"drop_last": true,
"shuffle": true
},
"validation": {
"type": "torch",
"batch_size": 256,
"batch_processor": {
"type": "letter",
"beauty_index_json": "../../LETTER/data/Beauty/Beauty.index.json",
"semantic_length": 4
},
"drop_last": false,
"shuffle": false
}
},
"model": {
"type": "sasrec",
"sequence_prefix": "item",
"positive_prefix": "positive",
"negative_prefix": "negative",
"candidate_prefix": "candidates",
"embedding_dim": 64,
"num_heads": 2,
"num_layers": 2,
"dim_feedforward": 256,
"dropout": 0.3,
"activation": "gelu",
"layer_norm_eps": 1e-9,
"initializer_range": 0.02
},
"optimizer": {
"type": "basic",
"optimizer": {
"type": "adam",
"lr": 0.001
},
"clip_grad_threshold": 5.0
},
"loss": {
"type": "composite",
"losses": [
{
"type": "sasrec",
"positive_prefix": "positive_scores",
"negative_prefix": "negative_scores",
"output_prefix": "downstream_loss"
}
],
"output_prefix": "loss"
},
"callback": {
"type": "composite",
"callbacks": [
{
"type": "metric",
"on_step": 1,
"loss_prefix": "loss"
},
{
"type": "validation",
"on_step": 64,
"pred_prefix": "logits",
"labels_prefix": "labels",
"metrics": {
"ndcg@5": {
"type": "ndcg",
"k": 5
},
"ndcg@10": {
"type": "ndcg",
"k": 10
},
"ndcg@20": {
"type": "ndcg",
"k": 20
},
"recall@5": {
"type": "recall",
"k": 5
},
"recall@10": {
"type": "recall",
"k": 10
},
"recall@20": {
"type": "recall",
"k": 20
},
"coverage@5": {
"type": "coverage",
"k": 5
},
"coverage@10": {
"type": "coverage",
"k": 10
},
"coverage@20": {
"type": "coverage",
"k": 20
}
}
},
{
"type": "eval",
"on_step": 256,
"pred_prefix": "logits",
"labels_prefix": "labels",
"metrics": {
"ndcg@5": {
"type": "ndcg",
"k": 5
},
"ndcg@10": {
"type": "ndcg",
"k": 10
},
"ndcg@20": {
"type": "ndcg",
"k": 20
},
"recall@5": {
"type": "recall",
"k": 5
},
"recall@10": {
"type": "recall",
"k": 10
},
"recall@20": {
"type": "recall",
"k": 20
},
"coverage@5": {
"type": "coverage",
"k": 5
},
"coverage@10": {
"type": "coverage",
"k": 10
},
"coverage@20": {
"type": "coverage",
"k": 20
}
}
}
]
}
}
57 changes: 56 additions & 1 deletion modeling/dataloader/batch_processors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json
import re
import torch
from utils import MetaParent

Expand All @@ -16,7 +18,7 @@ def __call__(self, batch):

class BasicBatchProcessor(BaseBatchProcessor, config_name='basic'):

def __call__(self, batch):
def __call__(self, batch, convert_to_tensor=True):
processed_batch = {}

for key in batch[0].keys():
Expand All @@ -31,7 +33,60 @@ def __call__(self, batch):
processed_batch[f'{prefix}.ids'].extend(sample[f'{prefix}.ids'])
processed_batch[f'{prefix}.length'].append(sample[f'{prefix}.length'])

if convert_to_tensor:
for part, values in processed_batch.items():
processed_batch[part] = torch.tensor(values, dtype=torch.long)

return processed_batch


class LetterBatchProcessor(BasicBatchProcessor, config_name='letter'):
    """Batch processor that adds semantic-id sequences next to item-id sequences.

    Each item id is mapped to a fixed-length list of integer "semantic ids".
    For every known prefix present in the batch, the flattened semantic ids and
    the correspondingly scaled lengths are added under ``semantic_<prefix>.ids``
    and ``semantic_<prefix>.length``. The full item -> semantic-id table is
    attached to every batch as ``all_semantic_ids``.
    """

    def __init__(self, mapping: dict[int, list[int]], semantic_length: int):
        """Store the mapping and pre-build its tensor form.

        Args:
            mapping: Item id -> list of ``semantic_length`` integer semantic ids.
                Keys must be exactly ``0 .. len(mapping) - 1``.
            semantic_length: Number of semantic ids per item.

        Raises:
            ValueError: If the item ids are not consecutive from zero, or if any
                item does not have exactly ``semantic_length`` semantic ids.
        """
        self._prefixes = ['item', 'labels', 'positive', 'negative']
        self._semantic_length = semantic_length
        self._mapping = mapping

        # Explicit raises rather than ``assert``: asserts are stripped under
        # ``python -O`` and this validates external data.
        if sorted(mapping) != list(range(len(mapping))):
            raise ValueError("Item ids must be consecutive")

        self._mapping_tensor = torch.zeros((len(mapping), semantic_length), dtype=torch.long)
        for item_id, semantic_ids in mapping.items():
            # Without this check a length-1 row would silently broadcast
            # across the whole tensor row.
            if len(semantic_ids) != semantic_length:
                raise ValueError("All semantic ids must have the same length")
            self._mapping_tensor[item_id] = torch.tensor(semantic_ids, dtype=torch.long)

    @classmethod
    def create_from_config(cls, config, **kwargs):
        """Build the processor from a config dict.

        Reads ``config["beauty_index_json"]`` (path to a JSON object whose keys
        are integer-like strings and whose values are lists of string tokens)
        and ``config["semantic_length"]``. The first run of digits in each
        token is taken as its integer semantic id.
        NOTE(review): tokens presumably look like ``"<a_12>"`` -- confirm
        against the LETTER index file.

        Args:
            config: Mapping with the two keys described above.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If a token contains no digits, or if an item does not
                have exactly ``semantic_length`` tokens.
        """
        mapping_path = config["beauty_index_json"]
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(mapping_path, "r", encoding="utf-8") as f:
            raw_mapping = json.load(f)

        semantic_length = config["semantic_length"]

        digits = re.compile(r'\d+')  # hoisted out of the per-token loop
        parsed = {}

        for key, semantic_ids in raw_mapping.items():
            numbers = []
            for token in semantic_ids:
                match = digits.search(token)
                if match is None:
                    raise ValueError(
                        f"Semantic id {token!r} of item {key!r} contains no number"
                    )
                numbers.append(int(match.group()))
            if len(numbers) != semantic_length:
                raise ValueError("All semantic ids must have the same length")
            parsed[int(key)] = numbers

        return cls(mapping=parsed, semantic_length=semantic_length)

    def __call__(self, batch):
        """Collate ``batch`` and append semantic-id views of its sequences.

        The parent collation is run with ``convert_to_tensor=False`` so values
        can still be extended before everything is converted to ``torch.long``
        tensors in one pass.

        Returns:
            dict: The parent's collated entries, plus ``semantic_<prefix>.ids``
            and ``semantic_<prefix>.length`` for each known prefix present, plus
            ``all_semantic_ids`` (the same tensor object on every call, not a
            copy).

        Raises:
            KeyError: If a batch item id is missing from the mapping.
        """
        processed_batch = super().__call__(batch, convert_to_tensor=False)

        for prefix in self._prefixes:
            ids_key = f"{prefix}.ids"
            if ids_key not in processed_batch:
                continue

            ids = processed_batch[ids_key]
            lengths = processed_batch[f"{prefix}.length"]

            # Each item id expands to `semantic_length` codes, so the flattened
            # sequence and the per-sample lengths both grow by that factor.
            processed_batch[f"semantic_{prefix}.ids"] = [
                code for item_id in ids for code in self._mapping[item_id]
            ]
            processed_batch[f"semantic_{prefix}.length"] = [
                length * self._semantic_length for length in lengths
            ]

        # Only values of existing keys are replaced here, so the dict can be
        # iterated while being updated.
        for part, values in processed_batch.items():
            processed_batch[part] = torch.tensor(values, dtype=torch.long)

        processed_batch["all_semantic_ids"] = self._mapping_tensor

        return processed_batch
Loading