[refactor] SID: add ResidualQuantizer / BaseSidModel base classes

WhiteSwan1 · claude · WhiteSwan1 · commit 03a2a7aea971 · 2026-06-03T10:42:57.000Z
First of three PRs splitting the Semantic-ID models onto a shared base. Purely
additive — only the backend-agnostic foundation, no concrete quantizer or
model and no edits to existing files. RQ-KMeans follows in PR2, RQ-VAE in PR3.

What this adds:
- ResidualQuantizer (abstract): owns the shared state (embed_dim, per-layer
  codebook sizes via normalize_n_embed, residual-normalization flag, layer
  list; asserts n_layers >= 1) and the shared residual walk — _residual_pass
  drives the concrete get_codes / decode_codes / output_dim. Subclasses
  implement just _quantize_layer (encode) and _lookup_code (decode), plus
  forward and get_codebook_embeddings.
- BaseSidModel (abstract): the shared SID model scaffold that SidRqkmeans /
  SidRqvae subclass — embedding-feature extraction, loss/metric init
  (reconstruction MSE via torchmetrics.MeanSquaredError + per-batch unique-SID
  ratio via UniqueRatio), and shared config parsing.
- UniqueRatio (tzrec/metrics/unique_ratio.py): diversity-proxy metric (mean
  per-batch unique-row ratio) with an empty-batch guard + sum DDP reduction.

Tests: normalize_n_embed; the abstract-base contract; the concrete residual
walk via a fake one-primitive subclass; and the UniqueRatio metric.

No proto changes and no edits to existing modules; __init__.py is a bare
package marker (no re-exports). The QuantizeForwardMode / QuantizeOutput /
ResidualQuantizerOutput types and the concrete SidRqkmeans / SidRqvae models
ship with the code that uses them in PR2 / PR3.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/tzrec/metrics/unique_ratio.py b/tzrec/metrics/unique_ratio.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2026, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+from torchmetrics import Metric
+
+
+class UniqueRatio(Metric):
+    """Mean per-batch unique-SID ratio (distinct rows / batch size).
+
+    Averages, over batches, the fraction of distinct semantic-ID rows in each
+    batch. It is a cheap (two-scalar state) **diversity proxy**, NOT global
+    codebook coverage: a SID repeated across different batches counts as
+    distinct in each, and smaller batches bias the value toward 1.0. Empty
+    batches are skipped; the per-rank sums reduce by ``sum`` (a count-weighted
+    mean).
+    """
+
+    higher_is_better = True
+    is_differentiable = False
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.add_state("ratio_sum", default=torch.tensor(0.0), dist_reduce_fx="sum")
+        self.add_state("count", default=torch.tensor(0.0), dist_reduce_fx="sum")
+
+    def update(self, codes: torch.Tensor) -> None:
+        """Accumulate one batch's distinct-row ratio.
+
+        Args:
+            codes (Tensor): semantic-ID codes, shape (B, n_layers).
+        """
+        batch_size = codes.shape[0]
+        if batch_size == 0:
+            return
+        unique = torch.unique(codes, dim=0).shape[0]
+        self.ratio_sum += unique / batch_size
+        self.count += 1
+
+    def compute(self) -> torch.Tensor:
+        """Mean per-batch unique ratio (NaN before any non-empty update)."""
+        return self.ratio_sum / self.count
diff --git a/tzrec/metrics/unique_ratio_test.py b/tzrec/metrics/unique_ratio_test.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from tzrec.metrics.unique_ratio import UniqueRatio
+
+
+class UniqueRatioTest(unittest.TestCase):
+    def test_single_batch_ratio(self) -> None:
+        metric = UniqueRatio()
+        # 3 distinct rows out of 4 -> 0.75.
+        metric.update(torch.tensor([[1, 2], [1, 2], [3, 4], [5, 6]]))
+        self.assertAlmostEqual(metric.compute().item(), 0.75, places=6)
+
+    def test_mean_over_batches(self) -> None:
+        metric = UniqueRatio()
+        metric.update(torch.tensor([[1, 1], [1, 1]]))  # 1/2 = 0.5
+        metric.update(torch.tensor([[1, 1], [2, 2]]))  # 2/2 = 1.0
+        # Per-batch mean = 0.75 (a global distinct/total would give 0.5).
+        self.assertAlmostEqual(metric.compute().item(), 0.75, places=6)
+
+    def test_empty_batch_skipped(self) -> None:
+        metric = UniqueRatio()
+        metric.update(torch.empty(0, 3, dtype=torch.long))
+        self.assertEqual(metric.count.item(), 0.0)
+        self.assertTrue(torch.isnan(metric.compute()))
+
+
+if __name__ == "__main__":
+    unittest.main()  # run this module's test cases when executed as a script
diff --git a/tzrec/models/sid_model.py b/tzrec/models/sid_model.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2026, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""BaseSidModel: shared base for semantic-ID generation models."""
+
+from typing import Any, Dict, List, Optional
+
+import torch
+import torchmetrics
+
+from tzrec.datasets.utils import BASE_DATA_GROUP, Batch
+from tzrec.features.feature import BaseFeature
+from tzrec.metrics.unique_ratio import UniqueRatio
+from tzrec.models.model import BaseModel
+from tzrec.protos.model_pb2 import ModelConfig
+
+
+class BaseSidModel(BaseModel):
+    """Shared base for semantic-ID (SID) generation models.
+
+    Factors the structure common to :class:`SidRqvae` (RQ-VAE) and
+    :class:`SidRqkmeans` (residual K-Means):
+
+    - the shared config fields every SID proto carries —
+      ``embedding_feature_name`` (``_embedding_feature_name``), ``input_dim``
+      (``_input_dim``), ``normalize_residuals`` (``_normalize_residuals``),
+      and the per-layer ``codebook`` (``_n_embed_list`` / ``_n_layers``),
+    - reading the item-embedding feature out of ``Batch.dense_features``,
+    - the eval metrics every SID model reports — reconstruction ``mse`` and
+      ``unique_sid_ratio`` (mean per-batch unique-SID ratio, a diversity
+      proxy).
+
+    Subclasses build their quantizer in ``__init__`` (after calling
+    ``super().__init__``) and implement :meth:`predict` and :meth:`loss`.
+    They extend :meth:`init_metric` (via ``super()``) and implement
+    :meth:`update_metric` to populate the registered metrics
+    (:meth:`update_train_metric` defaults to a no-op).
+
+    Args:
+        model_config (ModelConfig): an instance of ModelConfig.
+        features (list): list of features.
+        labels (list): list of label names.
+        sample_weights (list): sample weight names.
+    """
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        features: List[BaseFeature],
+        labels: List[str],
+        sample_weights: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(model_config, features, labels, sample_weights, **kwargs)
+
+        cfg = self._model_config
+        # Config fields shared by every SID model (present on each SID proto
+        # message): the item-embedding feature, the input dimension, the
+        # residual-normalization toggle, and the per-layer codebook.
+        self._embedding_feature_name = cfg.embedding_feature_name
+        self._input_dim = cfg.input_dim
+        self._normalize_residuals = cfg.normalize_residuals
+
+        assert cfg.codebook, "codebook must be set, e.g. [256, 256, 256]"
+        self._n_embed_list = list(cfg.codebook)
+        self._n_layers = len(self._n_embed_list)
+
+    def _extract_feature(
+        self, batch: Batch, feature_name: Optional[str] = None
+    ) -> torch.Tensor:
+        """Extract a named dense feature from ``Batch.dense_features``.
+
+        Args:
+            batch (Batch): input batch data.
+            feature_name (str, optional): feature name to extract.
+                Defaults to ``self._embedding_feature_name``.
+        """
+        if feature_name is None:
+            feature_name = self._embedding_feature_name
+        kt = batch.dense_features[BASE_DATA_GROUP]
+        return kt[feature_name]
+
+    def init_loss(self) -> None:
+        """Initialize loss modules.
+
+        SID models compute their losses internally and pass them through
+        ``predictions``; there is no external loss module to register.
+        """
+        pass
+
+    def init_metric(self) -> None:
+        """Initialize the eval metrics shared by all SID models.
+
+        ``mse``: reconstruction error (input vs. quantized / decoded).
+        ``unique_sid_ratio``: mean per-batch unique-SID ratio (distinct rows /
+        batch size; a batch-size-sensitive diversity proxy, not global
+        coverage). Subclasses call ``super().init_metric()`` then add extras.
+        """
+        self._metric_modules["mse"] = torchmetrics.MeanSquaredError()
+        self._metric_modules["unique_sid_ratio"] = UniqueRatio()
+
+    def update_train_metric(
+        self,
+        predictions: Dict[str, torch.Tensor],
+        batch: Batch,
+    ) -> None:
+        """Update train-path metric state.
+
+        Default is a no-op: K-Means has no train-time codes, so only models
+        with a meaningful train signal (RQ-VAE) override this.
+        """
+        return
diff --git a/tzrec/modules/sid_generation/__init__.py b/tzrec/modules/sid_generation/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) 2026, Alibaba Group;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tzrec/modules/sid_generation/residual_quantizer.py b/tzrec/modules/sid_generation/residual_quantizer.py
diff --git a/tzrec/modules/sid_generation/residual_quantizer_test.py b/tzrec/modules/sid_generation/residual_quantizer_test.py