[bugfix] export: cache feature-permute order to avoid per-forward H2D sync (#527)

tiankongdeguiji · claude · web-flow · commit 6c8e2bd31de5 · 2026-05-26T16:33:10.000+08:00
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/tzrec/utils/export_util.py b/tzrec/utils/export_util.py
@@ -34,13 +34,17 @@
 from torchrec.inference.modules import quantize_embeddings
 from torchrec.modules.embedding_configs import BaseEmbeddingConfig
 from torchrec.modules.embedding_modules import (
+    EmbeddingBagCollection,
     EmbeddingBagCollectionInterface,
     EmbeddingCollection,
     EmbeddingCollectionInterface,
 )
 from torchrec.quant.embedding_modules import (
     EmbeddingCollection as QuantEmbeddingCollection,
 )
+from torchrec.quant.embedding_modules import (
+    quant_prep_enable_cache_features_order,
+)
 from torchrec.sparse import jagged_tensor
 
 from tzrec.acc import utils as acc_utils
@@ -213,9 +217,16 @@ def export_model_normal(
             logger.info("quantize embeddings...")
             additional_qconfig_spec_keys = []
             additional_mapping = {}
+            cache_order_types = [EmbeddingBagCollection]
             if acc_utils.is_ec_quant():
                 additional_qconfig_spec_keys.append(EmbeddingCollection)
                 additional_mapping[EmbeddingCollection] = QuantEmbeddingCollection
+                cache_order_types.append(EmbeddingCollection)
+            # Cache the feature-permute order as an on-device buffer instead of
+            # rebuilding `torch.tensor(order, device=cuda)` (a blocking H2D copy)
+            # on every forward. Must run before quantize_embeddings so the quant
+            # modules pick it up via `from_float`.
+            quant_prep_enable_cache_features_order(model, cache_order_types)
             quantize_embeddings(
                 model,
                 dtype=acc_utils.quant_dtype(),
diff --git a/tzrec/version.py b/tzrec/version.py
@@ -9,4 +9,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "1.2.13"
+__version__ = "1.2.14"