qwen2/3vl: memoize HF processor _merge_kwargs by call signature

aswinvisva · aswinvisva · commit 2b7db688c97f · 2026-06-24T09:21:28.000-07:00
ProcessorMixin._merge_kwargs (transformers) is pure but runs on every
processor call. When all requests pass the same kwargs (the common
deployment case), caching by signature converts a per-call merge into
an O(1) lookup after the first call.

Implemented as a wrapper installed on the processor instance at
construction time, so it doesn't require any change to transformers.
Cache key is the repr of sorted kwargs items; values are deep-copied
on get and put because callers mutate the returned dict.

Signed-off-by: Aswin Visva <31215515+aswinvisva@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py
@@ -77,6 +77,41 @@
 PAD_INDEX = -100  # NOTE: refer to https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L269
 
 
+def _install_merge_kwargs_cache(processor, max_entries: int = 64) -> None:
+    """Memoize ``processor._merge_kwargs`` (assumed pure) on this instance.
+
+    Results are cached under the ``repr`` of the positional args plus the
+    sorted kwargs and deep-copied on put and get, because callers mutate
+    the returned dict. Installing twice on one processor is a no-op.
+
+    Args:
+        processor: Object exposing a callable ``_merge_kwargs`` attribute.
+        max_entries: Cache bound; the oldest entry is evicted when full.
+    """
+    import copy
+
+    if getattr(processor, "_merge_kwargs_cached_installed", False):
+        return
+    cache: dict = {}
+    orig = processor._merge_kwargs
+
+    def _cached_merge_kwargs(*args, **kwargs):
+        try:  # positional args are part of the key, not only kwargs
+            key = repr((args, sorted(kwargs.items())))
+        except Exception:  # un-repr-able input: call through uncached
+            return orig(*args, **kwargs)
+        if key in cache:  # membership test, so a ``None`` result also hits
+            return copy.deepcopy(cache[key])
+        result = orig(*args, **kwargs)
+        if cache and len(cache) >= max_entries:  # bound per-request growth
+            del cache[next(iter(cache))]  # dicts iterate oldest-first
+        cache[key] = copy.deepcopy(result)
+        return result
+
+    processor._merge_kwargs = _cached_merge_kwargs
+    processor._merge_kwargs_cached_installed = True
+
+
 def _prepare_qwen_vl_vision_attn_metadata(
         seq_lens: List[int],
         attn_metadata: AttentionMetadata) -> AttentionMetadata:
@@ -186,6 +221,7 @@ def __init__(self,
             model_path,
             use_fast=self.use_fast,
             trust_remote_code=trust_remote_code)
+        _install_merge_kwargs_cache(self._processor)
 
         self.tllm_multimodal_token_id = self.get_vocab_size() + 1
         # temporal patch size for video frames