Commit fb6d967

David Berenstein and cursoragent authored and committed
fix(vendor): honor llm2vec length and numpy flags
Patch two upstream llm2vec behavior bugs found in review so that downstream VLM metrics use the caller-provided doc_max_length and can return numpy arrays when requested. Document Pruna's vendor deviations in NOTICE for traceability.

Co-authored-by: Cursor <cursoragent@cursor.com>

1 parent 8a0faab commit fb6d967

2 files changed: 8 additions & 1 deletion

src/pruna/evaluation/metrics/vendor/NOTICE.oneig_llm2vec

Lines changed: 5 additions & 0 deletions

```diff
@@ -10,3 +10,8 @@ See the project repository for full license text.
 ``oneig_llm2vec/modeling_llama_encoder.py`` is derived from
 McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp (Hugging Face Hub);
 Pruna relaxes the upstream flash-attention-only constraint for CPU use.
+
+Pruna also includes two minimal compatibility fixes in
+``oneig_llm2vec/llm2vec.py``:
+- Preserve constructor-provided ``doc_max_length`` instead of hardcoding 512.
+- Honor ``convert_to_numpy=True`` in ``encode()`` by returning ``numpy.ndarray``.
```
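The first fix listed above can be illustrated with a toy constructor. The class name `ToyLLM2Vec` and the default values are illustrative only, not the real vendored `LLM2Vec` class; the sketch just shows the patched behavior of storing the caller's `doc_max_length` instead of a hardcoded 512:

```python
class ToyLLM2Vec:
    """Simplified sketch of the patched __init__ (names illustrative)."""

    def __init__(self, max_length: int = 512, doc_max_length: int = 400):
        self.max_length = max_length
        # Before the patch this line was effectively `self.doc_max_length = 512`,
        # silently discarding whatever the caller passed in.
        self.doc_max_length = doc_max_length


model = ToyLLM2Vec(doc_max_length=1024)
print(model.doc_max_length)  # 1024, not the old hardcoded 512
```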

src/pruna/evaluation/metrics/vendor/oneig_llm2vec/llm2vec.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -85,7 +85,7 @@ def __init__(
         self.pooling_mode = pooling_mode
         self.skip_instruction = skip_instruction
         self.max_length = max_length
-        self.doc_max_length = 512
+        self.doc_max_length = doc_max_length
         self.config = model.config

     @classmethod
@@ -448,6 +448,8 @@ def encode(
         all_embeddings = torch.cat(all_embeddings, dim=0)
         all_embeddings = all_embeddings[np.argsort(length_sorted_idx)]
         all_embeddings = all_embeddings.to(torch.float32)
+        if convert_to_numpy:
+            return all_embeddings.cpu().numpy()
         return all_embeddings

     def save(self, output_path, merge_before_save=False, save_config=True):
```
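The second fix changes only the tail of `encode()`. A minimal standalone sketch of that return path, with `encode_tail` as a hypothetical helper name (the real method does much more before this point):

```python
import numpy as np
import torch


def encode_tail(all_embeddings: torch.Tensor, convert_to_numpy: bool = False):
    """Mirror of the patched return path in the vendored llm2vec.py."""
    # Cast to float32, matching the upstream code's final dtype.
    all_embeddings = all_embeddings.to(torch.float32)
    # The patched branch: hand back a numpy array when the caller asked for one,
    # instead of always returning a torch.Tensor.
    if convert_to_numpy:
        return all_embeddings.cpu().numpy()
    return all_embeddings


emb = encode_tail(torch.ones(2, 4, dtype=torch.float16), convert_to_numpy=True)
print(type(emb).__name__, emb.dtype)  # ndarray float32
```

Callers that previously worked around the ignored flag by calling `.cpu().numpy()` themselves keep working, since the conversion is idempotent from their point of view: they simply receive the array directly now.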
