From 185090543c78fdb3da36dfde23b751f67ca94d1d Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Fri, 3 Apr 2026 15:49:12 +0200
Subject: [PATCH 1/6] fix: prevent embedding 422 from strict OpenAI-compatible endpoints

LiteLLM >=1.80.11 sends encoding_format=null in embedding requests when
the parameter is not explicitly set (BerriAI/litellm#19174). Strict
validators such as DeepInfra, vLLM, and HuggingFace TEI reject null with:

    422 Unprocessable Entity: Input should be 'float' or 'base64'

Default to 'float' (the OpenAI spec default) before merging caller
kwargs, so any explicitly configured value still takes precedence.
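Because the spec default is merged in first, plain dict semantics keep an
explicit caller setting intact (sketch only; the 'base64' value below is a
hypothetical caller override, not something this patch configures):

    caller_kwargs = {"encoding_format": "base64"}
    embed_kwargs = {"encoding_format": "float", **caller_kwargs}
    assert embed_kwargs["encoding_format"] == "base64"  # explicit override wins
    assert {"encoding_format": "float", **{}} == {"encoding_format": "float"}  # spec default otherwise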
---
 models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/models.py b/models.py
index eeeac830b2..e997703017 100644
--- a/models.py
+++ b/models.py
@@ -600,7 +600,8 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         # Apply rate limiting if configured
         apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
-        resp = embedding(model=self.model_name, input=texts, **self.kwargs)
+        embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -610,7 +611,8 @@ def embed_query(self, text: str) -> List[float]:
         # Apply rate limiting if configured
         apply_rate_limiter_sync(self.a0_model_conf, text)
-        resp = embedding(model=self.model_name, input=[text], **self.kwargs)
+        embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From be42cf173fec1600af584ceaa528faff01188a44 Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:03:33 +0200
Subject: [PATCH 2/6] fix: subtract 500-token safety margin from embedding ctx to handle tokenizer divergence

cl100k_base and bge-m3's SentencePiece tokenizer diverge by ~2-3% on the
same text. A document at exactly ctx_length cl100k tokens can exceed
ctx_length model tokens, causing 400 errors. A 500-token margin provides
sufficient headroom.
---
 models.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/models.py b/models.py
index e997703017..b2726957c1 100644
--- a/models.py
+++ b/models.py
@@ -601,6 +601,10 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        # Subtract 500 tokens from ctx_length: cl100k_base and the model's own tokenizer
+        # (e.g. bge-m3 SentencePiece) can diverge by ~2-3%, causing 400 errors at the boundary.
+        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
         resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -612,6 +616,8 @@ def embed_query(self, text: str) -> List[float]:
         apply_rate_limiter_sync(self.a0_model_conf, text)
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        text = trim_to_tokens(text, ctx, "start", ellipsis="")
         resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From 6c65b78fda7a5e676562427c1abdedc7cbf79c86 Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:16:30 +0200
Subject: [PATCH 3/6] fix: use 20% proportional ctx reduction for embedding truncation

The fixed 500-token margin was insufficient: cl100k_base and bge-m3
SentencePiece diverge by up to ~6.5% on the same content. A text with
7692 cl100k tokens can have 8193 bge-m3 tokens, just over the 8192
limit.

A 20% reduction (ctx * 0.80) provides safe headroom for up to 25%
tokenizer divergence at any context length, without needing to know the
exact divergence for a given content type.
---
 models.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/models.py b/models.py
index b2726957c1..d75920c214 100644
--- a/models.py
+++ b/models.py
@@ -601,9 +601,7 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
-        # Subtract 500 tokens from ctx_length: cl100k_base and the model's own tokenizer
-        # (e.g. bge-m3 SentencePiece) can diverge by ~2-3%, causing 400 errors at the boundary.
-        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
         resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -616,7 +614,7 @@ def embed_query(self, text: str) -> List[float]:
         apply_rate_limiter_sync(self.a0_model_conf, text)
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
-        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         text = trim_to_tokens(text, ctx, "start", ellipsis="")
         resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From ff0fb49ffd4b599d49208276e1f7a959b3ee01dd Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:23:09 +0200
Subject: [PATCH 4/6] fix: retry embedding with halved text on context-length error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Token counting with cl100k_base is unreliable as a guard for models with
different tokenizers (bge-m3 SentencePiece). For dense code content,
bge-m3 can use 2x+ more tokens than cl100k for the same text, so no
static margin is sufficient.

On a 400 context-length error, retry once with 50% of the text; combined
with the existing 20% trim this tolerates roughly 2.5x tokenizer
divergence, and adds no API overhead on normal inputs.
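The divergence can be checked directly when tuning this (sketch only,
assuming tiktoken and transformers are available; BAAI/bge-m3 is the
embedding model referenced above):

    import tiktoken
    from transformers import AutoTokenizer

    cl100k = tiktoken.get_encoding("cl100k_base")
    bge = AutoTokenizer.from_pretrained("BAAI/bge-m3")

    def divergence(text: str) -> float:
        # >1.0 means the model needs more tokens than the cl100k estimate
        return len(bge.encode(text)) / max(len(cl100k.encode(text)), 1)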
---
 models.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/models.py b/models.py
index d75920c214..c57aaa8711 100644
--- a/models.py
+++ b/models.py
@@ -603,7 +603,14 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
-        resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+        try:
+            resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+        except Exception as e:
+            if "input_tokens" in str(e) and "context" in str(e):
+                texts = [t[: len(t) // 2] for t in texts]
+                resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+            else:
+                raise
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -616,7 +623,13 @@ def embed_query(self, text: str) -> List[float]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         text = trim_to_tokens(text, ctx, "start", ellipsis="")
-        resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
+        try:
+            resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
+        except Exception as e:
+            if "input_tokens" in str(e) and "context" in str(e):
+                resp = embedding(model=self.model_name, input=[text[: len(text) // 2]], **embed_kwargs)
+            else:
+                raise
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From 1b52ed9624963b15d0d5533f4aa4e3fdf54615c0 Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:42:52 +0200
Subject: [PATCH 5/6] fix: use status_code==400 instead of string matching to detect context-length errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

String matching on str(e) was unreliable; the str() representation of
litellm exceptions varies by version. HTTP 400 is the canonical signal
for context-length errors from OpenAI-compatible embedding endpoints.
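The halving loop also terminates quickly; rough arithmetic (not code from
this patch):

    import math
    # a 40_000-char input is halved at most ceil(log2(40_000 / 10)) times
    # before the len(text) > 10 guard stops retrying
    assert math.ceil(math.log2(40_000 / 10)) == 12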
---
 models.py | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/models.py b/models.py
index c57aaa8711..cee412efed 100644
--- a/models.py
+++ b/models.py
@@ -603,14 +603,15 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
-        try:
-            resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
-        except Exception as e:
-            if "input_tokens" in str(e) and "context" in str(e):
-                texts = [t[: len(t) // 2] for t in texts]
-                resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
-            else:
-                raise
+        while True:
+            try:
+                resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+                break
+            except Exception as e:
+                if getattr(e, "status_code", None) == 400 and any(len(t) > 10 for t in texts):
+                    texts = [t[: max(len(t) // 2, 10)] for t in texts]
+                else:
+                    raise
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -623,13 +624,15 @@ def embed_query(self, text: str) -> List[float]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         text = trim_to_tokens(text, ctx, "start", ellipsis="")
-        try:
-            resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
-        except Exception as e:
-            if "input_tokens" in str(e) and "context" in str(e):
-                resp = embedding(model=self.model_name, input=[text[: len(text) // 2]], **embed_kwargs)
-            else:
-                raise
+        while True:
+            try:
+                resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
+                break
+            except Exception as e:
+                if getattr(e, "status_code", None) == 400 and len(text) > 10:
+                    text = text[: max(len(text) // 2, 10)]
+                else:
+                    raise
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From 03ec88e516cedd342d1ff489bf29c2ddce51532b Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:45:09 +0200
Subject: [PATCH 6/6] fix: cap fallback memory recall query at 4000 chars

When memory_recall_query_prep=false (default), the fallback query is
user_message + full_history (up to 10000 chars). At dense content
densities (~1 char/token) this exceeds the embedding model's 8192-token
limit. A 4000-char cap stays under that limit even at ~1 char/token
(~4000 tokens worst case, ~1000 tokens for typical prose), which is more
than sufficient for semantic similarity search.
---
 .../python/message_loop_prompts_after/_50_recall_memories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py b/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py
index 42fb393b0a..a754f39cc4 100644
--- a/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py
+++ b/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py
@@ -117,7 +117,7 @@ async def search_memories(self, log_item: log.LogItem, loop_data: LoopData, **kw
         # otherwise use the message and history as query
         else:
-            query = user_instruction + "\n\n" + history
+            query = (user_instruction + "\n\n" + history)[:4000]
         # if there is no query (or just dash by the LLM), do not continue
         if not query or len(query) <= 3:
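
For reference, the arithmetic behind the 4000-char cap (pure illustration,
using the ~1 char/token worst-case density cited in the message above):

    MAX_QUERY_CHARS = 4000
    worst_case_tokens = MAX_QUERY_CHARS // 1   # ~4000 tokens at ~1 char/token
    assert worst_case_tokens < 8192            # under the embedding context limit
    assert 10_000 // 1 > 8192                  # the old uncapped fallback was not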