From 185090543c78fdb3da36dfde23b751f67ca94d1d Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Fri, 3 Apr 2026 15:49:12 +0200
Subject: [PATCH 1/6] fix: prevent embedding 422 from strict OpenAI-compatible endpoints

LiteLLM >=1.80.11 sends encoding_format=null in embedding requests when
the parameter is not explicitly set (BerriAI/litellm#19174). Strict
validators such as DeepInfra, vLLM, and HuggingFace TEI reject null with:

    422 Unprocessable Entity: Input should be 'float' or 'base64'

Default to 'float' (the OpenAI spec default) before merging caller
kwargs, so any explicitly configured value still takes precedence.
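Because the spec default is merged in first, plain dict semantics keep an
explicit caller setting intact (sketch only; the 'base64' value below is a
hypothetical caller override, not something this patch configures):

    caller_kwargs = {"encoding_format": "base64"}
    embed_kwargs = {"encoding_format": "float", **caller_kwargs}
    assert embed_kwargs["encoding_format"] == "base64"  # explicit override wins
    assert {"encoding_format": "float", **{}} == {"encoding_format": "float"}  # spec default otherwise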
---
 models.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/models.py b/models.py
index eeeac830b2..e997703017 100644
--- a/models.py
+++ b/models.py
@@ -600,7 +600,8 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         # Apply rate limiting if configured
         apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
-        resp = embedding(model=self.model_name, input=texts, **self.kwargs)
+        embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -610,7 +611,8 @@ def embed_query(self, text: str) -> List[float]:
         # Apply rate limiting if configured
         apply_rate_limiter_sync(self.a0_model_conf, text)
-        resp = embedding(model=self.model_name, input=[text], **self.kwargs)
+        embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From be42cf173fec1600af584ceaa528faff01188a44 Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:03:33 +0200
Subject: [PATCH 2/6] fix: subtract 500-token safety margin from embedding ctx to handle tokenizer divergence

cl100k_base and bge-m3's SentencePiece tokenizer diverge by ~2-3% on the
same text. A document at exactly ctx_length cl100k tokens can exceed
ctx_length model tokens, causing 400 errors. A 500-token margin provides
sufficient headroom.
---
 models.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/models.py b/models.py
index e997703017..b2726957c1 100644
--- a/models.py
+++ b/models.py
@@ -601,6 +601,10 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        # Subtract 500 tokens from ctx_length: cl100k_base and the model's own tokenizer
+        # (e.g. bge-m3 SentencePiece) can diverge by ~2-3%, causing 400 errors at the boundary.
+        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
         resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -612,6 +616,8 @@ def embed_query(self, text: str) -> List[float]:
         apply_rate_limiter_sync(self.a0_model_conf, text)
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
+        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        text = trim_to_tokens(text, ctx, "start", ellipsis="")
         resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From 6c65b78fda7a5e676562427c1abdedc7cbf79c86 Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:16:30 +0200
Subject: [PATCH 3/6] fix: use 20% proportional ctx reduction for embedding truncation

The fixed 500-token margin was insufficient: cl100k_base and bge-m3
SentencePiece diverge by up to ~6.5% on the same content. A text with
7692 cl100k tokens can have 8193 bge-m3 tokens, just over the 8192
limit.

A 20% reduction (ctx * 0.80) provides safe headroom for up to 25%
tokenizer divergence at any context length, without needing to know the
exact divergence for a given content type.
---
 models.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/models.py b/models.py
index b2726957c1..d75920c214 100644
--- a/models.py
+++ b/models.py
@@ -601,9 +601,7 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         apply_rate_limiter_sync(self.a0_model_conf, " ".join(texts))
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
-        # Subtract 500 tokens from ctx_length: cl100k_base and the model's own tokenizer
-        # (e.g. bge-m3 SentencePiece) can diverge by ~2-3%, causing 400 errors at the boundary.
-        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
         resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -616,7 +614,7 @@ def embed_query(self, text: str) -> List[float]:
         apply_rate_limiter_sync(self.a0_model_conf, text)
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
-        ctx = max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) - 500
+        ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         text = trim_to_tokens(text, ctx, "start", ellipsis="")
         resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From ff0fb49ffd4b599d49208276e1f7a959b3ee01dd Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:23:09 +0200
Subject: [PATCH 4/6] fix: retry embedding with halved text on context-length error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Token counting with cl100k_base is unreliable as a guard for models with
different tokenizers (bge-m3 SentencePiece). For dense code content,
bge-m3 can use 2x+ more tokens than cl100k for the same text, so no
static margin is sufficient.

On a 400 context-length error, retry once with 50% of the text; combined
with the existing 20% trim this tolerates roughly 2.5x tokenizer
divergence, and adds no API overhead on normal inputs.
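The divergence can be checked directly when tuning this (sketch only,
assuming tiktoken and transformers are available; BAAI/bge-m3 is the
embedding model referenced above):

    import tiktoken
    from transformers import AutoTokenizer

    cl100k = tiktoken.get_encoding("cl100k_base")
    bge = AutoTokenizer.from_pretrained("BAAI/bge-m3")

    def divergence(text: str) -> float:
        # >1.0 means the model needs more tokens than the cl100k estimate
        return len(bge.encode(text)) / max(len(cl100k.encode(text)), 1)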
---
 models.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/models.py b/models.py
index d75920c214..c57aaa8711 100644
--- a/models.py
+++ b/models.py
@@ -603,7 +603,14 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
-        resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+        try:
+            resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+        except Exception as e:
+            if "input_tokens" in str(e) and "context" in str(e):
+                texts = [t[: len(t) // 2] for t in texts]
+                resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+            else:
+                raise
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -616,7 +623,13 @@ def embed_query(self, text: str) -> List[float]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         text = trim_to_tokens(text, ctx, "start", ellipsis="")
-        resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
+        try:
+            resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
+        except Exception as e:
+            if "input_tokens" in str(e) and "context" in str(e):
+                resp = embedding(model=self.model_name, input=[text[: len(text) // 2]], **embed_kwargs)
+            else:
+                raise
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From 1b52ed9624963b15d0d5533f4aa4e3fdf54615c0 Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:42:52 +0200
Subject: [PATCH 5/6] fix: use status_code==400 instead of string matching to detect context-length errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

String matching on str(e) was unreliable; the str() representation of
litellm exceptions varies by version. HTTP 400 is the canonical signal
for context-length errors from OpenAI-compatible embedding endpoints.
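The halving loop also terminates quickly; rough arithmetic (not code from
this patch):

    import math
    # a 40_000-char input is halved at most ceil(log2(40_000 / 10)) times
    # before the len(text) > 10 guard stops retrying
    assert math.ceil(math.log2(40_000 / 10)) == 12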
---
 models.py | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/models.py b/models.py
index c57aaa8711..cee412efed 100644
--- a/models.py
+++ b/models.py
@@ -603,14 +603,15 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         texts = [trim_to_tokens(t, ctx, "start", ellipsis="") for t in texts]
-        try:
-            resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
-        except Exception as e:
-            if "input_tokens" in str(e) and "context" in str(e):
-                texts = [t[: len(t) // 2] for t in texts]
-                resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
-            else:
-                raise
+        while True:
+            try:
+                resp = embedding(model=self.model_name, input=texts, **embed_kwargs)
+                break
+            except Exception as e:
+                if getattr(e, "status_code", None) == 400 and any(len(t) > 10 for t in texts):
+                    texts = [t[: max(len(t) // 2, 10)] for t in texts]
+                else:
+                    raise
         return [
             item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore
             for item in resp.data  # type: ignore
@@ -623,13 +624,15 @@ def embed_query(self, text: str) -> List[float]:
         embed_kwargs = {"encoding_format": "float", **self.kwargs}
         ctx = int(max((self.a0_model_conf.ctx_length if self.a0_model_conf else 0) or 8192, 1000) * 0.80)
         text = trim_to_tokens(text, ctx, "start", ellipsis="")
-        try:
-            resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
-        except Exception as e:
-            if "input_tokens" in str(e) and "context" in str(e):
-                resp = embedding(model=self.model_name, input=[text[: len(text) // 2]], **embed_kwargs)
-            else:
-                raise
+        while True:
+            try:
+                resp = embedding(model=self.model_name, input=[text], **embed_kwargs)
+                break
+            except Exception as e:
+                if getattr(e, "status_code", None) == 400 and len(text) > 10:
+                    text = text[: max(len(text) // 2, 10)]
+                else:
+                    raise
         item = resp.data[0]  # type: ignore
         return item.get("embedding") if isinstance(item, dict) else item.embedding  # type: ignore

From 03ec88e516cedd342d1ff489bf29c2ddce51532b Mon Sep 17 00:00:00 2001
From: Paolo Calvi
Date: Sat, 4 Apr 2026 05:45:09 +0200
Subject: [PATCH 6/6] fix: cap fallback memory recall query at 4000 chars

When memory_recall_query_prep=false (default), the fallback query is
user_message + full_history (up to 10000 chars). At dense content
densities (~1 char/token) this exceeds the embedding model's 8192-token
limit. A 4000-char cap stays under that limit even at ~1 char/token
(~4000 tokens worst case, ~1000 tokens for typical prose), which is more
than sufficient for semantic similarity search.
---
 .../python/message_loop_prompts_after/_50_recall_memories.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py b/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py
index 42fb393b0a..a754f39cc4 100644
--- a/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py
+++ b/plugins/_memory/extensions/python/message_loop_prompts_after/_50_recall_memories.py
@@ -117,7 +117,7 @@ async def search_memories(self, log_item: log.LogItem, loop_data: LoopData, **kw
         # otherwise use the message and history as query
         else:
-            query = user_instruction + "\n\n" + history
+            query = (user_instruction + "\n\n" + history)[:4000]
         # if there is no query (or just dash by the LLM), do not continue
         if not query or len(query) <= 3:
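
For reference, the arithmetic behind the 4000-char cap (pure illustration,
using the ~1 char/token worst-case density cited in the message above):

    MAX_QUERY_CHARS = 4000
    worst_case_tokens = MAX_QUERY_CHARS // 1   # ~4000 tokens at ~1 char/token
    assert worst_case_tokens < 8192            # under the embedding context limit
    assert 10_000 // 1 > 8192                  # the old uncapped fallback was not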