feat: implement dynamic max chunks per batch from env variable (#898)

NickCrews · web-flow · commit 3e054856bdc1 · 2025-12-17T10:04:58.000+01:00
- max chunks per batch when sending using "gemini"
- add ability to configure this even more arbitrarily using env vars

Signed-off-by: Nick Crews &lt;nicholas.b.crews@gmail.com&gt;
diff --git a/projects/pgai/pgai/vectorizer/embedders/litellm.py b/projects/pgai/pgai/vectorizer/embedders/litellm.py
@@ -1,3 +1,4 @@
+import os
 from collections.abc import AsyncGenerator, Callable
 from typing import Any, Literal
 
@@ -61,6 +62,14 @@ def _max_chunks_per_batch(self) -> int:
         # Note: deferred import to avoid import overhead
         import litellm
 
+        if result := os.getenv("PGAI_LITELLM_MAX_CHUNKS_PER_BATCH"):
+            try:
+                return int(result)
+            except ValueError:
+                logger.warn(
+                    "Value for PGAI_LITELLM_MAX_CHUNKS_PER_BATCH is not a valid int. Continuing with default provider value"
+                )
+
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(self.model)  # type: ignore
         match custom_llm_provider:
             case "cohere":
@@ -71,6 +80,8 @@ def _max_chunks_per_batch(self) -> int:
                 return 2048  # https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/embeddings?tabs=console#verify-inputs-dont-exceed-the-maximum-length
             case "bedrock":
                 return 96  # NOTE: currently (Jan 2025) Bedrock only supports embeddings with Cohere or Titan models. The Titan API only processes one input per request, which LiteLLM already handles under the hood. We assume that the Cohere API has the same input limits as above.
+            case "gemini":
+                return 250  # https://docs.cloud.google.com/vertex-ai/docs/quotas#text-embedding-limits
             case "huggingface":
                 return 2048  # NOTE: There is not documented limit. In testing we got a response for a request with 10k (short) inputs.
             case "mistral":
@@ -80,10 +91,11 @@ def _max_chunks_per_batch(self) -> int:
             case "voyage":
                 return 128  # see https://docs.voyageai.com/reference/embeddings-api
             case _:
+                fallback = 5
                 logger.warn(
-                    f"unknown provider '{custom_llm_provider}', falling back to conservative max chunks per batch"
+                    f"unknown provider '{custom_llm_provider}', falling back to {fallback} max chunks per batch"
                 )
-                return 5
+                return fallback
 
     @override
     def _max_tokens_per_batch(self) -> int | None: