vectorize-io
diff --git a/hindsight-api-slim/hindsight_api/config.py b/hindsight-api-slim/hindsight_api/config.py
Lines changed: 16 additions & 0 deletions
diff --git a/hindsight-api-slim/hindsight_api/engine/bank_attribution.py b/hindsight-api-slim/hindsight_api/engine/bank_attribution.py
Lines changed: 34 additions & 0 deletions
diff --git a/hindsight-api-slim/hindsight_api/engine/cross_encoder.py b/hindsight-api-slim/hindsight_api/engine/cross_encoder.py
Lines changed: 1 addition & 1 deletion
diff --git a/hindsight-api-slim/hindsight_api/engine/embeddings.py b/hindsight-api-slim/hindsight_api/engine/embeddings.py
Lines changed: 2 additions & 0 deletions
diff --git a/hindsight-api-slim/hindsight_api/engine/memory_engine.py b/hindsight-api-slim/hindsight_api/engine/memory_engine.py
Lines changed: 51 additions & 1 deletion
diff --git a/hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py b/hindsight-api-slim/hindsight_api/engine/providers/openai_compatible_llm.py
Lines changed: 5 additions & 0 deletions
diff --git a/hindsight-api-slim/hindsight_api/engine/retain/embedding_utils.py b/hindsight-api-slim/hindsight_api/engine/retain/embedding_utils.py
Lines changed: 9 additions & 1 deletion
@@ -145,6 +145,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_LLM_EXTRA_BODY = "HINDSIGHT_API_LLM_EXTRA_BODY"
 ENV_LLM_DEFAULT_HEADERS = "HINDSIGHT_API_LLM_DEFAULT_HEADERS"
 ENV_LLM_STRICT_SCHEMA = "HINDSIGHT_API_LLM_STRICT_SCHEMA"
+ENV_LLM_SEND_BANK_AS_USER = "HINDSIGHT_API_LLM_SEND_BANK_AS_USER"
 
 # LiteLLM Router chain — provider-specific config consumed by the "litellmrouter"
 # provider. Each entry is a deployment; the Router tries them in declared order and
@@ -254,6 +255,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_EMBEDDINGS_OPENROUTER_MODEL = "HINDSIGHT_API_EMBEDDINGS_OPENROUTER_MODEL"
 ENV_RERANKER_OPENROUTER_API_KEY = "HINDSIGHT_API_RERANKER_OPENROUTER_API_KEY"
 ENV_RERANKER_OPENROUTER_MODEL = "HINDSIGHT_API_RERANKER_OPENROUTER_MODEL"
+ENV_RERANKER_OPENROUTER_BASE_URL = "HINDSIGHT_API_RERANKER_OPENROUTER_BASE_URL"
 
 # ZeroEntropy configuration (embeddings)
 ENV_EMBEDDINGS_ZEROENTROPY_API_KEY = "HINDSIGHT_API_EMBEDDINGS_ZEROENTROPY_API_KEY"
@@ -620,6 +622,7 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 DEFAULT_LLM_MAX_BACKOFF = 60.0  # Max backoff cap in seconds for retry exponential backoff
 DEFAULT_LLM_TIMEOUT = 120.0  # seconds
 DEFAULT_LLM_REASONING_EFFORT = "low"
+DEFAULT_LLM_SEND_BANK_AS_USER = False  # Opt-in: tag provider calls with user=<bank_id>
 
 # Vertex AI defaults
 DEFAULT_LLM_VERTEXAI_PROJECT_ID = None  # Required for Vertex AI
@@ -740,6 +743,7 @@ def _parse_strategy_boosts(raw: str | None) -> dict[str, str]:
 # OpenRouter defaults
 DEFAULT_EMBEDDINGS_OPENROUTER_MODEL = "perplexity/pplx-embed-v1-0.6b"
 DEFAULT_RERANKER_OPENROUTER_MODEL = "cohere/rerank-v3.5"
+DEFAULT_RERANKER_OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1/rerank"
 
 # ZeroEntropy defaults
 DEFAULT_EMBEDDINGS_ZEROENTROPY_MODEL = "zembed-1"
@@ -1229,6 +1233,11 @@ class HindsightConfig:
         dict | None
     )  # Custom headers passed as default_headers to provider SDK clients (e.g. {"X-Component-Id": "hindsight"} for proxies / request tracing)
     llm_strict_schema: bool  # Grammar-enforce structured output via the provider's strongest schema mode (see DEFAULT_LLM_STRICT_SCHEMA)
+    # Tags outbound OpenAI-compatible LLM + embedding calls with `user=<bank_id>` for
+    # per-bank cost attribution. Downstream cost gateways (OpenRouter usage accounting,
+    # LiteLLM, Helicone) key attribution on the OpenAI `user` field. Opt-in; never
+    # overrides a `user` the caller already set.
+    llm_send_bank_as_user: bool
 
     # LiteLLM Router chain (provider-specific; consumed by the "litellmrouter" provider).
     # List of deployment dicts evaluated in order with fallback on transient errors.
@@ -1360,6 +1369,7 @@ class HindsightConfig:
     reranker_cohere_timeout: float
     reranker_openrouter_api_key: str | None
     reranker_openrouter_model: str
+    reranker_openrouter_base_url: str
     reranker_openrouter_timeout: float
     reranker_litellm_api_base: str
     reranker_litellm_api_key: str | None
@@ -1610,6 +1620,7 @@ class HindsightConfig:
         "embeddings_tei_base_url",
         "reranker_tei_base_url",
         "reranker_cohere_base_url",
+        "reranker_openrouter_base_url",
         "embeddings_zeroentropy_base_url",
         "reranker_zeroentropy_base_url",
         "reranker_siliconflow_base_url",
@@ -1904,6 +1915,8 @@ def from_env(cls) -> "HindsightConfig":
             llm_extra_body=json.loads(os.getenv(ENV_LLM_EXTRA_BODY, "null")),
             llm_default_headers=json.loads(os.getenv(ENV_LLM_DEFAULT_HEADERS, "null")),
             llm_strict_schema=os.getenv(ENV_LLM_STRICT_SCHEMA, str(DEFAULT_LLM_STRICT_SCHEMA)).lower() in ("true", "1"),
+            llm_send_bank_as_user=os.getenv(ENV_LLM_SEND_BANK_AS_USER, str(DEFAULT_LLM_SEND_BANK_AS_USER)).lower()
+            in ("true", "1"),
             llm_litellmrouter_config=_parse_llm_router_config(ENV_LLM_LITELLMROUTER_CONFIG),
             # Vertex AI
             llm_vertexai_project_id=os.getenv(ENV_LLM_VERTEXAI_PROJECT_ID) or DEFAULT_LLM_VERTEXAI_PROJECT_ID,
@@ -2183,6 +2196,9 @@ def from_env(cls) -> "HindsightConfig":
             or os.getenv(ENV_OPENROUTER_API_KEY)
             or os.getenv(ENV_LLM_API_KEY),
             reranker_openrouter_model=os.getenv(ENV_RERANKER_OPENROUTER_MODEL, DEFAULT_RERANKER_OPENROUTER_MODEL),
+            reranker_openrouter_base_url=os.getenv(
+                ENV_RERANKER_OPENROUTER_BASE_URL, DEFAULT_RERANKER_OPENROUTER_BASE_URL
+            ),
             reranker_openrouter_timeout=float(
                 os.getenv(ENV_RERANKER_OPENROUTER_TIMEOUT, str(DEFAULT_RERANKER_OPENROUTER_TIMEOUT))
             ),
 
@@ -0,0 +1,34 @@
+"""Per-bank provider cost attribution via the OpenAI ``user`` field.
+
+Shared by the OpenAI-compatible LLM path and the OpenAI embeddings path so both
+tag outbound requests identically. Opt-in via ``HINDSIGHT_API_LLM_SEND_BANK_AS_USER``;
+downstream cost gateways (OpenRouter usage accounting, LiteLLM, Helicone) key spend
+on the OpenAI ``user`` field.
+
+Note: when enabled, the bank id is transmitted to the upstream provider as the
+end-user identifier. Bank ids that double as end-user identifiers are therefore
+disclosed to the provider — which is what the OpenAI ``user`` field is for, but
+operators should opt in with that in mind.
+"""
+
+from typing import Any
+
+
+def apply_bank_attribution(request: dict[str, Any]) -> None:
+    """Set ``request["user"]`` to the current bank id for per-bank cost attribution.
+
+    Mutates ``request`` in place and returns None. No-op when ``request`` already has a
+    ``user`` key, when ``llm_send_bank_as_user`` is off, or when no bank id is bound.
+    """
+    if "user" in request:  # an explicit caller-supplied value always wins
+        return
+    # Lazy imports: memory_engine imports the embeddings/provider modules that call
+    # this, so a top-level import of memory_engine here would be circular.
+    from ..config import get_config
+    from .memory_engine import get_current_bank_id
+
+    if not get_config().llm_send_bank_as_user:
+        return
+    bank_id = get_current_bank_id()
+    if bank_id:  # None or "" leaves the request untagged
+        request["user"] = bank_id
@@ -1679,7 +1679,7 @@ def create_cross_encoder_from_env() -> CrossEncoderModel:
         return CohereCrossEncoder(
             api_key=api_key,
             model=config.reranker_openrouter_model,
-            base_url="https://openrouter.ai/api/v1/rerank",
+            base_url=config.reranker_openrouter_base_url,
             timeout=config.reranker_openrouter_timeout,
         )
     elif provider == "flashrank":
 
@@ -57,6 +57,7 @@
     ENV_EMBEDDINGS_ZEROENTROPY_ENCODING_FORMAT,
     ENV_LLM_API_KEY,
 )
+from .bank_attribution import apply_bank_attribution
 
 logger = logging.getLogger(__name__)
 
@@ -705,6 +706,7 @@ def encode(self, texts: list[str]) -> list[list[float]]:
             }
             if self.dimensions is not None:
                 request["dimensions"] = self.dimensions
+            apply_bank_attribution(request)
 
             response = self._client.embeddings.create(**request)
 
 
@@ -11,14 +11,16 @@
 
 import asyncio
 import contextvars
+import functools
+import inspect
 import json
 import logging
 import time
 import uuid
 from collections.abc import Awaitable, Callable
 from dataclasses import dataclass, field
 from datetime import UTC, datetime, timedelta, timezone
-from typing import TYPE_CHECKING, Any, Literal, cast, overload
+from typing import TYPE_CHECKING, Any, Literal, ParamSpec, TypeVar, cast, overload
 
 import asyncpg
 import httpx
@@ -67,6 +69,12 @@
 # Context variable for current schema (async-safe, per-task isolation)
 # Note: default is None, actual default comes from config via get_current_schema()
 _current_schema: contextvars.ContextVar[str | None] = contextvars.ContextVar("current_schema", default=None)
+
+# Context variable for the bank an operation runs for (async-safe, per-task isolation).
+# Set by the engine wherever it learns the bank (recall/retain/batch/task execution) so
+# downstream provider calls can attribute spend per bank — e.g. tagging the OpenAI `user`
+# field for cost gateways. None outside a bank-scoped operation.
+_current_bank_id: contextvars.ContextVar[str | None] = contextvars.ContextVar("current_bank_id", default=None)
 MENTAL_MODEL_PENDING_CONTENT = "Generating content..."
 
 
@@ -79,6 +87,44 @@ def get_current_schema() -> str:
     return schema
 
 
+def get_current_bank_id() -> str | None:
+    """Return the bank id bound to the current context, or None when no bank is bound."""
+    return _current_bank_id.get()
+
+
+_P = ParamSpec("_P")  # parameter spec of the coroutine function wrapped by _bind_bank_id
+_R = TypeVar("_R")  # awaited result type of that coroutine function
+
+
+def _bind_bank_id(
+    arg: str = "bank_id", key: str | None = None
+) -> Callable[[Callable[_P, Awaitable[_R]]], Callable[_P, Awaitable[_R]]]:
+    """Return a decorator that binds ``_current_bank_id`` for the duration of each call.
+
+    ``arg`` names the parameter carrying the bank id; when ``key`` is given and that argument
+    is a dict, the id is read from it via ``.get(key)`` (e.g. ``task_dict["bank_id"]``). Any
+    non-str result binds None. Token-based reset in ``finally`` restores the prior value.
+    """
+
+    def decorate(func: Callable[_P, Awaitable[_R]]) -> Callable[_P, Awaitable[_R]]:
+        sig = inspect.signature(func)  # inspected once, at decoration time
+
+        @functools.wraps(func)
+        async def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R:
+            value = sig.bind(*args, **kwargs).arguments.get(arg)  # None if ``arg`` was not passed
+            if key is not None and isinstance(value, dict):
+                value = value.get(key)
+            token = _current_bank_id.set(value if isinstance(value, str) else None)
+            try:
+                return await func(*args, **kwargs)
+            finally:
+                _current_bank_id.reset(token)  # runs on normal return and on exception
+
+        return wrapper
+
+    return decorate
+
+
 def count_tokens(text: str) -> int:
     """Count tokens in text using tiktoken (cl100k_base encoding for GPT-4/3.5)."""
     return len(_get_tiktoken_encoding().encode(text))
@@ -1626,6 +1672,7 @@ async def _handle_refresh_mental_model(self, task_dict: dict[str, Any]):
 
         logger.info(f"[REFRESH_MENTAL_MODEL_TASK] Completed for bank_id={bank_id}, mental_model_id={mental_model_id}")
 
+    @_bind_bank_id("task_dict", key="bank_id")
     async def execute_task(self, task_dict: dict[str, Any]):
         """
         Execute a task by routing it to the appropriate handler.
@@ -2933,6 +2980,7 @@ def retain(
         ctx = request_context if request_context is not None else RC()
         return asyncio.run(self.retain_async(bank_id, content, context, event_date, request_context=ctx))
 
+    @_bind_bank_id()
     async def retain_async(
         self,
         bank_id: str,
@@ -2979,6 +3027,7 @@ async def retain_async(
         # Return the first (and only) list of unit IDs
         return result[0] if result else []
 
+    @_bind_bank_id()
     async def retain_batch_async(
         self,
         bank_id: str,
@@ -3706,6 +3755,7 @@ def recall(
             )
         )
 
+    @_bind_bank_id()
     async def recall_async(
         self,
         bank_id: str,
 
@@ -33,6 +33,7 @@
 from openai import APIConnectionError, APIStatusError, AsyncOpenAI, LengthFinishReasonError
 
 from hindsight_api.config import DEFAULT_LLM_TIMEOUT, ENV_LLM_TIMEOUT
+from hindsight_api.engine.bank_attribution import apply_bank_attribution
 from hindsight_api.engine.llm_interface import LLMInterface, OutputTooLongError
 from hindsight_api.engine.response_models import LLMToolCall, LLMToolCallResult, TokenUsage
 from hindsight_api.metrics import get_metrics_collector
@@ -595,6 +596,8 @@ async def call(
                     call_params["messages"] = _ensure_json_word_in_user_message(call_params["messages"])
                     call_params["response_format"] = {"type": "json_object"}
 
+        apply_bank_attribution(call_params)
+
         last_exception = None
 
         for attempt in range(max_retries + 1):
@@ -945,6 +948,8 @@ async def call_with_tools(
         if extra_body:
             call_params["extra_body"] = extra_body
 
+        apply_bank_attribution(call_params)
+
         last_exception = None
 
         for attempt in range(max_retries + 1):
 
@@ -3,6 +3,7 @@
 """
 
 import asyncio
+import contextvars
 import logging
 from typing import Literal, Protocol
 
@@ -89,7 +90,14 @@ async def generate_embeddings_batch(
     """
     try:
         loop = asyncio.get_event_loop()
-        embeddings = await loop.run_in_executor(None, _encode_with_input_type, embeddings_backend, texts, input_type)
+        # run_in_executor runs the encode in a worker thread, which does NOT inherit
+        # the caller's contextvars. Capture the current context and run the encode
+        # inside it so context-dependent behavior (e.g. per-bank `user` attribution
+        # read via get_current_bank_id()) survives the thread hop.
+        ctx = contextvars.copy_context()
+        embeddings = await loop.run_in_executor(
+            None, lambda: ctx.run(_encode_with_input_type, embeddings_backend, texts, input_type)
+        )
     except Exception as e:
         raise Exception(f"Failed to generate batch embeddings: {str(e)}")
Original file line number	Diff line number	Diff line change
`@@ -1679,7 +1679,7 @@ def create_cross_encoder_from_env() -> CrossEncoderModel:`
`1679`	`1679`	`return CohereCrossEncoder(`
`1680`	`1680`	`api_key=api_key,`
`1681`	`1681`	`model=config.reranker_openrouter_model,`
`1682`		`- base_url="https://openrouter.ai/api/v1/rerank",`
	`1682`	`+ base_url=config.reranker_openrouter_base_url,`
`1683`	`1683`	`timeout=config.reranker_openrouter_timeout,`
`1684`	`1684`	`)`
`1685`	`1685`	`elif provider == "flashrank":`
Original file line number	Diff line number	Diff line change
`@@ -57,6 +57,7 @@`
`57`	`57`	`ENV_EMBEDDINGS_ZEROENTROPY_ENCODING_FORMAT,`
`58`	`58`	`ENV_LLM_API_KEY,`
`59`	`59`	`)`
	`60`	`+from .bank_attribution import apply_bank_attribution`
`60`	`61`
`61`	`62`	`logger = logging.getLogger(__name__)`
`62`	`63`
`@@ -705,6 +706,7 @@ def encode(self, texts: list[str]) -> list[list[float]]:`
`705`	`706`	`}`
`706`	`707`	`if self.dimensions is not None:`
`707`	`708`	`request["dimensions"] = self.dimensions`
	`709`	`+ apply_bank_attribution(request)`
`708`	`710`
`709`	`711`	`response = self._client.embeddings.create(**request)`
`710`	`712`