fix(llms): dynamic routing of Bedrock model-specific inference args (e.g., top_k) (#248)

iapoorv01 · web-flow · commit 294dee7411cf · 2026-06-29T18:43:45.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - `pw.io.postgres.write` now streams each batch into PostgreSQL through the binary `COPY` protocol instead of issuing one `INSERT` per row, giving a large throughput improvement (up to ~100x) on bulk writes. Both output modes use it: stream-of-changes copies straight into the target, while snapshot mode stages each batch in a temporary table and merges it with a single set-based upsert/delete.
 
 ### Fixed
+- `BedrockChat` now routes model-specific inference arguments (currently `top_k`) to the AWS Converse API via `additionalModelRequestFields`.
 - Improved concurrent write handling in pw.io.sqlite.write for SQLite databases. Writes to the same database file now produce deterministic output in multi-worker and multi-table setups.
 - `pw.io.elasticsearch.write` no longer fails when a minibatch is big enough that its Elasticsearch `_bulk` request would exceed a server-side limit. The connector reads both the cluster's `http.max_content_length` (the `413 Request Entity Too Large` limit) and `indexing_pressure.memory.limit` (the `429 Too Many Requests` limit, which on a small-heap node trips well below 100 MB) at start-up, and splits the buffered documents across as many bulk requests as needed to stay under whichever is hit first — so large batches are still written in as few requests as possible instead of being rejected. (Both limits fall back to a conservative default if they cannot be read.)
 - `pw.io.elasticsearch.write` now retries transient bulk failures with backoff instead of failing the run on the first hiccup. A whole-request rejection or an individual document failing with `429`/`503` (back-pressure / temporary unavailability) is retried — resending only the documents the server reports as not yet applied, so a retry never duplicates data — while deterministic per-document failures (e.g. a type-mismatched value rejected with `400`) are now logged and skipped rather than silently dropped.
diff --git a/python/pathway/xpacks/llm/llms.py b/python/pathway/xpacks/llm/llms.py
@@ -795,6 +795,7 @@ class BedrockChat(BaseChat):
         max_tokens: Maximum number of tokens to generate. Defaults to ``1024``.
         temperature: Sampling temperature (``0.0`` to ``1.0``).
         top_p: Top-p sampling parameter.
+        top_k: Top-k sampling parameter. Model-specific (e.g. Anthropic models); sent via ``additionalModelRequestFields``.
         stop_sequences: List of sequences that will stop generation.
 
     Example:
@@ -818,6 +819,9 @@ class BedrockChat(BaseChat):
     ROLE_SYSTEM = "system"
     _SUPPORTED_ROLES = {ROLE_USER, ROLE_ASSISTANT, ROLE_SYSTEM}
 
+    # Arguments specific to certain models (sent via additionalModelRequestFields)
+    _MODEL_SPECIFIC_ARGS = {"top_k"}
+
     @staticmethod
     def _convert_messages_to_bedrock_format(messages: list[dict]) -> list[dict]:
         """Convert OpenAI-style messages to AWS Bedrock Converse API format."""
@@ -971,6 +975,15 @@ async def __wrapped__(self, messages: list[dict] | pw.Json, **kwargs) -> str | N
                 "inferenceConfig": inference_config,
             }
 
+            # Extract model-specific parameters (like top_k) into additionalModelRequestFields
+            additional_fields = {}
+            for arg in self._MODEL_SPECIFIC_ARGS:
+                if arg in kwargs:
+                    additional_fields[arg] = kwargs.pop(arg)
+
+            if additional_fields:
+                converse_kwargs["additionalModelRequestFields"] = additional_fields
+
             if system_prompts:
                 converse_kwargs["system"] = system_prompts
 
@@ -1024,8 +1037,7 @@ def _accepts_call_arg(self, arg_name: str) -> bool:
             "temperature",
             "top_p",
             "stop_sequences",
-            "top_k",  # Some models support this
-        }
+        }.union(self._MODEL_SPECIFIC_ARGS)
         return arg_name in supported_args
 
 
diff --git a/python/pathway/xpacks/llm/tests/test_llms.py b/python/pathway/xpacks/llm/tests/test_llms.py
@@ -183,7 +183,7 @@ def test_bedrock_empty_init_kwargs():
     assert llm.model is None
 
 
-BEDROCK_VALID_ARGS = ["max_tokens", "temperature", "top_p", "stop_sequences"]
+BEDROCK_VALID_ARGS = ["max_tokens", "temperature", "top_p", "stop_sequences", "top_k"]
 BEDROCK_INVALID_ARGS = ["made_up_arg", "logit_bias"]
 
 
@@ -197,3 +197,30 @@ def test_bedrock_call_args(model_id, call_arg):
 
     # BedrockChat always returns based on supported_args, model_id doesn't affect it
     assert llm._accepts_call_arg(call_arg) is (call_arg in BEDROCK_VALID_ARGS)
+
+
+@pytest.mark.asyncio
+async def test_bedrock_dynamic_args_routing():
+    from unittest.mock import AsyncMock, patch
+
+    llm = llms.BedrockChat(model_id="anthropic.claude-3", region_name="us-east-1")
+
+    mock_client = AsyncMock()
+    mock_client.converse = AsyncMock(
+        return_value={"output": {"message": {"content": [{"text": "mocked"}]}}}
+    )
+
+    mock_session = AsyncMock()
+    mock_session.client.return_value.__aenter__.return_value = mock_client
+
+    with patch.object(llm, "_session", mock_session):
+        await llm.__wrapped__(
+            [{"role": "user", "content": "hi"}], top_k=250, temperature=0.7
+        )
+
+    mock_client.converse.assert_called_once()
+    call_kwargs = mock_client.converse.call_args.kwargs
+
+    assert call_kwargs["inferenceConfig"]["temperature"] == 0.7
+    assert "additionalModelRequestFields" in call_kwargs
+    assert call_kwargs["additionalModelRequestFields"]["top_k"] == 250