diff --git a/go/adk/pkg/agent/agent.go b/go/adk/pkg/agent/agent.go index 1aae3637d6..db5cf96ef5 100644 --- a/go/adk/pkg/agent/agent.go +++ b/go/adk/pkg/agent/agent.go @@ -304,6 +304,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM, Model: modelName, Region: region, AdditionalModelRequestFields: m.AdditionalModelRequestFields, + PromptCaching: m.PromptCaching, } return models.NewBedrockModelWithLogger(ctx, cfg, log) diff --git a/go/adk/pkg/models/bedrock.go b/go/adk/pkg/models/bedrock.go index d9db5a842e..7c2f8825a1 100644 --- a/go/adk/pkg/models/bedrock.go +++ b/go/adk/pkg/models/bedrock.go @@ -77,6 +77,13 @@ type BedrockConfig struct { Temperature *float64 TopP *float64 AdditionalModelRequestFields map[string]any + // PromptCaching, when true, appends a default CachePoint block at the + // end of the Converse request's system content array and the end of + // the tools array. Bedrock caches up to and including those markers + // across requests in the same region; cached prefix is billed at a + // reduced rate. The marker is silently ignored by Bedrock for models + // that do not support prompt caching. + PromptCaching bool } // BedrockModel implements model.LLM for Amazon Bedrock using the Converse API. @@ -151,7 +158,7 @@ func (m *BedrockModel) GenerateContent(ctx context.Context, req *model.LLMReques var toolConfig *types.ToolConfiguration nameMap := make(map[string]string) if req.Config != nil && len(req.Config.Tools) > 0 { - tools, nm := convertGenaiToolsToBedrock(req.Config.Tools) + tools, nm := convertGenaiToolsToBedrock(req.Config.Tools, m.Config.PromptCaching) nameMap = nm if len(tools) > 0 { toolConfig = &types.ToolConfiguration{ @@ -193,6 +200,16 @@ func (m *BedrockModel) GenerateContent(ctx context.Context, req *model.LLMReques Value: systemInstruction, }) } + // If prompt caching is enabled, mark the end of the system content + // as a cache breakpoint. Bedrock caches everything up to and including + // this point for ~5 minutes; subsequent requests with the same prefix + // hit the cache. Skipped for empty systems — caching nothing is a no-op + // that wastes a marker. + if m.Config.PromptCaching && len(systemPrompt) > 0 { + systemPrompt = append(systemPrompt, &types.SystemContentBlockMemberCachePoint{ + Value: types.CachePointBlock{Type: types.CachePointTypeDefault}, + }) + } additionalFields := m.buildAdditionalModelRequestFields() @@ -568,7 +585,12 @@ func convertGenaiContentsToBedrockMessages(contents []*genai.Content, nameMap ma // It sanitizes tool names to satisfy Bedrock's [a-zA-Z0-9_-]+ constraint and // returns the original->sanitized name mapping so callers can apply it to // conversation history and reverse it when restoring names from responses. -func convertGenaiToolsToBedrock(tools []*genai.Tool) ([]types.Tool, map[string]string) { +// +// When promptCaching is true, a CachePoint marker is appended after the +// last tool spec — Bedrock then caches the entire (typically large) tool +// definitions array for ~5 minutes, billing the prefix at a reduced rate +// on cache hits. +func convertGenaiToolsToBedrock(tools []*genai.Tool, promptCaching bool) ([]types.Tool, map[string]string) { if len(tools) == 0 { return nil, nil } @@ -625,6 +647,17 @@ func convertGenaiToolsToBedrock(tools []*genai.Tool) ([]types.Tool, map[string]s } } + // If prompt caching is enabled, append a CachePoint at the END of the + // tool list. Bedrock caches the entire tool definitions array up to + // this marker; this is usually the biggest single chunk of static + // prefix in an agent conversation and benefits most from caching. + // Skipped when there are no tools — a cache marker by itself is a no-op. + if promptCaching && len(bedrockTools) > 0 { + bedrockTools = append(bedrockTools, &types.ToolMemberCachePoint{ + Value: types.CachePointBlock{Type: types.CachePointTypeDefault}, + }) + } + return bedrockTools, nameMap } diff --git a/go/adk/pkg/models/bedrock_test.go b/go/adk/pkg/models/bedrock_test.go index de2d1c3caf..0f379d8d75 100644 --- a/go/adk/pkg/models/bedrock_test.go +++ b/go/adk/pkg/models/bedrock_test.go @@ -162,7 +162,7 @@ func TestConvertGenaiToolsToBedrock(t *testing.T) { }, }}}} - bt1, nm1 := convertGenaiToolsToBedrock(tools) + bt1, nm1 := convertGenaiToolsToBedrock(tools, false) schema := extractSchema(t, bt1, nm1) props := schema["properties"].(map[string]any) @@ -190,7 +190,7 @@ func TestConvertGenaiToolsToBedrock(t *testing.T) { }, }}}} - bt2, nm2 := convertGenaiToolsToBedrock(tools) + bt2, nm2 := convertGenaiToolsToBedrock(tools, false) schema := extractSchema(t, bt2, nm2) props, ok := schema["properties"].(map[string]any) if !ok || len(props) == 0 { @@ -211,7 +211,7 @@ func TestConvertGenaiToolsToBedrock(t *testing.T) { ParametersJsonSchema: s, }}}} - bt3, nm3 := convertGenaiToolsToBedrock(tools) + bt3, nm3 := convertGenaiToolsToBedrock(tools, false) schema := extractSchema(t, bt3, nm3) props, ok := schema["properties"].(map[string]any) if !ok || len(props) == 0 { @@ -366,7 +366,7 @@ func TestConvertGenaiToolsToBedrockSanitizesNames(t *testing.T) { {Name: "filesystem:read_file", Description: "Read a file"}, }}} - bedrockTools, nameMap := convertGenaiToolsToBedrock(tools) + bedrockTools, nameMap := convertGenaiToolsToBedrock(tools, false) if len(bedrockTools) != 2 { t.Fatalf("expected 2 tools, got %d", len(bedrockTools)) } @@ -424,3 +424,50 @@ func TestStreamingToolCallParseArgs(t *testing.T) { }) } } + +func TestConvertGenaiToolsToBedrockPromptCaching(t *testing.T) { + tools := []*genai.Tool{{FunctionDeclarations: []*genai.FunctionDeclaration{ + {Name: "get_weather", Description: "lookup weather"}, + {Name: "list_pods", Description: "list pods"}, + }}} + + t.Run("disabled: no cache marker appended", func(t *testing.T) { + out, _ := convertGenaiToolsToBedrock(tools, false) + if len(out) != 2 { + t.Fatalf("expected 2 tools, got %d", len(out)) + } + for i, tool := range out { + if _, ok := tool.(*types.ToolMemberCachePoint); ok { + t.Fatalf("did not expect a CachePoint at index %d when caching disabled", i) + } + } + }) + + t.Run("enabled: cache marker appended at the END of the tool list", func(t *testing.T) { + out, _ := convertGenaiToolsToBedrock(tools, true) + if len(out) != 3 { + t.Fatalf("expected 3 entries (2 tools + 1 CachePoint), got %d", len(out)) + } + // The first two must remain ToolSpec entries (order preserved). + for i := 0; i < 2; i++ { + if _, ok := out[i].(*types.ToolMemberToolSpec); !ok { + t.Fatalf("entry %d: expected ToolMemberToolSpec, got %T", i, out[i]) + } + } + // The trailing entry must be a CachePoint with type=default. + cp, ok := out[2].(*types.ToolMemberCachePoint) + if !ok { + t.Fatalf("trailing entry: expected ToolMemberCachePoint, got %T", out[2]) + } + if cp.Value.Type != types.CachePointTypeDefault { + t.Errorf("expected CachePointType=default, got %v", cp.Value.Type) + } + }) + + t.Run("enabled but no tools: no cache marker (skipped)", func(t *testing.T) { + out, _ := convertGenaiToolsToBedrock(nil, true) + if len(out) != 0 { + t.Fatalf("expected empty slice for no tools, got %d entries", len(out)) + } + }) +} diff --git a/go/api/adk/types.go b/go/api/adk/types.go index 602a457980..f825502440 100644 --- a/go/api/adk/types.go +++ b/go/api/adk/types.go @@ -251,6 +251,11 @@ type Bedrock struct { // additionalModelRequestFields in the Converse API. Use this for provider-specific // options outside the standard InferenceConfiguration block. AdditionalModelRequestFields map[string]any `json:"additional_model_request_fields,omitempty"` + // PromptCaching enables Bedrock prompt caching by appending a CachePoint + // block to the end of the system content array and the end of the tools + // array in the Converse request. See the v1alpha2.BedrockConfig CRD doc + // for context. + PromptCaching bool `json:"prompt_caching,omitempty"` } func (b *Bedrock) MarshalJSON() ([]byte, error) { diff --git a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml index 00b21b6da0..50d115f7f4 100644 --- a/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml +++ b/go/api/config/crd/bases/kagent.dev_modelconfigs.yaml @@ -483,6 +483,24 @@ spec: Claude extended thinking or top_k. Values are forwarded as-is to the API. Example: {"top_k": 5, "thinking": {"type": "enabled", "budget_tokens": 16000}} x-kubernetes-preserve-unknown-fields: true + promptCaching: + default: false + description: |- + PromptCaching enables Bedrock prompt caching by appending a CachePoint + block at the end of the Converse request's `system` content array and + the end of the `tools` array. Bedrock will cache the prefix up to and + including those cache points across requests in the same region for + roughly 5 minutes after first use, billing the cached portion at a + reduced rate on cache hits. + + Recommended for tool-using agents that make many Converse calls per + task with a stable system prompt and tool set — the per-call input + token count can drop by 70-90% on hit. Has no effect on models that + don't support caching; the marker is ignored by Bedrock for those. + + See https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html + for the current list of supported models and minimum prefix sizes. + type: boolean region: description: AWS region where the Bedrock model is available (e.g., us-east-1, us-west-2) diff --git a/go/api/v1alpha2/modelconfig_types.go b/go/api/v1alpha2/modelconfig_types.go index 0d08928681..6a9d03196b 100644 --- a/go/api/v1alpha2/modelconfig_types.go +++ b/go/api/v1alpha2/modelconfig_types.go @@ -256,6 +256,24 @@ type BedrockConfig struct { // +optional // +kubebuilder:pruning:PreserveUnknownFields AdditionalModelRequestFields *apiextensionsv1.JSON `json:"additionalModelRequestFields,omitempty"` + + // PromptCaching enables Bedrock prompt caching by appending a CachePoint + // block at the end of the Converse request's `system` content array and + // the end of the `tools` array. Bedrock will cache the prefix up to and + // including those cache points across requests in the same region for + // roughly 5 minutes after first use, billing the cached portion at a + // reduced rate on cache hits. + // + // Recommended for tool-using agents that make many Converse calls per + // task with a stable system prompt and tool set — the per-call input + // token count can drop by 70-90% on hit. Has no effect on models that + // don't support caching; the marker is ignored by Bedrock for those. + // + // See https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html + // for the current list of supported models and minimum prefix sizes. + // +optional + // +kubebuilder:default=false + PromptCaching bool `json:"promptCaching,omitempty"` } // SAPAICoreConfig contains SAP AI Core-specific configuration options. diff --git a/go/core/internal/controller/translator/agent/adk_api_translator.go b/go/core/internal/controller/translator/agent/adk_api_translator.go index d8e62b722d..4de11fa034 100644 --- a/go/core/internal/controller/translator/agent/adk_api_translator.go +++ b/go/core/internal/controller/translator/agent/adk_api_translator.go @@ -698,6 +698,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC }, Region: model.Spec.Bedrock.Region, AdditionalModelRequestFields: additionalFields, + PromptCaching: model.Spec.Bedrock.PromptCaching, } // Populate TLS fields in BaseModel diff --git a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml index 00b21b6da0..50d115f7f4 100644 --- a/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml +++ b/helm/kagent-crds/templates/kagent.dev_modelconfigs.yaml @@ -483,6 +483,24 @@ spec: Claude extended thinking or top_k. Values are forwarded as-is to the API. Example: {"top_k": 5, "thinking": {"type": "enabled", "budget_tokens": 16000}} x-kubernetes-preserve-unknown-fields: true + promptCaching: + default: false + description: |- + PromptCaching enables Bedrock prompt caching by appending a CachePoint + block at the end of the Converse request's `system` content array and + the end of the `tools` array. Bedrock will cache the prefix up to and + including those cache points across requests in the same region for + roughly 5 minutes after first use, billing the cached portion at a + reduced rate on cache hits. + + Recommended for tool-using agents that make many Converse calls per + task with a stable system prompt and tool set — the per-call input + token count can drop by 70-90% on hit. Has no effect on models that + don't support caching; the marker is ignored by Bedrock for those. + + See https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html + for the current list of supported models and minimum prefix sizes. + type: boolean region: description: AWS region where the Bedrock model is available (e.g., us-east-1, us-west-2) diff --git a/python/packages/kagent-adk/src/kagent/adk/models/_bedrock.py b/python/packages/kagent-adk/src/kagent/adk/models/_bedrock.py index c1a83c045c..55116a198c 100644 --- a/python/packages/kagent-adk/src/kagent/adk/models/_bedrock.py +++ b/python/packages/kagent-adk/src/kagent/adk/models/_bedrock.py @@ -251,6 +251,12 @@ class KAgentBedrockLlm(KAgentTLSMixin, BaseLlm): extra_headers: Optional[dict[str, str]] = None additional_model_request_fields: Optional[dict[str, Any]] = None + # When True, append a CachePoint block to the end of the Converse + # request's `system` content array and the end of the `toolConfig.tools` + # array. Bedrock caches the prefix up to and including those markers + # across requests in the same region; cached portion is billed at a + # reduced rate on hit. See AWS docs for supported models / minimums. + prompt_caching: bool = False model_config = {"arbitrary_types_allowed": True} @@ -288,12 +294,23 @@ async def generate_content_async( text = "\n".join(p.text for p in si.parts or [] if p.text) if text: kwargs["system"] = [{"text": text}] + # If prompt caching is on, mark the end of the system content as + # a cache breakpoint. Bedrock caches everything up to and including + # this point for ~5 minutes; subsequent requests with the same + # prefix hit the cache. No-op if we didn't produce any system text. + if self.prompt_caching and kwargs.get("system"): + kwargs["system"].append({"cachePoint": {"type": "default"}}) if llm_request.config and llm_request.config.tools: genai_tools = [t for t in llm_request.config.tools if hasattr(t, "function_declarations")] if genai_tools: converse_tools = _convert_tools_to_converse(genai_tools, tool_name_map, tool_name_counter) if converse_tools: + # CachePoint at the END of the tool list: tool definitions + # are usually the biggest static chunk of an agent request + # and benefit most from caching. + if self.prompt_caching: + converse_tools.append({"cachePoint": {"type": "default"}}) kwargs["toolConfig"] = {"tools": converse_tools} # Reverse map lets us restore original tool names from sanitized names in Bedrock responses. diff --git a/python/packages/kagent-adk/src/kagent/adk/types.py b/python/packages/kagent-adk/src/kagent/adk/types.py index ccbfebbf00..c6bb0a8fd0 100644 --- a/python/packages/kagent-adk/src/kagent/adk/types.py +++ b/python/packages/kagent-adk/src/kagent/adk/types.py @@ -240,6 +240,11 @@ class Bedrock(BaseLLM): # additionalModelRequestFields in the Converse API. Use this for provider-specific # options outside the standard InferenceConfiguration block. additional_model_request_fields: dict | None = None + # prompt_caching enables Bedrock prompt caching: a CachePoint marker is + # appended to the end of the Converse request's system content array and + # toolConfig.tools array. Bedrock caches the prefix across requests in the + # same region; cached portion is billed at a reduced rate on hit. + prompt_caching: bool = False type: Literal["bedrock"] @@ -600,6 +605,7 @@ def _create_llm_from_model_config(model_config: ModelUnion): model=model_config.model, extra_headers=extra_headers, additional_model_request_fields=model_config.additional_model_request_fields, + prompt_caching=model_config.prompt_caching, **_transport_kwargs(model_config), ) if model_config.type == "sap_ai_core":