Python: Flatten hyperlight execute_code output (microsoft#5333)

eavanvalkenburg · web-flow · commit 69894eded89d · 2026-04-20T08:29:40.000Z
* small fix for hyperlight

* improved sandbox dependency
diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py b/python/packages/hyperlight/agent_framework_hyperlight/_execute_code_tool.py
@@ -431,7 +431,7 @@ def _build_execution_contents(
             outputs.append(Content.from_text(stderr, raw_representation=result))
         if not outputs:
             outputs.append(Content.from_text("Code executed successfully without output."))
-        return [Content.from_code_interpreter_tool_result(outputs=outputs, raw_representation=result)]
+        return outputs
 
     error_details = stderr or "Unknown sandbox error"
     outputs.append(
@@ -441,12 +441,16 @@ def _build_execution_contents(
             raw_representation=result,
         )
     )
-    return [Content.from_code_interpreter_tool_result(outputs=outputs, raw_representation=result)]
+    return outputs
 
 
 def _make_sandbox_callback(tool_obj: FunctionTool) -> Callable[..., Any]:
     sandbox_tool = copy.copy(tool_obj)
-    sandbox_tool.result_parser = _passthrough_result_parser
+    # Auto-assign a passthrough parser so the raw return value round-trips through
+    # `ast.literal_eval` in the sandbox callback below. User-supplied parsers are
+    # left in place so callers can customize how results are exposed to the guest.
+    if sandbox_tool.result_parser is None:
+        sandbox_tool.result_parser = _passthrough_result_parser
 
     def _callback(**kwargs: Any) -> Any:
         async def _invoke() -> list[Content]:
@@ -765,6 +769,7 @@ def build_instructions(self, *, tools_visible_to_model: bool) -> str:
         return build_codeact_instructions(
             tools=config.tools,
             tools_visible_to_model=tools_visible_to_model,
+            filesystem_enabled=config.filesystem_enabled,
         )
 
     def create_run_tool(self) -> HyperlightExecuteCodeTool:
diff --git a/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py b/python/packages/hyperlight/agent_framework_hyperlight/_instructions.py
@@ -68,6 +68,7 @@ def build_codeact_instructions(
     *,
     tools: Sequence[FunctionTool],
     tools_visible_to_model: bool,
+    filesystem_enabled: bool = False,
 ) -> str:
     """Build dynamic CodeAct instructions for the effective sandbox state."""
     usage_note = (
@@ -77,12 +78,24 @@ def build_codeact_instructions(
         else "Provider-owned sandbox tools are not exposed separately; use `execute_code` when you need them."
     )
 
+    output_note = (
+        "To surface results from `execute_code`, end the code with `print(...)`; the sandbox does not "
+        "return the value of the last expression."
+    )
+    if filesystem_enabled:
+        output_note += (
+            " For larger artifacts, write them to `/output/<filename>` instead — returned files will be "
+            "attached to the tool result."
+        )
+
     return f"""You have one primary tool: execute_code.
 
 Prefer one execute_code call per request when possible.
 Its tool description contains the current `call_tool(...)` guidance, sandbox
 tool registry, and capability limits.
 
+{output_note}
+
 {usage_note}
 """
 
diff --git a/python/packages/hyperlight/pyproject.toml b/python/packages/hyperlight/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
 dependencies = [
     "agent-framework-core>=1.0.0,<2",
     "hyperlight-sandbox>=0.3.0,<0.4",
-    "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; (sys_platform == 'linux' or sys_platform == 'win32') and python_version < '3.14'",
+    "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; ((sys_platform == 'linux' and platform_machine == 'x86_64') or (sys_platform == 'win32' and platform_machine == 'AMD64')) and python_version < '3.14'",
     "hyperlight-sandbox-python-guest>=0.3.0,<0.4",
 ]
 
diff --git a/python/packages/hyperlight/samples/codeact_benchmark.py b/python/packages/hyperlight/samples/codeact_benchmark.py
@@ -0,0 +1,253 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+"""Benchmark CodeAct vs. traditional tool-calling for a multi-tool-call task.
+
+This sample runs the same prompt against the same FoundryChatClient twice:
+
+1. **Traditional tool-calling**: the five business tools are passed directly to
+   the agent, so the model calls each tool individually via the LLM tool-call
+   interface.
+2. **CodeAct**: the same tools are registered on a HyperlightCodeActProvider
+   and the model sees a single ``execute_code`` tool that calls them from
+   inside the Hyperlight sandbox via ``call_tool(...)``.
+
+The task (computing grand totals per user) naturally requires many tool calls
+to complete. At the end, the sample prints elapsed time and token usage for
+each run so the two approaches can be compared.
+
+Run with:
+    cd python
+    uv run --directory packages/hyperlight python samples/codeact_benchmark.py
+
+Required environment variables (loaded from ``.env`` if present):
+    FOUNDRY_PROJECT_ENDPOINT
+    FOUNDRY_MODEL
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+import time
+from typing import Annotated, Any, Literal
+
+from agent_framework import Agent, AgentResponse, UsageDetails
+from agent_framework.foundry import FoundryChatClient
+from azure.identity import AzureCliCredential
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+
+from agent_framework_hyperlight import HyperlightCodeActProvider
+
+load_dotenv()
+
+
+# 1. Deterministic "business" data and tools.
+
+_USERS: list[dict[str, Any]] = [
+    {"id": 1, "name": "Alice", "region": "EU", "tier": "gold"},
+    {"id": 2, "name": "Bob", "region": "US", "tier": "silver"},
+    {"id": 3, "name": "Charlie", "region": "US", "tier": "gold"},
+    {"id": 4, "name": "Diana", "region": "APAC", "tier": "bronze"},
+    {"id": 5, "name": "Evan", "region": "EU", "tier": "silver"},
+    {"id": 6, "name": "Fiona", "region": "US", "tier": "gold"},
+    {"id": 7, "name": "George", "region": "APAC", "tier": "gold"},
+    {"id": 8, "name": "Hana", "region": "EU", "tier": "bronze"},
+]
+
+_ORDERS: dict[int, list[dict[str, Any]]] = {
+    1: [{"product": "Widget", "qty": 3, "unit_price": 9.99}, {"product": "Gadget", "qty": 1, "unit_price": 19.99}],
+    2: [{"product": "Widget", "qty": 1, "unit_price": 9.99}],
+    3: [{"product": "Gadget", "qty": 2, "unit_price": 19.99}, {"product": "Thingamajig", "qty": 4, "unit_price": 4.50}],
+    4: [{"product": "Widget", "qty": 10, "unit_price": 9.99}],
+    5: [{"product": "Gadget", "qty": 1, "unit_price": 19.99}],
+    6: [{"product": "Widget", "qty": 2, "unit_price": 9.99}, {"product": "Thingamajig", "qty": 5, "unit_price": 4.50}],
+    7: [{"product": "Gadget", "qty": 3, "unit_price": 19.99}],
+    8: [{"product": "Thingamajig", "qty": 2, "unit_price": 4.50}],
+}
+
+_DISCOUNTS: dict[str, float] = {"gold": 0.20, "silver": 0.10, "bronze": 0.05}
+_TAX_RATES: dict[str, float] = {"EU": 0.21, "US": 0.08, "APAC": 0.10}
+
+
+def list_users() -> list[dict[str, Any]]:
+    """Return all users as a list of dictionaries.
+
+    Each entry has keys: id (int), name (str), region (str), tier (str).
+    """
+    return _USERS
+
+
+def get_orders_for_user(
+    user_id: Annotated[int, "The user id whose orders to retrieve."],
+) -> list[dict[str, Any]]:
+    """Return the user's orders as a list of dictionaries.
+
+    Each entry has keys: product (str), qty (int), unit_price (float).
+    """
+    return _ORDERS.get(user_id, [])
+
+
+def get_discount_rate(
+    tier: Annotated[Literal["gold", "silver", "bronze"], "The customer tier."],
+) -> float:
+    """Return the discount rate as a float fraction (e.g. 0.2 for 20%)."""
+    return _DISCOUNTS[tier]
+
+
+def get_tax_rate(
+    region: Annotated[Literal["EU", "US", "APAC"], "The region code."],
+) -> float:
+    """Return the tax rate as a float fraction (e.g. 0.21 for 21%)."""
+    return _TAX_RATES[region]
+
+
+def compute_line_total(
+    qty: Annotated[int, "Line item quantity."],
+    unit_price: Annotated[float, "Line item unit price."],
+    discount_rate: Annotated[float, "Discount rate as a fraction (e.g. 0.2 for 20%)."],
+    tax_rate: Annotated[float, "Tax rate as a fraction (e.g. 0.21 for 21%)."],
+) -> float:
+    """Compute a single order line total.
+
+    Formula: qty * unit_price * (1 - discount_rate) * (1 + tax_rate), rounded to 2 decimals.
+    """
+    subtotal = qty * unit_price
+    discounted = subtotal * (1.0 - discount_rate)
+    return round(discounted * (1.0 + tax_rate), 2)
+
+
+TOOLS = [list_users, get_orders_for_user, get_discount_rate, get_tax_rate, compute_line_total]
+
+
+# 2. Structured output schema shared between both runs.
+
+
+class UserTotal(BaseModel):
+    """A user's grand total of all their orders."""
+
+    user_id: int = Field(description="The user's id.")
+    name: str = Field(description="The user's display name.")
+    grand_total: float = Field(description="Sum of all line totals, rounded to 2 decimals.")
+
+
+class UserGrandTotals(BaseModel):
+    """Structured output schema for both runs."""
+
+    results: list[UserTotal] = Field(description="One entry per user, sorted by grand_total descending.")
+
+
+INSTRUCTIONS = "You are a careful assistant. Use the provided tools for every lookup and computation."
+
+BENCHMARK_PROMPT = (
+    "For every user in our system (there are 8 of them), compute the grand total of all their orders. "
+    "Use the compute_line_total tool for each user's orders, after looking up the relevant discount and "
+    "tax rates for that user. "
+    "Use the provided tools for EVERY data lookup (users, orders, discount rates, tax rates) and for EVERY "
+    "line-total computation via compute_line_total — do not invent values or hardcode any numbers. "
+    "The total per order item should apply the discount first and then the tax "
+    "(e.g. total = qty * unit_price * (1-discount) * (1+tax)). "
+    "Return one entry per user, sorted by grand_total descending."
+)
+
+
+def get_client() -> FoundryChatClient:
+    """Create a FoundryChatClient from environment variables."""
+    return FoundryChatClient(
+        project_endpoint=os.environ["FOUNDRY_PROJECT_ENDPOINT"],
+        model=os.environ["FOUNDRY_MODEL"],
+        credential=AzureCliCredential(),
+    )
+
+
+# 3. Two runners that share the same tools, prompt, and structured output schema.
+
+
+async def _run_traditional() -> tuple[float, AgentResponse]:
+    agent = Agent(
+        client=get_client(),
+        name="TraditionalAgent",
+        instructions=INSTRUCTIONS,
+        tools=TOOLS,
+        default_options={"response_format": UserGrandTotals},
+    )
+    start = time.perf_counter()
+    result = await agent.run(BENCHMARK_PROMPT)
+    elapsed = time.perf_counter() - start
+    return elapsed, result
+
+
+async def _run_codeact() -> tuple[float, AgentResponse]:
+    codeact = HyperlightCodeActProvider(
+        tools=TOOLS,
+        approval_mode="never_require",
+    )
+    agent = Agent(
+        client=get_client(),
+        name="CodeActAgent",
+        instructions=INSTRUCTIONS,
+        context_providers=[codeact],
+        default_options={"response_format": UserGrandTotals},
+    )
+    start = time.perf_counter()
+    result = await agent.run(BENCHMARK_PROMPT)
+    elapsed = time.perf_counter() - start
+    return elapsed, result
+
+
+# 4. Report results side by side.
+
+
+def _print_section(title: str) -> None:
+    bar = "=" * 70
+    print(f"\n{bar}\n{title}\n{bar}")
+
+
+def _format_usage(usage: UsageDetails | None) -> str:
+    if usage is None:
+        return "usage=<none>"
+    return (
+        f"input={usage.get('input_token_count') or 0:>6} "
+        f"output={usage.get('output_token_count') or 0:>6} "
+        f"total={usage.get('total_token_count') or 0:>6}"
+    )
+
+
+def _print_results(result: AgentResponse) -> None:
+    if result.value is not None:
+        for row in result.value.results:
+            print(f"  user_id={row.user_id:>2}  name={row.name:<8}  grand_total={row.grand_total:>8.2f}")
+    else:
+        print(result.text)
+
+
+async def main() -> None:
+    """Run the benchmark and print a comparison."""
+    trad_time, trad_result = await _run_traditional()
+    code_time, code_result = await _run_codeact()
+
+    _print_section("Traditional tool-calling")
+    print(f"time={trad_time:7.2f}s  {_format_usage(trad_result.usage_details)}")
+    _print_results(trad_result)
+
+    _print_section("CodeAct (HyperlightCodeActProvider)")
+    print(f"time={code_time:7.2f}s  {_format_usage(code_result.usage_details)}")
+    _print_results(code_result)
+
+    _print_section("Comparison")
+    trad_total = (trad_result.usage_details or {}).get("total_token_count") or 0
+    code_total = (code_result.usage_details or {}).get("total_token_count") or 0
+
+    def pct(new: float, old: float) -> str:
+        if old == 0:
+            return "n/a"
+        delta = (new - old) / old * 100
+        sign = "+" if delta >= 0 else ""
+        return f"{sign}{delta:.1f}%"
+
+    print(f"time   : traditional={trad_time:7.2f}s   codeact={code_time:7.2f}s   delta={pct(code_time, trad_time)}")
+    print(f"tokens : traditional={trad_total:7d}    codeact={code_total:7d}    delta={pct(code_total, trad_total)}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/packages/hyperlight/samples/codeact_context_provider.py b/python/packages/hyperlight/samples/codeact_context_provider.py
@@ -72,15 +72,11 @@ async def log_function_calls(
 
     result = context.result
     if function_name == "execute_code" and isinstance(result, list):
-        for item in result:
-            if item.type != "code_interpreter_tool_result":
-                continue
-
-            for output in item.outputs or []:
-                if output.type == "text" and output.text:
-                    print(f"{_GREEN}stdout:\n{output.text}{_RESET}")
-                if output.type == "error" and output.error_details:
-                    print(f"{_YELLOW}stderr:\n{output.error_details}{_RESET}")
+        for output in result:
+            if output.type == "text" and output.text:
+                print(f"{_GREEN}stdout:\n{output.text}{_RESET}")
+            elif output.type == "error" and output.error_details:
+                print(f"{_YELLOW}stderr:\n{output.error_details}{_RESET}")
     else:
         print(f"{_YELLOW}◀ {function_name} → {result!r}{_RESET}")
 
diff --git a/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py b/python/packages/hyperlight/tests/hyperlight/test_hyperlight_codeact.py
diff --git a/python/uv.lock b/python/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ classifiers = [`
`24`	`24`	`dependencies = [`
`25`	`25`	`"agent-framework-core>=1.0.0,<2",`
`26`	`26`	`"hyperlight-sandbox>=0.3.0,<0.4",`
`27`		`- "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; (sys_platform == 'linux' or sys_platform == 'win32') and python_version < '3.14'",`
	`27`	`+ "hyperlight-sandbox-backend-wasm>=0.3.0,<0.4 ; ((sys_platform == 'linux' and platform_machine == 'x86_64') or (sys_platform == 'win32' and platform_machine == 'AMD64')) and python_version < '3.14'",`
`28`	`28`	`"hyperlight-sandbox-python-guest>=0.3.0,<0.4",`
`29`	`29`	`]`
`30`	`30`