From 1fcd7f05a10b55c14a8044ac4d8d1b32eb222866 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Wed, 15 Apr 2026 00:36:02 -0700 Subject: [PATCH 01/16] init Signed-off-by: cmunley1 --- README.md | 2 + docs/resources-server/gymnasium-api.md | 288 ++++++++++++++++++ resources_servers/base_gymnasium/README.md | 9 + resources_servers/base_gymnasium/__init__.py | 99 ++++++ .../configs/base_gymnasium.yaml | 22 ++ .../base_gymnasium/data/.gitignore | 5 + .../base_gymnasium/data/example.jsonl | 5 + .../base_gymnasium/data/example_metrics.json | 38 +++ .../data/example_rollouts.jsonl | 5 + .../base_gymnasium/requirements.txt | 1 + .../base_gymnasium/tests/__init__.py | 0 .../base_gymnasium/tests/test_app.py | 33 ++ resources_servers/blackjack/README.md | 20 ++ resources_servers/blackjack/app.py | 118 +++++++ .../blackjack/configs/blackjack.yaml | 22 ++ resources_servers/blackjack/data/.gitignore | 5 + .../blackjack/data/example.jsonl | 5 + .../blackjack/data/example_metrics.json | 38 +++ .../blackjack/data/example_rollouts.jsonl | 5 + .../example_rollouts_aggregate_metrics.json | 87 ++++++ ...example_rollouts_materialized_inputs.jsonl | 5 + resources_servers/blackjack/requirements.txt | 1 + resources_servers/blackjack/tests/__init__.py | 0 resources_servers/blackjack/tests/test_app.py | 25 ++ .../gymnasium_agent/README.md | 5 + .../gymnasium_agent/__init__.py | 0 responses_api_agents/gymnasium_agent/app.py | 182 +++++++++++ .../configs/gymnasium_agent.yaml | 4 + .../gymnasium_agent/requirements.txt | 1 + .../gymnasium_agent/tests/__init__.py | 0 .../gymnasium_agent/tests/test_app.py | 48 +++ 31 files changed, 1078 insertions(+) create mode 100644 docs/resources-server/gymnasium-api.md create mode 100644 resources_servers/base_gymnasium/README.md create mode 100644 resources_servers/base_gymnasium/__init__.py create mode 100644 resources_servers/base_gymnasium/configs/base_gymnasium.yaml create mode 100644 resources_servers/base_gymnasium/data/.gitignore create mode 100644 resources_servers/base_gymnasium/data/example.jsonl create mode 100644 resources_servers/base_gymnasium/data/example_metrics.json create mode 100644 resources_servers/base_gymnasium/data/example_rollouts.jsonl create mode 100644 resources_servers/base_gymnasium/requirements.txt create mode 100644 resources_servers/base_gymnasium/tests/__init__.py create mode 100644 resources_servers/base_gymnasium/tests/test_app.py create mode 100644 resources_servers/blackjack/README.md create mode 100644 resources_servers/blackjack/app.py create mode 100644 resources_servers/blackjack/configs/blackjack.yaml create mode 100644 resources_servers/blackjack/data/.gitignore create mode 100644 resources_servers/blackjack/data/example.jsonl create mode 100644 resources_servers/blackjack/data/example_metrics.json create mode 100644 resources_servers/blackjack/data/example_rollouts.jsonl create mode 100644 resources_servers/blackjack/data/example_rollouts_aggregate_metrics.json create mode 100644 resources_servers/blackjack/data/example_rollouts_materialized_inputs.jsonl create mode 100644 resources_servers/blackjack/requirements.txt create mode 100644 resources_servers/blackjack/tests/__init__.py create mode 100644 resources_servers/blackjack/tests/test_app.py create mode 100644 responses_api_agents/gymnasium_agent/README.md create mode 100644 responses_api_agents/gymnasium_agent/__init__.py create mode 100644 responses_api_agents/gymnasium_agent/app.py create mode 100644 responses_api_agents/gymnasium_agent/configs/gymnasium_agent.yaml create mode 100644 responses_api_agents/gymnasium_agent/requirements.txt create mode 100644 responses_api_agents/gymnasium_agent/tests/__init__.py create mode 100644 responses_api_agents/gymnasium_agent/tests/test_app.py diff --git a/README.md b/README.md index 8766e97f0..02c0c5ffb 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,8 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace). | Arc Agi | knowledge | Solve puzzles designed to test intelligence. See https://arcprize.org/arc-agi. | Improve puzzle-solving capabilities. | - | ✓ | - | arc_agi.yaml | - | | Aviary | agent | Multi-hop question answering on the HotPotQA dataset with Wikipedia search | Improve knowledge and agentic capability | ✓ | ✓ | Apache 2.0 | hotpotqa_aviary.yaml | - | | Aviary | math | GSM8k benchmark with calculator tool | Test math and agentic capability | ✓ | ✓ | Apache 2.0 | gsm8k_aviary.yaml | - | +| Base Gymnasium | other | Base class for Gymnasium-style servers. Not a standalone server. | - | - | - | - | base_gymnasium.yaml | - | +| Blackjack | games | Blackjack. Model hits or stands. Reward +1 win, 0 draw, -1 loss/bust. | - | - | - | - | blackjack.yaml | - | | Calendar | agent | Multi-turn calendar scheduling dataset. User states events and constraints in natural language; model schedules events to satisfy all constraints. | Improve multi-turn instruction following capabilities | ✓ | ✓ | Apache 2.0 | calendar.yaml | Nemotron-RL-agent-calendar_scheduling | | Calendar | agent | Multi-turn calendar scheduling dataset. User states events and constraints in natural language; model schedules events to satisfy all constraints. | Improve multi-turn instruction following capabilities | ✓ | ✓ | Creative Commons Attribution 4.0 International | calendar_v2.yaml | Nemotron-RL-Instruction-Following-Calendar-v2 | | Circle Click | other | Click on circles in images | - | - | - | - | circle_click.yaml | - | diff --git a/docs/resources-server/gymnasium-api.md b/docs/resources-server/gymnasium-api.md new file mode 100644 index 000000000..0fbd1a383 --- /dev/null +++ b/docs/resources-server/gymnasium-api.md @@ -0,0 +1,288 @@ +(gymnasium-api)= + +# Gymnasium API + + + +`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, pair with `gymnasium_agent`. + +## Interface + +```python +from resources_servers.base_gymnasium import GymnasiumServer +from nemo_gym.openai_utils import NeMoGymResponse + +class MyEnv(GymnasiumServer): + async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: + # Called once at the start of each episode. + # Return (observation, info). + # None observation = use responses_create_params.input as-is. + return None, {} + + async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: + # Called after each model response. + # Return (observation, reward, terminated, truncated, info). + # observation: next prompt for the model, or None if episode is over. + # reward is only used when terminated=True. + # truncated=True means the episode hit the step limit. + ... +``` + +`metadata` contains the extra fields from `verifier_metadata` in your JSONL input. Access them via `metadata.get("field_name")`. + +`session_id` is a unique string per rollout, used to store and retrieve per-episode state. Stateless environments can ignore it. + +`reset()` is optional. The default implementation returns `(None, {})`. + +## Single-step + +Single-step environments are the common non agentic use case: one model call, then grade the output. Implement `step()` so it always returns `terminated=True`. + +```python +class MySingleStepEnv(GymnasiumServer): + async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None): + response_text = _extract_text(action) + reward = 1.0 if metadata.get("answer") in response_text else 0.0 + return None, reward, True, False, {} +``` + +## Multi-step with action tags + +Multiple model calls per episode without tool calling. The model uses `` tags in its output, `step()` parses them and returns the next observation or terminates. + +```python +import re +from pydantic import Field + +class BlackjackEnv(GymnasiumServer): + session_state: dict = Field(default_factory=dict) + + async def reset(self, metadata, session_id=None): + hand = deal_hand() + self.session_state[session_id] = hand + return f"Your hand: {hand}. hit or stand?", {} + + async def step(self, action, metadata, session_id=None): + text = _extract_text(action) + m = re.search(r"\s*(hit|stand)\s*", text, re.IGNORECASE) + decision = m.group(1).lower() if m else "stand" + + hand = self.session_state[session_id] + if decision == "hit": + hand = hit(hand) + if bust(hand): + return None, -1.0, True, False, {} + return f"Your hand: {hand}. hit or stand?", 0.0, False, False, {} + + reward = score_against_dealer(hand) + return None, reward, True, False, {} +``` + +## Tool-use + +`action` is the full `NeMoGymResponse` from the model server, including any function calls already parsed by the model server's tool call parser. `step()` checks `action.output` for items with `type == "function_call"`, executes them, and returns the results as the next observation. Tool schemas go in `responses_create_params.tools` in your JSONL data so the model knows what tools are available. + +```python +import json +from pydantic import Field + +class MyToolEnv(GymnasiumServer): + session_state: dict = Field(default_factory=dict) + + async def reset(self, metadata, session_id=None): + self.session_state[session_id] = initialize(metadata) + return None, {} + + async def step(self, action, metadata, session_id=None): + tool_calls = [o for o in action.output if o.type == "function_call"] + + if tool_calls: + state = self.session_state[session_id] + results = [] + for call in tool_calls: + args = json.loads(call.arguments) + result = state["functions"][call.name](**args) + results.append(f"{call.name} -> {result}") + return "\n".join(results), 0.0, False, False, {} + + reward = self._grade(action, metadata) + return None, reward, True, False, {} +``` + +## Multi-turn + +`step()` returns the next user message as the observation. The `gymnasium_agent` appends it to the conversation and calls the model again. Return `None` to end. + +```python +class MyMultiTurnEnv(GymnasiumServer): + session_turns: dict = Field(default_factory=dict) + + async def reset(self, metadata, session_id=None): + self.session_turns[session_id] = 0 + return None, {} + + async def step(self, action, metadata, session_id=None): + follow_ups = metadata.get("follow_ups", []) + turn = self.session_turns.get(session_id, 0) + + if turn < len(follow_ups): + self.session_turns[session_id] = turn + 1 + return follow_ups[turn], 0.0, False, False, {} + + reward = self._grade(action, metadata) + return None, reward, True, False, {} +``` + +## Multi-turn with user model + +Instead of scripted follow-ups, call a second LLM to generate user replies. The policy never sees the user model's internals. Configure the user model as a separate model server in YAML. + +```python +from nemo_gym.server_utils import get_response_json, raise_for_status + +class UserSimEnv(GymnasiumServer): + user_model_server: str = "user_model" + + async def step(self, action, metadata, session_id=None): + assistant_text = _extract_text(action) + resp = await self.server_client.post( + server_name=self.user_model_server, + url_path="/v1/responses", + json={"input": [ + {"role": "system", "content": metadata.get("user_persona", "You are a user.")}, + {"role": "assistant", "content": assistant_text}, + ]}, + ) + await raise_for_status(resp) + data = await get_response_json(resp) + user_reply = _extract_text_from_json(data) + + if is_done(user_reply, metadata): + reward = self._grade(action, metadata) + return None, reward, True, False, {} + + return user_reply, 0.0, False, False, {} +``` + +## Multi-turn with user model and tools + +The user model can also call tools that the policy model does not have access to. For example, the user model might query a database or check a schedule to formulate realistic follow-up questions. The policy only sees the resulting text, not the tool calls or their outputs. + +Configure the user model's tool schemas in the `step()` call, not in the policy's `responses_create_params.tools`. + +```python +class UserWithToolsEnv(GymnasiumServer): + user_model_server: str = "user_model" + session_state: dict = Field(default_factory=dict) + + async def reset(self, metadata, session_id=None): + self.session_state[session_id] = initialize_user_tools(metadata) + return None, {} + + async def step(self, action, metadata, session_id=None): + assistant_text = _extract_text(action) + state = self.session_state[session_id] + + # User model gets its own tool schemas (hidden from the policy) + resp = await self.server_client.post( + server_name=self.user_model_server, + url_path="/v1/responses", + json={ + "input": [ + {"role": "system", "content": state["user_system_prompt"]}, + {"role": "assistant", "content": assistant_text}, + ], + "tools": state["user_tools"], # only the user model sees these + }, + ) + await raise_for_status(resp) + data = await get_response_json(resp) + + # Execute any tool calls the user model made + user_reply = await self._run_user_tool_loop(data, state) + + if is_done(user_reply, metadata): + reward = self._grade(action, metadata) + return None, reward, True, False, {} + + return user_reply, 0.0, False, False, {} +``` + +The policy sees `user_reply` as a plain user message. It has no knowledge of the user model's tools, tool calls, or tool results. This enables asymmetric information between the policy and the simulated user. + +## LLM-as-judge + +Use `step()` to call a judge model via `self.server_client` and score the output. The judge model must be configured as a separate model server. + +```python +class MyJudgeEnv(GymnasiumServer): + judge_server: str = "judge_model" # name of the model server in YAML + + async def step(self, action, metadata, session_id=None): + response_text = _extract_text(action) + judge_input = f"Question: {metadata.get('question')}\nAnswer: {response_text}\nIs this correct? Say YES or NO." + judge_resp = await self.server_client.post( + server_name=self.judge_server, + url_path="/v1/responses", + json={"input": [{"role": "user", "content": judge_input}]}, + ) + judgment = await judge_resp.json() + reward = 1.0 if "YES" in str(judgment.get("output_text", "")).upper() else 0.0 + return None, reward, True, False, {} +``` + +## Reward model + +Same pattern. Call a reward model endpoint and use its score directly. + +```python +class MyRewardModelEnv(GymnasiumServer): + rm_server: str = "reward_model" + + async def step(self, action, metadata, session_id=None): + resp = await self.server_client.post( + server_name=self.rm_server, + url_path="/v1/score", + json={"input": metadata.get("prompt"), "response": _extract_text(action)}, + ) + score = (await resp.json()).get("score", 0.0) + return None, score, True, False, {} +``` + +## YAML configuration + +`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. The agent config references the env server via `env_server`. + +```yaml +my_env_instance: + resources_servers: + my_env: + entrypoint: app.py + domain: knowledge + +my_gymnasium_agent_instance: + responses_api_agents: + gymnasium_agent: + entrypoint: app.py + env_server: + type: resources_servers + name: my_env + model_server: + type: responses_api_models + name: policy_model + max_steps: 10 + datasets: + - name: example + type: example + jsonl_fpath: resources_servers/my_env/data/example.jsonl +``` + +## Examples + +Working implementations are in the repository: + +- `resources_servers/reasoning_gym_env/` -- single-step grading +- `resources_servers/workplace_assistant_env/` -- stateful tool dispatch +- `resources_servers/example_multi_turn_env/` -- scripted multi-turn +- `resources_servers/blackjack_env/` -- multi-step game with action tags +- `resources_servers/tictactoe_env/` -- adversarial game diff --git a/resources_servers/base_gymnasium/README.md b/resources_servers/base_gymnasium/README.md new file mode 100644 index 000000000..c4fb04e3a --- /dev/null +++ b/resources_servers/base_gymnasium/README.md @@ -0,0 +1,9 @@ +# Gymnasium + +Base class (`GymnasiumServer`) for gymnasium-style resources servers. Not a standalone server. Import from here: + +```python +from resources_servers.base_gymnasium import GymnasiumServer +``` + +See `docs/resources-server/gymnasium-api.md` for usage. diff --git a/resources_servers/base_gymnasium/__init__.py b/resources_servers/base_gymnasium/__init__.py new file mode 100644 index 000000000..99734b299 --- /dev/null +++ b/resources_servers/base_gymnasium/__init__.py @@ -0,0 +1,99 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Optional + +from fastapi import FastAPI, Request +from pydantic import BaseModel, ConfigDict + +from nemo_gym.base_resources_server import BaseVerifyRequest, SimpleResourcesServer +from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming +from nemo_gym.server_utils import SESSION_ID_KEY + + +class EnvResetRequest(BaseModel): + model_config = ConfigDict(extra="allow") + responses_create_params: NeMoGymResponseCreateParamsNonStreaming + + +class EnvResetResponse(BaseModel): + observation: Optional[str] = None + info: dict = {} + + +class EnvStepRequest(BaseModel): + model_config = ConfigDict(extra="allow") + responses_create_params: NeMoGymResponseCreateParamsNonStreaming + response: NeMoGymResponse + + +class EnvStepResponse(BaseModel): + observation: Optional[str] = None + reward: float = 0.0 + terminated: bool = False + truncated: bool = False + info: dict = {} + + +def extract_text(response: NeMoGymResponse) -> str: + """Extract all text content from a NeMoGymResponse.""" + parts = [] + for item in response.output: + if item.type == "message": + content = item.content + if isinstance(content, str): + parts.append(content) + else: + for c in content: + if c.type == "output_text": + parts.append(c.text) + return "".join(parts) + + +class GymnasiumServer(SimpleResourcesServer): + """Gymnasium-style base class. Pair with gymnasium_agent. + + step() returns (observation, reward, terminated, truncated, info). + """ + + def setup_webserver(self) -> FastAPI: + app = FastAPI() + self.setup_session_middleware(app) + app.post("/reset")(self._reset_endpoint) + app.post("/step")(self._step_endpoint) + app.post("/aggregate_metrics")(self.aggregate_metrics) + return app + + async def _reset_endpoint(self, body: EnvResetRequest, request: Request) -> EnvResetResponse: + session_id = request.session.get(SESSION_ID_KEY) + obs, info = await self.reset(body.model_extra or {}, session_id) + return EnvResetResponse(observation=obs, info=info) + + async def _step_endpoint(self, body: EnvStepRequest, request: Request) -> EnvStepResponse: + session_id = request.session.get(SESSION_ID_KEY) + obs, reward, terminated, truncated, info = await self.step(body.response, body.model_extra or {}, session_id) + return EnvStepResponse(observation=obs, reward=reward, terminated=terminated, truncated=truncated, info=info) + + async def reset(self, metadata: dict, session_id: Optional[str] = None) -> tuple[Optional[str], dict]: + return None, {} + + @abstractmethod + async def step( + self, action: NeMoGymResponse, metadata: dict, session_id: Optional[str] = None + ) -> tuple[Optional[str], float, bool, bool, dict]: ... + + async def verify(self, body: BaseVerifyRequest) -> None: # type: ignore[override] + raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Pair with gymnasium_agent.") diff --git a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml new file mode 100644 index 000000000..4c79b9044 --- /dev/null +++ b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml @@ -0,0 +1,22 @@ +base_gymnasium: + resources_servers: + base_gymnasium: + entrypoint: __init__.py + domain: other + verified: false + description: Base class for Gymnasium-style servers. Not a standalone server. +base_gymnasium_agent: + responses_api_agents: + gymnasium_agent: + entrypoint: app.py + env_server: + type: resources_servers + name: base_gymnasium + model_server: + type: responses_api_models + name: policy_model + max_steps: 1 + datasets: + - name: example + type: example + jsonl_fpath: resources_servers/base_gymnasium/data/example.jsonl diff --git a/resources_servers/base_gymnasium/data/.gitignore b/resources_servers/base_gymnasium/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/resources_servers/base_gymnasium/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/resources_servers/base_gymnasium/data/example.jsonl b/resources_servers/base_gymnasium/data/example.jsonl new file mode 100644 index 000000000..0e2c6d230 --- /dev/null +++ b/resources_servers/base_gymnasium/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "agent_ref": {"type": "responses_api_agents", "name": "example_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "agent_ref": {"type": "responses_api_agents", "name": "example_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "agent_ref": {"type": "responses_api_agents", "name": "example_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "agent_ref": {"type": "responses_api_agents", "name": "example_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "agent_ref": {"type": "responses_api_agents", "name": "example_gymnasium_agent"}} diff --git a/resources_servers/base_gymnasium/data/example_metrics.json b/resources_servers/base_gymnasium/data/example_metrics.json new file mode 100644 index 000000000..a6b7d4e3e --- /dev/null +++ b/resources_servers/base_gymnasium/data/example_metrics.json @@ -0,0 +1,38 @@ +{ + "name": "example", + "type": "example", + "jsonl_fpath": "resources_servers/example_gymnasium/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": null, + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 5.0, + "Min": 5.0, + "Max": 5.0, + "Standard deviation": 0.0 + }, + "Number of turns": { + "Total # non-null values": 5, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + } +} \ No newline at end of file diff --git a/resources_servers/base_gymnasium/data/example_rollouts.jsonl b/resources_servers/base_gymnasium/data/example_rollouts.jsonl new file mode 100644 index 000000000..c8d8aafcc --- /dev/null +++ b/resources_servers/base_gymnasium/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"reward": 0.0, "terminated": true, "truncated": false, "responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "response": {"id": "x", "created_at": 0, "model": "x", "object": "response", "output": [], "parallel_tool_calls": true, "tool_choice": "auto", "tools": []}} +{"reward": 0.0, "terminated": true, "truncated": false, "responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "response": {"id": "x", "created_at": 0, "model": "x", "object": "response", "output": [], "parallel_tool_calls": true, "tool_choice": "auto", "tools": []}} +{"reward": 0.0, "terminated": true, "truncated": false, "responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "response": {"id": "x", "created_at": 0, "model": "x", "object": "response", "output": [], "parallel_tool_calls": true, "tool_choice": "auto", "tools": []}} +{"reward": 0.0, "terminated": true, "truncated": false, "responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "response": {"id": "x", "created_at": 0, "model": "x", "object": "response", "output": [], "parallel_tool_calls": true, "tool_choice": "auto", "tools": []}} +{"reward": 0.0, "terminated": true, "truncated": false, "responses_create_params": {"input": [{"role": "user", "content": "Hello"}]}, "response": {"id": "x", "created_at": 0, "model": "x", "object": "response", "output": [], "parallel_tool_calls": true, "tool_choice": "auto", "tools": []}} diff --git a/resources_servers/base_gymnasium/requirements.txt b/resources_servers/base_gymnasium/requirements.txt new file mode 100644 index 000000000..00ed83213 --- /dev/null +++ b/resources_servers/base_gymnasium/requirements.txt @@ -0,0 +1 @@ +-e nemo-gym[dev] @ ../../ diff --git a/resources_servers/base_gymnasium/tests/__init__.py b/resources_servers/base_gymnasium/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/resources_servers/base_gymnasium/tests/test_app.py b/resources_servers/base_gymnasium/tests/test_app.py new file mode 100644 index 000000000..a2f0677e9 --- /dev/null +++ b/resources_servers/base_gymnasium/tests/test_app.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest.mock import MagicMock + +from nemo_gym.base_resources_server import BaseResourcesServerConfig +from nemo_gym.server_utils import ServerClient +from resources_servers.base_gymnasium import GymnasiumServer + + +class TestGymnasiumServer: + def test_sanity(self) -> None: + class ConcreteEnv(GymnasiumServer): + async def step(self, action, metadata, session_id=None): + return None, 1.0, True, False, {} + + config = BaseResourcesServerConfig(host="", port=0, entrypoint="", name="") + env = ConcreteEnv(config=config, server_client=MagicMock(spec=ServerClient)) + app = env.setup_webserver() + routes = {r.path for r in app.routes} + assert "/reset" in routes + assert "/step" in routes diff --git a/resources_servers/blackjack/README.md b/resources_servers/blackjack/README.md new file mode 100644 index 000000000..e9068b519 --- /dev/null +++ b/resources_servers/blackjack/README.md @@ -0,0 +1,20 @@ +# Blackjack Env + +Multi-step gymnasium-style Model hits or stands using `` tags until the hand ends. Game state managed per session. + +Example data provided in `data/example.jsonl` (system prompt only, no verifier_metadata needed). No train/validation data. + +## Run + +```bash +ng_run "+config_paths=[resources_servers/blackjack/configs/blackjack.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" +``` + +## Collect rollouts + +```bash +ng_collect_rollouts \ + +agent_name=blackjack_gymnasium_agent \ + +input_jsonl_fpath=resources_servers/blackjack/data/example.jsonl \ + +output_jsonl_fpath=results/blackjack_rollouts.jsonl +``` diff --git a/resources_servers/blackjack/app.py b/resources_servers/blackjack/app.py new file mode 100644 index 000000000..551e4b891 --- /dev/null +++ b/resources_servers/blackjack/app.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Blackjack environment. + +Multi-step episode: the model hits or stands until the hand ends. +Reward: +1 win, 0 draw, -1 loss/bust. + +No verifier_metadata required. The JSONL just needs a system prompt. +""" + +import random +import re +from typing import Dict, Optional + +from pydantic import Field + +from nemo_gym.openai_utils import NeMoGymResponse +from resources_servers.base_gymnasium import GymnasiumServer, extract_text + + +_RANKS = ["2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K", "A"] + + +def _deal(): + return random.choice(_RANKS) + + +def _hand_value(hand: list[str]) -> int: + total = sum(10 if r in ("J", "Q", "K") else 11 if r == "A" else int(r) for r in hand) + aces = hand.count("A") + while total > 21 and aces: + total -= 10 + aces -= 1 + return total + + +def _fmt(hand: list[str]) -> str: + return "[" + ", ".join(hand) + "]" + + +class BlackjackEnv(GymnasiumServer): + session_state: Dict[str, dict] = Field(default_factory=dict) + + async def reset(self, metadata: dict, session_id: Optional[str] = None) -> tuple[Optional[str], dict]: + player = [_deal(), _deal()] + dealer = [_deal(), _deal()] + self.session_state[session_id] = {"player": player, "dealer": dealer} + obs = ( + f"Your hand: {_fmt(player)} = {_hand_value(player)}\n" + f"Dealer shows: {dealer[0]}\n" + f"Respond with hit or stand." + ) + return obs, {} + + async def step( + self, action: NeMoGymResponse, metadata: dict, session_id: Optional[str] = None + ) -> tuple[Optional[str], float, bool, bool, dict]: + state = self.session_state.get(session_id, {}) + player = state.get("player", []) + dealer = state.get("dealer", []) + text = extract_text(action) + m = re.search(r"\s*(hit|stand)\s*", text, re.IGNORECASE) + decision = m.group(1).lower() if m else ("hit" if "hit" in text.lower() else "stand") + + if decision == "hit": + player.append(_deal()) + val = _hand_value(player) + if val > 21: + return None, -1.0, True, False, {"result": "bust", "player": _fmt(player), "value": val} + obs = ( + f"Your hand: {_fmt(player)} = {val}\n" + f"Dealer shows: {dealer[0]}\n" + f"Respond with hit or stand." + ) + return obs, 0.0, False, False, {} + + # stand: dealer plays out + while _hand_value(dealer) < 17: + dealer.append(_deal()) + + pv, dv = _hand_value(player), _hand_value(dealer) + if dv > 21 or pv > dv: + reward, result = 1.0, "win" + elif pv == dv: + reward, result = 0.0, "draw" + else: + reward, result = -1.0, "loss" + + return ( + None, + reward, + True, + False, + { + "result": result, + "player": _fmt(player), + "player_value": pv, + "dealer": _fmt(dealer), + "dealer_value": dv, + }, + ) + + +if __name__ == "__main__": + BlackjackEnv.run_webserver() diff --git a/resources_servers/blackjack/configs/blackjack.yaml b/resources_servers/blackjack/configs/blackjack.yaml new file mode 100644 index 000000000..36d5609e1 --- /dev/null +++ b/resources_servers/blackjack/configs/blackjack.yaml @@ -0,0 +1,22 @@ +blackjack: + resources_servers: + blackjack: + entrypoint: app.py + domain: games + verified: false + description: Blackjack. Model hits or stands. Reward +1 win, 0 draw, -1 loss/bust. +blackjack_gymnasium_agent: + responses_api_agents: + gymnasium_agent: + entrypoint: app.py + env_server: + type: resources_servers + name: blackjack + model_server: + type: responses_api_models + name: policy_model + max_steps: 5 + datasets: + - name: example + type: example + jsonl_fpath: resources_servers/blackjack/data/example.jsonl diff --git a/resources_servers/blackjack/data/.gitignore b/resources_servers/blackjack/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/resources_servers/blackjack/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/resources_servers/blackjack/data/example.jsonl b/resources_servers/blackjack/data/example.jsonl new file mode 100644 index 000000000..f48a084e1 --- /dev/null +++ b/resources_servers/blackjack/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} +{"responses_create_params": {"input": [{"role": "system", "content": "You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."}, {"role": "user", "content": "Deal me in."}]}, "agent_ref": {"type": "responses_api_agents", "name": "blackjack_gymnasium_agent"}} diff --git a/resources_servers/blackjack/data/example_metrics.json b/resources_servers/blackjack/data/example_metrics.json new file mode 100644 index 000000000..ca1f0810a --- /dev/null +++ b/resources_servers/blackjack/data/example_metrics.json @@ -0,0 +1,38 @@ +{ + "name": "example", + "type": "example", + "jsonl_fpath": "resources_servers/blackjack/data/example.jsonl", + "num_repeats": 1, + "gitlab_identifier": null, + "huggingface_identifier": null, + "license": null, + "Number of examples": 5, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 5, + "Average": 30.0, + "Min": 30.0, + "Max": 30.0, + "Standard deviation": 0.0 + }, + "Number of turns": { + "Total # non-null values": 5, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Standard deviation": 0.0 + } +} \ No newline at end of file diff --git a/resources_servers/blackjack/data/example_rollouts.jsonl b/resources_servers/blackjack/data/example_rollouts.jsonl new file mode 100644 index 000000000..6175f946f --- /dev/null +++ b/resources_servers/blackjack/data/example_rollouts.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_0816d85808c14ee38cb42be5b1ae4863","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,19,11,220,17,60,284,220,21,198,93909,4933,25,220,19,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-0.0001250427303602919,-1.5497195136049413e-6,-1.1920928244535389e-7,-0.693234920501709,0.0,0.0,-3.576278118089249e-7,-3.2186455882765586e-6],"id":"msg_b266aeb4d0be4afc80b0931cb97cd083","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":44,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":52},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[4, 2]","player_value":6,"dealer":"[4, 2, 2, K]","dealer_value":18},"_ng_task_index":0,"_ng_rollout_index":1,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_c018550a3d174fe987939919762706f8","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,24,11,220,24,60,284,220,16,23,198,93909,4933,25,220,18,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-1.5497195136049413e-6,-1.6689286894688848e-6,0.0,-0.00012838016846217215,0.0,0.0,-2.3841855067985307e-7,-3.576272320060525e-6],"id":"msg_02f9a7cb0eac41a8b13c2c690534a43b","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":45,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":53},"user":null},"reward":1.0,"terminated":true,"truncated":false,"info":{"result":"win","player":"[9, 9]","player_value":18,"dealer":"[3, 9, 3, Q]","dealer_value":25},"_ng_task_index":0,"_ng_rollout_index":2,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_1f8b38af629d4252818b94897f4e599a","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,19,11,619,60,284,220,16,19,198,93909,4933,25,362,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-0.000011920858014491387,-2.0265558760002023e-6,0.0,-0.038049791008234024,0.0,0.0,-3.576278118089249e-7,-3.3378546504536644e-6],"id":"msg_9046c12edb9d48d992ce6cd8dbeed1eb","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":43,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":51},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[4, J]","player_value":14,"dealer":"[A, J]","dealer_value":21},"_ng_task_index":0,"_ng_rollout_index":4,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_1d3acaf11fe142ffaaddc9925de94d78","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,22,11,220,16,15,60,284,220,16,22,198,93909,4933,25,619,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-0.00004053033626405522,-1.6689286894688848e-6,0.0,-0.0011742371134459972,0.0,0.0,-2.3841855067985307e-7,-2.50339189733495e-6],"id":"msg_b2b71c2b4b5d4e059b77886de80db892","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":45,"input_tokens_details":{"cached_tokens":0},"output_tokens":8,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":53},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[7, 10]","player_value":17,"dealer":"[J, 5, 3]","dealer_value":18},"_ng_task_index":0,"_ng_rollout_index":3,"agent_ref":{"name":"blackjack_agent"}} +{"responses_create_params":{"background":null,"include":null,"input":[{"content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag.","role":"system","type":"message"},{"content":"Deal me in.","role":"user","type":"message"}],"instructions":null,"max_output_tokens":1024,"max_tool_calls":null,"metadata":null,"model":null,"parallel_tool_calls":true,"previous_response_id":null,"prompt":null,"reasoning":null,"service_tier":null,"store":null,"temperature":0.0,"text":null,"tool_choice":"auto","tools":[],"top_logprobs":null,"top_p":null,"truncation":null,"user":null,"stream":null},"response":{"id":"resp_a97b2558ac874f2ba5684a67e9b941c2","created_at":1775592436.0,"error":null,"incomplete_details":null,"instructions":null,"metadata":null,"model":"Qwen/Qwen3-4B-Instruct-2507","object":"response","output":[{"prompt_token_ids":[151644,872,198,7771,1424,25,508,22,11,1207,60,284,220,16,22,198,93909,4933,25,220,17,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,22492,522,1311,29,151645],"generation_log_probs":[-2.0265558760002023e-6,-1.6689286894688848e-6,-1.1920928244535389e-7,-0.10021035373210907,0.0,0.0,0.0,-5.722029527532868e-6],"id":"msg_601a2716de0a4eb38494d2131674c825","content":[{"annotations":[],"text":"hit","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"},{"prompt_token_ids":[151644,872,198,7771,1424,25,508,22,11,1207,60,284,220,16,22,198,93909,4933,25,220,17,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198,27,1311,29,22492,522,1311,29,151645,198,151644,872,198,7771,1424,25,508,22,11,1207,11,220,18,60,284,220,17,15,198,93909,4933,25,220,17,198,65354,448,366,1311,29,22492,522,1311,29,476,366,1311,29,2685,522,1311,14276,151645,198,151644,77091,198],"generation_token_ids":[27,1311,29,2685,522,1311,29,151645],"generation_log_probs":[-1.1920928244535389e-7,-2.3841855067985307e-7,-7.152555099310121e-7,0.0,0.0,-1.1920928244535389e-7,0.0,-1.6689286894688848e-6],"id":"msg_7de40025e0a7414594284d76461fe177","content":[{"annotations":[],"text":"stand","type":"output_text","logprobs":null}],"role":"assistant","status":"completed","type":"message"}],"parallel_tool_calls":true,"temperature":0.0,"tool_choice":"auto","tools":[],"top_p":null,"background":null,"conversation":null,"max_output_tokens":1024,"max_tool_calls":null,"previous_response_id":null,"prompt":null,"prompt_cache_key":null,"reasoning":null,"safety_identifier":null,"service_tier":null,"status":null,"text":null,"top_logprobs":null,"truncation":null,"usage":{"input_tokens":144,"input_tokens_details":{"cached_tokens":0},"output_tokens":16,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":160},"user":null},"reward":-1.0,"terminated":true,"truncated":false,"info":{"result":"loss","player":"[]","player_value":0,"dealer":"[8, A]","dealer_value":19},"_ng_task_index":0,"_ng_rollout_index":0,"agent_ref":{"name":"blackjack_agent"}} diff --git a/resources_servers/blackjack/data/example_rollouts_aggregate_metrics.json b/resources_servers/blackjack/data/example_rollouts_aggregate_metrics.json new file mode 100644 index 000000000..b48dc04c5 --- /dev/null +++ b/resources_servers/blackjack/data/example_rollouts_aggregate_metrics.json @@ -0,0 +1,87 @@ +[ + { + "agent_ref": { + "name": "blackjack_agent" + }, + "agent_metrics": { + "mean/reward": -0.6, + "mean/terminated": 1.0, + "mean/truncated": 0.0, + "mean/input_tokens": 64.2, + "mean/output_tokens": 9.6, + "mean/total_tokens": 73.8, + "max/reward": 1.0, + "max/terminated": 1.0, + "max/truncated": 0.0, + "max/input_tokens": 144.0, + "max/output_tokens": 16.0, + "max/total_tokens": 160.0, + "min/reward": -1.0, + "min/terminated": 1.0, + "min/truncated": 0.0, + "min/input_tokens": 43.0, + "min/output_tokens": 8.0, + "min/total_tokens": 51.0, + "median/reward": -1.0, + "median/terminated": 1.0, + "median/truncated": 0.0, + "median/input_tokens": 45.0, + "median/output_tokens": 8.0, + "median/total_tokens": 53.0, + "std/reward": 0.8944271909999161, + "std/terminated": 0.0, + "std/truncated": 0.0, + "std/input_tokens": 44.61726123374226, + "std/output_tokens": 3.5777087639996643, + "std/total_tokens": 48.19439801470706 + }, + "key_metrics": { + "mean/reward": -0.6, + "mean/terminated": 1.0, + "mean/truncated": 0.0, + "mean/input_tokens": 64.2, + "mean/output_tokens": 9.6, + "mean/total_tokens": 73.8 + }, + "group_level_metrics": [ + { + "mean/reward": -0.6, + "mean/terminated": 1.0, + "mean/truncated": 0.0, + "mean/input_tokens": 64.2, + "mean/output_tokens": 9.6, + "mean/total_tokens": 73.8, + "max/reward": 1.0, + "max/terminated": 1.0, + "max/truncated": 0.0, + "max/input_tokens": 144.0, + "max/output_tokens": 16.0, + "max/total_tokens": 160.0, + "min/reward": -1.0, + "min/terminated": 1.0, + "min/truncated": 0.0, + "min/input_tokens": 43.0, + "min/output_tokens": 8.0, + "min/total_tokens": 51.0, + "median/reward": -1.0, + "median/terminated": 1.0, + "median/truncated": 0.0, + "median/input_tokens": 45.0, + "median/output_tokens": 8.0, + "median/total_tokens": 53.0, + "std/reward": 0.8944271909999161, + "std/terminated": 0.0, + "std/truncated": 0.0, + "std/input_tokens": 44.61726123374226, + "std/output_tokens": 3.5777087639996643, + "std/total_tokens": 48.19439801470706, + "sample": { + "agent_ref": { + "name": "agent" + } + }, + "_ng_task_index": 0 + } + ] + } +] \ No newline at end of file diff --git a/resources_servers/blackjack/data/example_rollouts_materialized_inputs.jsonl b/resources_servers/blackjack/data/example_rollouts_materialized_inputs.jsonl new file mode 100644 index 000000000..d717903c4 --- /dev/null +++ b/resources_servers/blackjack/data/example_rollouts_materialized_inputs.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params":{"input":[{"role":"system","content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."},{"role":"user","content":"Deal me in."}],"max_output_tokens":1024,"temperature":0.0},"agent_ref":{"name":"blackjack_agent"},"_ng_task_index":0,"_ng_rollout_index":0} +{"responses_create_params":{"input":[{"role":"system","content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."},{"role":"user","content":"Deal me in."}],"max_output_tokens":1024,"temperature":0.0},"agent_ref":{"name":"blackjack_agent"},"_ng_task_index":0,"_ng_rollout_index":1} +{"responses_create_params":{"input":[{"role":"system","content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."},{"role":"user","content":"Deal me in."}],"max_output_tokens":1024,"temperature":0.0},"agent_ref":{"name":"blackjack_agent"},"_ng_task_index":0,"_ng_rollout_index":2} +{"responses_create_params":{"input":[{"role":"system","content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."},{"role":"user","content":"Deal me in."}],"max_output_tokens":1024,"temperature":0.0},"agent_ref":{"name":"blackjack_agent"},"_ng_task_index":0,"_ng_rollout_index":3} +{"responses_create_params":{"input":[{"role":"system","content":"You are playing Blackjack. After seeing your hand, respond with hit or stand. Think briefly, then give your action tag."},{"role":"user","content":"Deal me in."}],"max_output_tokens":1024,"temperature":0.0},"agent_ref":{"name":"blackjack_agent"},"_ng_task_index":0,"_ng_rollout_index":4} diff --git a/resources_servers/blackjack/requirements.txt b/resources_servers/blackjack/requirements.txt new file mode 100644 index 000000000..00ed83213 --- /dev/null +++ b/resources_servers/blackjack/requirements.txt @@ -0,0 +1 @@ +-e nemo-gym[dev] @ ../../ diff --git a/resources_servers/blackjack/tests/__init__.py b/resources_servers/blackjack/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/resources_servers/blackjack/tests/test_app.py b/resources_servers/blackjack/tests/test_app.py new file mode 100644 index 000000000..4187b1544 --- /dev/null +++ b/resources_servers/blackjack/tests/test_app.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from unittest.mock import MagicMock + +from nemo_gym.base_resources_server import BaseResourcesServerConfig +from nemo_gym.server_utils import ServerClient +from resources_servers.blackjack.app import BlackjackEnv + + +class TestApp: + def test_sanity(self) -> None: + config = BaseResourcesServerConfig(host="", port=0, entrypoint="", name="") + BlackjackEnv(config=config, server_client=MagicMock(spec=ServerClient)) diff --git a/responses_api_agents/gymnasium_agent/README.md b/responses_api_agents/gymnasium_agent/README.md new file mode 100644 index 000000000..ae3e3347a --- /dev/null +++ b/responses_api_agents/gymnasium_agent/README.md @@ -0,0 +1,5 @@ +# Gymnasium Agent + +Agent for `GymnasiumServer` resources servers. Drives the reset/step loop. + +See `docs/resources-server/gymnasium-api.md` for usage. diff --git a/responses_api_agents/gymnasium_agent/__init__.py b/responses_api_agents/gymnasium_agent/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/responses_api_agents/gymnasium_agent/app.py b/responses_api_agents/gymnasium_agent/app.py new file mode 100644 index 000000000..dbd7a578f --- /dev/null +++ b/responses_api_agents/gymnasium_agent/app.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Agent for GymnasiumServer resources servers (resources_servers.base_gymnasium) which implements the Gymnasium API.""" + +from fastapi import Body, Request, Response +from pydantic import ConfigDict + +from nemo_gym.base_resources_server import ( + BaseRunRequest, + BaseVerifyResponse, +) +from nemo_gym.base_responses_api_agent import BaseResponsesAPIAgentConfig, SimpleResponsesAPIAgent +from nemo_gym.config_types import AggregateMetrics, AggregateMetricsRequest, ModelServerRef, ResourcesServerRef +from nemo_gym.openai_utils import ( + NeMoGymEasyInputMessage, + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, +) +from nemo_gym.server_utils import get_response_json, raise_for_status +from resources_servers.base_gymnasium import EnvResetResponse, EnvStepResponse, extract_text + + +class GymnasiumAgentConfig(BaseResponsesAPIAgentConfig): + env_server: ResourcesServerRef + model_server: ModelServerRef + max_steps: int = 10 + + +class GymnasiumAgentRunRequest(BaseRunRequest): + model_config = ConfigDict(extra="allow") + + +class GymnasiumRunResponse(BaseVerifyResponse): + model_config = ConfigDict(extra="allow") + terminated: bool = False + truncated: bool = False + info: dict = {} + + +class GymnasiumAgent(SimpleResponsesAPIAgent): + config: GymnasiumAgentConfig + + async def responses( + self, + request: Request, + response: Response, + body: NeMoGymResponseCreateParamsNonStreaming = Body(), + ) -> NeMoGymResponse: + model_resp = await self.server_client.post( + server_name=self.config.model_server.name, + url_path="/v1/responses", + json=body, + cookies=request.cookies, + ) + await raise_for_status(model_resp) + result = NeMoGymResponse.model_validate(await get_response_json(model_resp)) + for k, v in model_resp.cookies.items(): + response.set_cookie(k, v) + return result + + async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> GymnasiumRunResponse: + cookies = request.cookies + + # --- reset --- + reset_resp = await self.server_client.post( + server_name=self.config.env_server.name, + url_path="/reset", + json=body.model_dump(), + cookies=cookies, + ) + await raise_for_status(reset_resp) + reset_data = EnvResetResponse.model_validate(await get_response_json(reset_resp)) + cookies = reset_resp.cookies + + current_input = body.responses_create_params.model_copy(deep=True) + if isinstance(current_input.input, str): + current_input = current_input.model_copy( + update={"input": [NeMoGymEasyInputMessage(role="user", content=current_input.input)]} + ) + if reset_data.observation: + current_input = current_input.model_copy( + update={ + "input": list(current_input.input) + + [NeMoGymEasyInputMessage(role="user", content=reset_data.observation)] + } + ) + + all_outputs = [] + usage = None + model_server_cookies = None + step_data = EnvStepResponse(terminated=False, truncated=True, reward=0.0) + last_model_response = None + + # --- loop --- + for _ in range(self.config.max_steps): + model_resp = await self.server_client.post( + server_name=self.config.model_server.name, + url_path="/v1/responses", + json=current_input, + cookies=model_server_cookies, + ) + await raise_for_status(model_resp) + model_response = NeMoGymResponse.model_validate(await get_response_json(model_resp)) + model_server_cookies = model_resp.cookies + last_model_response = model_response + + all_outputs.extend(model_response.output) + + if model_response.usage: + if usage is None: + usage = model_response.usage.model_copy(deep=True) + else: + usage.input_tokens += model_response.usage.input_tokens + usage.output_tokens += model_response.usage.output_tokens + usage.total_tokens += model_response.usage.total_tokens + usage.input_tokens_details.cached_tokens = 0 + usage.output_tokens_details.reasoning_tokens = 0 + + step_resp = await self.server_client.post( + server_name=self.config.env_server.name, + url_path="/step", + json=body.model_dump() | {"response": model_response.model_dump()}, + cookies=cookies, + ) + await raise_for_status(step_resp) + step_data = EnvStepResponse.model_validate(await get_response_json(step_resp)) + cookies = step_resp.cookies + + if step_data.terminated or step_data.truncated: + break + + if step_data.observation: + current_input = current_input.model_copy( + update={ + "input": list(current_input.input) + + [ + NeMoGymEasyInputMessage(role="assistant", content=extract_text(model_response)), + NeMoGymEasyInputMessage(role="user", content=step_data.observation), + ] + } + ) + + else: # for/else: loop completed without break, meaning max_steps exhausted + step_data = step_data.model_copy(update={"truncated": True}) + + last_model_response.output = all_outputs + last_model_response.usage = usage + + return GymnasiumRunResponse( + responses_create_params=body.responses_create_params, + response=last_model_response, + reward=step_data.reward, + terminated=step_data.terminated, + truncated=step_data.truncated, + info=step_data.info, + ) + + async def aggregate_metrics(self, body: AggregateMetricsRequest = Body()) -> AggregateMetrics: + response = await self.server_client.post( + server_name=self.config.env_server.name, + url_path="/aggregate_metrics", + json=body, + ) + await raise_for_status(response) + return AggregateMetrics.model_validate(await get_response_json(response)) + + +if __name__ == "__main__": + GymnasiumAgent.run_webserver() diff --git a/responses_api_agents/gymnasium_agent/configs/gymnasium_agent.yaml b/responses_api_agents/gymnasium_agent/configs/gymnasium_agent.yaml new file mode 100644 index 000000000..5a088281d --- /dev/null +++ b/responses_api_agents/gymnasium_agent/configs/gymnasium_agent.yaml @@ -0,0 +1,4 @@ +gymnasium_agent: + responses_api_agents: + gymnasium_agent: + entrypoint: app.py diff --git a/responses_api_agents/gymnasium_agent/requirements.txt b/responses_api_agents/gymnasium_agent/requirements.txt new file mode 100644 index 000000000..00ed83213 --- /dev/null +++ b/responses_api_agents/gymnasium_agent/requirements.txt @@ -0,0 +1 @@ +-e nemo-gym[dev] @ ../../ diff --git a/responses_api_agents/gymnasium_agent/tests/__init__.py b/responses_api_agents/gymnasium_agent/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/responses_api_agents/gymnasium_agent/tests/test_app.py b/responses_api_agents/gymnasium_agent/tests/test_app.py new file mode 100644 index 000000000..31606af36 --- /dev/null +++ b/responses_api_agents/gymnasium_agent/tests/test_app.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest.mock import MagicMock + +from nemo_gym.config_types import ModelServerRef, ResourcesServerRef +from nemo_gym.server_utils import ServerClient +from responses_api_agents.gymnasium_agent.app import GymnasiumAgent, GymnasiumAgentConfig + + +def _make_agent(): + config = GymnasiumAgentConfig( + host="", + port=0, + entrypoint="", + name="test_gymnasium_agent", + env_server=ResourcesServerRef(type="resources_servers", name="my_env"), + model_server=ModelServerRef(type="responses_api_models", name="policy_model"), + ) + return GymnasiumAgent(config=config, server_client=MagicMock(spec=ServerClient)) + + +class TestGymnasiumAgent: + def test_setup_webserver(self): + agent = _make_agent() + app = agent.setup_webserver() + routes = {r.path for r in app.routes} + assert "/run" in routes + assert "/v1/responses" in routes + assert "/aggregate_metrics" in routes + + def test_config(self): + agent = _make_agent() + assert agent.config.env_server.name == "my_env" + assert agent.config.model_server.name == "policy_model" + assert agent.config.max_steps == 10 From 8d01a30dc127a4f8aa44c9a2b9bd67927b416516 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Wed, 15 Apr 2026 01:00:10 -0700 Subject: [PATCH 02/16] docs Signed-off-by: cmunley1 --- docs/resources-server/gymnasium-api.md | 118 +++--------------- resources_servers/blackjack/README.md | 4 +- resources_servers/blackjack/app.py | 9 +- .../gymnasium_agent/README.md | 4 +- responses_api_agents/gymnasium_agent/app.py | 10 +- 5 files changed, 34 insertions(+), 111 deletions(-) diff --git a/docs/resources-server/gymnasium-api.md b/docs/resources-server/gymnasium-api.md index 0fbd1a383..9b69e52f2 100644 --- a/docs/resources-server/gymnasium-api.md +++ b/docs/resources-server/gymnasium-api.md @@ -1,10 +1,9 @@ (gymnasium-api)= -# Gymnasium API - +# Gymnasium API -`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, pair with `gymnasium_agent`. +`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, use with `gymnasium_agent`. ## Interface @@ -14,25 +13,29 @@ from nemo_gym.openai_utils import NeMoGymResponse class MyEnv(GymnasiumServer): async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: - # Called once at the start of each episode. - # Return (observation, info). + # Called once at the start of each episode to initial the environment + # Return (observation, info) # None observation = use responses_create_params.input as-is. return None, {} async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: - # Called after each model response. - # Return (observation, reward, terminated, truncated, info). - # observation: next prompt for the model, or None if episode is over. - # reward is only used when terminated=True. - # truncated=True means the episode hit the step limit. + # Called after each model response, executes any actions taken or implements other env logic ( tags, tool calls, user turns, or just single-step verification), computes rewards, determines if done. + # Return (observation, reward, terminated, truncated, info) + # observation: next message to model (tool result, rendering of state, user msg etc), or None if episode is over. + # reward: per-step reward, accumulated across all steps by the agent. + # terminated: True when the episode ends naturally (task solved, game over, etc). Tells the agent to stop. + # truncated: True when the episode is cut short (step limit, timeout, etc). + # info: arbitrary dict passed through to the final response. Use for diagnostics, scores, metadata. ... ``` -`metadata` contains the extra fields from `verifier_metadata` in your JSONL input. Access them via `metadata.get("field_name")`. +The main method to implement is step(). Implementing a custom `reset()` is optional to initialize environment state or return a message from the environment. The default implementation returns `(None, {})`. + +`metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, etc.) that the environment needs for initialization or scoring. Access fields via `metadata.get("field_name")`. -`session_id` is a unique string per rollout, used to store and retrieve per-episode state. Stateless environments can ignore it. +`session_id` is a unique string per rollout. Use it as a key to store per-episode state (e.g. game boards, conversation history) in a dict on your server. Stateless environments can ignore it. -`reset()` is optional. The default implementation returns `(None, {})`. +`action` is the full `NeMoGymResponse` from the model server. Your `step()` parses whatever it needs from it: text content, `` tags, function calls, etc. Use the `extract_text()` helper for text, or inspect `action.output` directly for structured output like tool calls. ## Single-step @@ -48,7 +51,7 @@ class MySingleStepEnv(GymnasiumServer): ## Multi-step with action tags -Multiple model calls per episode without tool calling. The model uses `` tags in its output, `step()` parses them and returns the next observation or terminates. +Multiple model calls per episode without native tool calling. The model uses `` tags in its output, `step()` parses them and returns the next observation or terminates. ```python import re @@ -80,7 +83,7 @@ class BlackjackEnv(GymnasiumServer): ## Tool-use -`action` is the full `NeMoGymResponse` from the model server, including any function calls already parsed by the model server's tool call parser. `step()` checks `action.output` for items with `type == "function_call"`, executes them, and returns the results as the next observation. Tool schemas go in `responses_create_params.tools` in your JSONL data so the model knows what tools are available. +For tool-calling environments, `step()` checks `action.output` for items with `type == "function_call"`, executes them, and returns the results as the next observation. Tool schemas go in `responses_create_params.tools` in your JSONL data so the model knows what tools are available. ```python import json @@ -133,83 +136,6 @@ class MyMultiTurnEnv(GymnasiumServer): return None, reward, True, False, {} ``` -## Multi-turn with user model - -Instead of scripted follow-ups, call a second LLM to generate user replies. The policy never sees the user model's internals. Configure the user model as a separate model server in YAML. - -```python -from nemo_gym.server_utils import get_response_json, raise_for_status - -class UserSimEnv(GymnasiumServer): - user_model_server: str = "user_model" - - async def step(self, action, metadata, session_id=None): - assistant_text = _extract_text(action) - resp = await self.server_client.post( - server_name=self.user_model_server, - url_path="/v1/responses", - json={"input": [ - {"role": "system", "content": metadata.get("user_persona", "You are a user.")}, - {"role": "assistant", "content": assistant_text}, - ]}, - ) - await raise_for_status(resp) - data = await get_response_json(resp) - user_reply = _extract_text_from_json(data) - - if is_done(user_reply, metadata): - reward = self._grade(action, metadata) - return None, reward, True, False, {} - - return user_reply, 0.0, False, False, {} -``` - -## Multi-turn with user model and tools - -The user model can also call tools that the policy model does not have access to. For example, the user model might query a database or check a schedule to formulate realistic follow-up questions. The policy only sees the resulting text, not the tool calls or their outputs. - -Configure the user model's tool schemas in the `step()` call, not in the policy's `responses_create_params.tools`. - -```python -class UserWithToolsEnv(GymnasiumServer): - user_model_server: str = "user_model" - session_state: dict = Field(default_factory=dict) - - async def reset(self, metadata, session_id=None): - self.session_state[session_id] = initialize_user_tools(metadata) - return None, {} - - async def step(self, action, metadata, session_id=None): - assistant_text = _extract_text(action) - state = self.session_state[session_id] - - # User model gets its own tool schemas (hidden from the policy) - resp = await self.server_client.post( - server_name=self.user_model_server, - url_path="/v1/responses", - json={ - "input": [ - {"role": "system", "content": state["user_system_prompt"]}, - {"role": "assistant", "content": assistant_text}, - ], - "tools": state["user_tools"], # only the user model sees these - }, - ) - await raise_for_status(resp) - data = await get_response_json(resp) - - # Execute any tool calls the user model made - user_reply = await self._run_user_tool_loop(data, state) - - if is_done(user_reply, metadata): - reward = self._grade(action, metadata) - return None, reward, True, False, {} - - return user_reply, 0.0, False, False, {} -``` - -The policy sees `user_reply` as a plain user message. It has no knowledge of the user model's tools, tool calls, or tool results. This enables asymmetric information between the policy and the simulated user. - ## LLM-as-judge Use `step()` to call a judge model via `self.server_client` and score the output. The judge model must be configured as a separate model server. @@ -279,10 +205,4 @@ my_gymnasium_agent_instance: ## Examples -Working implementations are in the repository: - -- `resources_servers/reasoning_gym_env/` -- single-step grading -- `resources_servers/workplace_assistant_env/` -- stateful tool dispatch -- `resources_servers/example_multi_turn_env/` -- scripted multi-turn -- `resources_servers/blackjack_env/` -- multi-step game with action tags -- `resources_servers/tictactoe_env/` -- adversarial game +- `resources_servers/blackjack/` -- multi-step game with action tags diff --git a/resources_servers/blackjack/README.md b/resources_servers/blackjack/README.md index e9068b519..961c4c2f0 100644 --- a/resources_servers/blackjack/README.md +++ b/resources_servers/blackjack/README.md @@ -1,6 +1,8 @@ # Blackjack Env -Multi-step gymnasium-style Model hits or stands using `` tags until the hand ends. Game state managed per session. +Multi-step gymnasium-style environment. + +Model hits or stands using `` tags until the hand ends. Game state managed per session. Example data provided in `data/example.jsonl` (system prompt only, no verifier_metadata needed). No train/validation data. diff --git a/resources_servers/blackjack/app.py b/resources_servers/blackjack/app.py index 551e4b891..049874492 100644 --- a/resources_servers/blackjack/app.py +++ b/resources_servers/blackjack/app.py @@ -13,12 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Blackjack environment. - -Multi-step episode: the model hits or stands until the hand ends. -Reward: +1 win, 0 draw, -1 loss/bust. +""" +Blackjack environment in Gymnasium API style. -No verifier_metadata required. The JSONL just needs a system prompt. +Multi-step: the model hits or stands until the hand ends. +Reward: +1 win, 0 draw, -1 loss. """ import random diff --git a/responses_api_agents/gymnasium_agent/README.md b/responses_api_agents/gymnasium_agent/README.md index ae3e3347a..a91a145eb 100644 --- a/responses_api_agents/gymnasium_agent/README.md +++ b/responses_api_agents/gymnasium_agent/README.md @@ -1,5 +1,5 @@ # Gymnasium Agent -Agent for `GymnasiumServer` resources servers. Drives the reset/step loop. +Agent for Gymansium-style environments based on `GymnasiumServer` resources servers. Drives the reset/step loop. -See `docs/resources-server/gymnasium-api.md` for usage. +See `docs/resources-server/gymnasium-api.md` for more details. \ No newline at end of file diff --git a/responses_api_agents/gymnasium_agent/app.py b/responses_api_agents/gymnasium_agent/app.py index dbd7a578f..a4e919f47 100644 --- a/responses_api_agents/gymnasium_agent/app.py +++ b/responses_api_agents/gymnasium_agent/app.py @@ -74,7 +74,7 @@ async def responses( async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> GymnasiumRunResponse: cookies = request.cookies - # --- reset --- + # call reset to initialize the environment reset_resp = await self.server_client.post( server_name=self.config.env_server.name, url_path="/reset", @@ -99,12 +99,13 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi ) all_outputs = [] + total_reward = 0.0 usage = None model_server_cookies = None step_data = EnvStepResponse(terminated=False, truncated=True, reward=0.0) last_model_response = None - # --- loop --- + # step until done for _ in range(self.config.max_steps): model_resp = await self.server_client.post( server_name=self.config.model_server.name, @@ -137,6 +138,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi ) await raise_for_status(step_resp) step_data = EnvStepResponse.model_validate(await get_response_json(step_resp)) + total_reward += step_data.reward cookies = step_resp.cookies if step_data.terminated or step_data.truncated: @@ -153,7 +155,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi } ) - else: # for/else: loop completed without break, meaning max_steps exhausted + else: # loop completed without break, meaning max_steps reached step_data = step_data.model_copy(update={"truncated": True}) last_model_response.output = all_outputs @@ -162,7 +164,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi return GymnasiumRunResponse( responses_create_params=body.responses_create_params, response=last_model_response, - reward=step_data.reward, + reward=total_reward, terminated=step_data.terminated, truncated=step_data.truncated, info=step_data.info, From 326654400bb7f14b550811e6efd86a9852a2c821 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Wed, 15 Apr 2026 01:27:43 -0700 Subject: [PATCH 03/16] updates Signed-off-by: cmunley1 --- README.md | 4 ++-- resources_servers/base_gymnasium/README.md | 2 +- resources_servers/base_gymnasium/configs/base_gymnasium.yaml | 1 + resources_servers/blackjack/README.md | 4 ++++ resources_servers/blackjack/configs/blackjack.yaml | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 02c0c5ffb..f6570fb18 100644 --- a/README.md +++ b/README.md @@ -158,8 +158,8 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace). | Arc Agi | knowledge | Solve puzzles designed to test intelligence. See https://arcprize.org/arc-agi. | Improve puzzle-solving capabilities. | - | ✓ | - | arc_agi.yaml | - | | Aviary | agent | Multi-hop question answering on the HotPotQA dataset with Wikipedia search | Improve knowledge and agentic capability | ✓ | ✓ | Apache 2.0 | hotpotqa_aviary.yaml | - | | Aviary | math | GSM8k benchmark with calculator tool | Test math and agentic capability | ✓ | ✓ | Apache 2.0 | gsm8k_aviary.yaml | - | -| Base Gymnasium | other | Base class for Gymnasium-style servers. Not a standalone server. | - | - | - | - | base_gymnasium.yaml | - | -| Blackjack | games | Blackjack. Model hits or stands. Reward +1 win, 0 draw, -1 loss/bust. | - | - | - | - | blackjack.yaml | - | +| Base Gymnasium | other | Base class for Gymnasium-style servers. Not a standalone server. | Reusable base class for step/reset style environments | - | - | - | base_gymnasium.yaml | - | +| Blackjack | games | Blackjack. Model hits or stands. Reward +1 win, 0 draw, -1 loss/bust. | Example gymnasium-style multi-step environment | - | - | - | blackjack.yaml | - | | Calendar | agent | Multi-turn calendar scheduling dataset. User states events and constraints in natural language; model schedules events to satisfy all constraints. | Improve multi-turn instruction following capabilities | ✓ | ✓ | Apache 2.0 | calendar.yaml | Nemotron-RL-agent-calendar_scheduling | | Calendar | agent | Multi-turn calendar scheduling dataset. User states events and constraints in natural language; model schedules events to satisfy all constraints. | Improve multi-turn instruction following capabilities | ✓ | ✓ | Creative Commons Attribution 4.0 International | calendar_v2.yaml | Nemotron-RL-Instruction-Following-Calendar-v2 | | Circle Click | other | Click on circles in images | - | - | - | - | circle_click.yaml | - | diff --git a/resources_servers/base_gymnasium/README.md b/resources_servers/base_gymnasium/README.md index c4fb04e3a..8007aa743 100644 --- a/resources_servers/base_gymnasium/README.md +++ b/resources_servers/base_gymnasium/README.md @@ -6,4 +6,4 @@ Base class (`GymnasiumServer`) for gymnasium-style resources servers. Not a stan from resources_servers.base_gymnasium import GymnasiumServer ``` -See `docs/resources-server/gymnasium-api.md` for usage. +See `docs/resources-server/gymnasium-api.md` for usage and `resources_servers/blackjack/` for a working example. diff --git a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml index 4c79b9044..a90b6a66f 100644 --- a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml +++ b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml @@ -5,6 +5,7 @@ base_gymnasium: domain: other verified: false description: Base class for Gymnasium-style servers. Not a standalone server. + value: Reusable base class for step/reset style environments base_gymnasium_agent: responses_api_agents: gymnasium_agent: diff --git a/resources_servers/blackjack/README.md b/resources_servers/blackjack/README.md index 961c4c2f0..54fbff84a 100644 --- a/resources_servers/blackjack/README.md +++ b/resources_servers/blackjack/README.md @@ -12,6 +12,10 @@ Example data provided in `data/example.jsonl` (system prompt only, no verifier_m ng_run "+config_paths=[resources_servers/blackjack/configs/blackjack.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" ``` +## Data + +Each game is generated on the fly during `reset()`, so every row in `example.jsonl` is identical. To create more data, duplicate the row. Each rollout gets a fresh random deal. Use `num_repeats` in the YAML config or the `+num_repeats` CLI flag to control how many games per row. + ## Collect rollouts ```bash diff --git a/resources_servers/blackjack/configs/blackjack.yaml b/resources_servers/blackjack/configs/blackjack.yaml index 36d5609e1..9d2e81131 100644 --- a/resources_servers/blackjack/configs/blackjack.yaml +++ b/resources_servers/blackjack/configs/blackjack.yaml @@ -5,6 +5,7 @@ blackjack: domain: games verified: false description: Blackjack. Model hits or stands. Reward +1 win, 0 draw, -1 loss/bust. + value: Example gymnasium-style multi-step environment blackjack_gymnasium_agent: responses_api_agents: gymnasium_agent: From 5b9680a670e352ad3301e952922d0939312835e0 Mon Sep 17 00:00:00 2001 From: Lawrence Lane Date: Wed, 15 Apr 2026 12:55:51 -0400 Subject: [PATCH 04/16] docs: style guide fixes for gymnasium-api.md (#1075) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Align `docs/resources-server/gymnasium-api.md` with the existing docs style guide patterns ## Changes - Add back navigation button and `---` horizontal rule separators between sections (matches all other doc pages) - Add cross-references to related docs (`{doc}` links to single-step tutorial, multi-step tutorial, LLM-as-judge) - Add `:::{tip}` directive pointing readers to the standard `SimpleResourcesServer` tutorials - Fix `_extract_text` → `extract_text` to match the actual export from `resources_servers.base_gymnasium` - Fix typo: "initial the environment" → "initialize the environment" - Fix hyphenation: "non agentic" → "non-agentic" - Fix double dash `--` → em-dash `—` in Examples section - Add GitHub source link for blackjack example (matches other doc pages) - Fix typo "Gymansium" → "Gymnasium" in `gymnasium_agent/README.md` ## Test plan - [ ] Verify doc builds without broken references (`{doc}` and `{ref}` links) - [ ] Visual check that section separators and back button render correctly 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Signed-off-by: Lawrence Lane Co-authored-by: Claude Opus 4.6 --- docs/resources-server/gymnasium-api.md | 74 +++++++++++++------ .../gymnasium_agent/README.md | 2 +- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/docs/resources-server/gymnasium-api.md b/docs/resources-server/gymnasium-api.md index 9b69e52f2..04532e303 100644 --- a/docs/resources-server/gymnasium-api.md +++ b/docs/resources-server/gymnasium-api.md @@ -3,7 +3,17 @@ # Gymnasium API -`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, use with `gymnasium_agent`. +`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, and pair with `gymnasium_agent`. + +:::{button-ref} index +:color: secondary +:outline: +:ref-type: doc + +< Back to Resources Server +::: + +--- ## Interface @@ -13,7 +23,7 @@ from nemo_gym.openai_utils import NeMoGymResponse class MyEnv(GymnasiumServer): async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: - # Called once at the start of each episode to initial the environment + # Called once at the start of each episode to initialize the environment # Return (observation, info) # None observation = use responses_create_params.input as-is. return None, {} @@ -21,7 +31,7 @@ class MyEnv(GymnasiumServer): async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: # Called after each model response, executes any actions taken or implements other env logic ( tags, tool calls, user turns, or just single-step verification), computes rewards, determines if done. # Return (observation, reward, terminated, truncated, info) - # observation: next message to model (tool result, rendering of state, user msg etc), or None if episode is over. + # observation: next message to model (tool result, rendering of state, user message, and so on), or None if episode is over. # reward: per-step reward, accumulated across all steps by the agent. # terminated: True when the episode ends naturally (task solved, game over, etc). Tells the agent to stop. # truncated: True when the episode is cut short (step limit, timeout, etc). @@ -29,27 +39,35 @@ class MyEnv(GymnasiumServer): ... ``` -The main method to implement is step(). Implementing a custom `reset()` is optional to initialize environment state or return a message from the environment. The default implementation returns `(None, {})`. +The main method to implement is `step()`. Implementing a custom `reset()` is optional — use it to initialize environment state or return an opening message from the environment. The default implementation returns `(None, {})`. -`metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, etc.) that the environment needs for initialization or scoring. Access fields via `metadata.get("field_name")`. +`metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, and so on) that the environment needs for initialization or scoring. Access fields with `metadata.get("field_name")`. -`session_id` is a unique string per rollout. Use it as a key to store per-episode state (e.g. game boards, conversation history) in a dict on your server. Stateless environments can ignore it. +`session_id` is a unique string per rollout. Use it as a key to store per-episode state (such as game boards or conversation history) in a dict on your server. Stateless environments can ignore it. -`action` is the full `NeMoGymResponse` from the model server. Your `step()` parses whatever it needs from it: text content, `` tags, function calls, etc. Use the `extract_text()` helper for text, or inspect `action.output` directly for structured output like tool calls. +`action` is the full `NeMoGymResponse` from the model server. Your `step()` parses whatever it needs from it: text content, `` tags, function calls, and so on. Use `extract_text()` for text, or inspect `action.output` directly for structured output like tool calls. -## Single-step +:::{tip} +For the full single-step and multi-step patterns using the standard `SimpleResourcesServer` API, see {doc}`/environment-tutorials/single-step-environment` and {doc}`/environment-tutorials/multi-step-environment`. +::: -Single-step environments are the common non agentic use case: one model call, then grade the output. Implement `step()` so it always returns `terminated=True`. +--- + +## Single-Step + +Single-step environments are the common non-agentic use case: one model call, then grade the output. Implement `step()` so it always returns `terminated=True`. ```python class MySingleStepEnv(GymnasiumServer): async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None): - response_text = _extract_text(action) + response_text = extract_text(action) reward = 1.0 if metadata.get("answer") in response_text else 0.0 return None, reward, True, False, {} ``` -## Multi-step with action tags +--- + +## Multi-Step with Action Tags Multiple model calls per episode without native tool calling. The model uses `` tags in its output, `step()` parses them and returns the next observation or terminates. @@ -66,7 +84,7 @@ class BlackjackEnv(GymnasiumServer): return f"Your hand: {hand}. hit or stand?", {} async def step(self, action, metadata, session_id=None): - text = _extract_text(action) + text = extract_text(action) m = re.search(r"\s*(hit|stand)\s*", text, re.IGNORECASE) decision = m.group(1).lower() if m else "stand" @@ -81,7 +99,9 @@ class BlackjackEnv(GymnasiumServer): return None, reward, True, False, {} ``` -## Tool-use +--- + +## Tool Use For tool-calling environments, `step()` checks `action.output` for items with `type == "function_call"`, executes them, and returns the results as the next observation. Tool schemas go in `responses_create_params.tools` in your JSONL data so the model knows what tools are available. @@ -112,7 +132,9 @@ class MyToolEnv(GymnasiumServer): return None, reward, True, False, {} ``` -## Multi-turn +--- + +## Multi-Turn `step()` returns the next user message as the observation. The `gymnasium_agent` appends it to the conversation and calls the model again. Return `None` to end. @@ -136,16 +158,18 @@ class MyMultiTurnEnv(GymnasiumServer): return None, reward, True, False, {} ``` -## LLM-as-judge +--- + +## LLM-as-Judge -Use `step()` to call a judge model via `self.server_client` and score the output. The judge model must be configured as a separate model server. +Use `step()` to call a judge model through `self.server_client` and score the output. The judge model must be configured as a separate model server. See {doc}`/environment-tutorials/llm-as-judge-verification` for the full pattern. ```python class MyJudgeEnv(GymnasiumServer): judge_server: str = "judge_model" # name of the model server in YAML async def step(self, action, metadata, session_id=None): - response_text = _extract_text(action) + response_text = extract_text(action) judge_input = f"Question: {metadata.get('question')}\nAnswer: {response_text}\nIs this correct? Say YES or NO." judge_resp = await self.server_client.post( server_name=self.judge_server, @@ -157,7 +181,9 @@ class MyJudgeEnv(GymnasiumServer): return None, reward, True, False, {} ``` -## Reward model +--- + +## Reward Model Same pattern. Call a reward model endpoint and use its score directly. @@ -169,15 +195,17 @@ class MyRewardModelEnv(GymnasiumServer): resp = await self.server_client.post( server_name=self.rm_server, url_path="/v1/score", - json={"input": metadata.get("prompt"), "response": _extract_text(action)}, + json={"input": metadata.get("prompt"), "response": extract_text(action)}, ) score = (await resp.json()).get("score", 0.0) return None, score, True, False, {} ``` -## YAML configuration +--- + +## YAML Configuration -`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. The agent config references the env server via `env_server`. +`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. The agent config references the env server through `env_server`. ```yaml my_env_instance: @@ -203,6 +231,8 @@ my_gymnasium_agent_instance: jsonl_fpath: resources_servers/my_env/data/example.jsonl ``` +--- + ## Examples -- `resources_servers/blackjack/` -- multi-step game with action tags +- [`blackjack`](https://github.com/NVIDIA-NeMo/Gym/tree/main/resources_servers/blackjack) — Multi-step game with action tags diff --git a/responses_api_agents/gymnasium_agent/README.md b/responses_api_agents/gymnasium_agent/README.md index a91a145eb..8b8409ed9 100644 --- a/responses_api_agents/gymnasium_agent/README.md +++ b/responses_api_agents/gymnasium_agent/README.md @@ -1,5 +1,5 @@ # Gymnasium Agent -Agent for Gymansium-style environments based on `GymnasiumServer` resources servers. Drives the reset/step loop. +Agent for Gymnasium-style environments based on `GymnasiumServer` resources servers. Drives the reset/step loop. See `docs/resources-server/gymnasium-api.md` for more details. \ No newline at end of file From c1319c740af73147676a3a0181d1e54a1fb5c84d Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Fri, 17 Apr 2026 00:25:56 -0700 Subject: [PATCH 05/16] move logic to base.py Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/__init__.py | 102 ++++-------------- resources_servers/base_gymnasium/base.py | 99 +++++++++++++++++ .../configs/base_gymnasium.yaml | 2 +- 3 files changed, 118 insertions(+), 85 deletions(-) create mode 100644 resources_servers/base_gymnasium/base.py diff --git a/resources_servers/base_gymnasium/__init__.py b/resources_servers/base_gymnasium/__init__.py index 99734b299..25779b972 100644 --- a/resources_servers/base_gymnasium/__init__.py +++ b/resources_servers/base_gymnasium/__init__.py @@ -13,87 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod -from typing import Optional - -from fastapi import FastAPI, Request -from pydantic import BaseModel, ConfigDict - -from nemo_gym.base_resources_server import BaseVerifyRequest, SimpleResourcesServer -from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming -from nemo_gym.server_utils import SESSION_ID_KEY - - -class EnvResetRequest(BaseModel): - model_config = ConfigDict(extra="allow") - responses_create_params: NeMoGymResponseCreateParamsNonStreaming - - -class EnvResetResponse(BaseModel): - observation: Optional[str] = None - info: dict = {} - - -class EnvStepRequest(BaseModel): - model_config = ConfigDict(extra="allow") - responses_create_params: NeMoGymResponseCreateParamsNonStreaming - response: NeMoGymResponse - - -class EnvStepResponse(BaseModel): - observation: Optional[str] = None - reward: float = 0.0 - terminated: bool = False - truncated: bool = False - info: dict = {} - - -def extract_text(response: NeMoGymResponse) -> str: - """Extract all text content from a NeMoGymResponse.""" - parts = [] - for item in response.output: - if item.type == "message": - content = item.content - if isinstance(content, str): - parts.append(content) - else: - for c in content: - if c.type == "output_text": - parts.append(c.text) - return "".join(parts) - - -class GymnasiumServer(SimpleResourcesServer): - """Gymnasium-style base class. Pair with gymnasium_agent. - - step() returns (observation, reward, terminated, truncated, info). - """ - - def setup_webserver(self) -> FastAPI: - app = FastAPI() - self.setup_session_middleware(app) - app.post("/reset")(self._reset_endpoint) - app.post("/step")(self._step_endpoint) - app.post("/aggregate_metrics")(self.aggregate_metrics) - return app - - async def _reset_endpoint(self, body: EnvResetRequest, request: Request) -> EnvResetResponse: - session_id = request.session.get(SESSION_ID_KEY) - obs, info = await self.reset(body.model_extra or {}, session_id) - return EnvResetResponse(observation=obs, info=info) - - async def _step_endpoint(self, body: EnvStepRequest, request: Request) -> EnvStepResponse: - session_id = request.session.get(SESSION_ID_KEY) - obs, reward, terminated, truncated, info = await self.step(body.response, body.model_extra or {}, session_id) - return EnvStepResponse(observation=obs, reward=reward, terminated=terminated, truncated=truncated, info=info) - - async def reset(self, metadata: dict, session_id: Optional[str] = None) -> tuple[Optional[str], dict]: - return None, {} - - @abstractmethod - async def step( - self, action: NeMoGymResponse, metadata: dict, session_id: Optional[str] = None - ) -> tuple[Optional[str], float, bool, bool, dict]: ... - - async def verify(self, body: BaseVerifyRequest) -> None: # type: ignore[override] - raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Pair with gymnasium_agent.") +from .base import ( + EnvResetRequest, + EnvResetResponse, + EnvStepRequest, + EnvStepResponse, + GymnasiumServer, + extract_text, +) + + +__all__ = [ + "EnvResetRequest", + "EnvResetResponse", + "EnvStepRequest", + "EnvStepResponse", + "GymnasiumServer", + "extract_text", +] diff --git a/resources_servers/base_gymnasium/base.py b/resources_servers/base_gymnasium/base.py new file mode 100644 index 000000000..99734b299 --- /dev/null +++ b/resources_servers/base_gymnasium/base.py @@ -0,0 +1,99 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Optional + +from fastapi import FastAPI, Request +from pydantic import BaseModel, ConfigDict + +from nemo_gym.base_resources_server import BaseVerifyRequest, SimpleResourcesServer +from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming +from nemo_gym.server_utils import SESSION_ID_KEY + + +class EnvResetRequest(BaseModel): + model_config = ConfigDict(extra="allow") + responses_create_params: NeMoGymResponseCreateParamsNonStreaming + + +class EnvResetResponse(BaseModel): + observation: Optional[str] = None + info: dict = {} + + +class EnvStepRequest(BaseModel): + model_config = ConfigDict(extra="allow") + responses_create_params: NeMoGymResponseCreateParamsNonStreaming + response: NeMoGymResponse + + +class EnvStepResponse(BaseModel): + observation: Optional[str] = None + reward: float = 0.0 + terminated: bool = False + truncated: bool = False + info: dict = {} + + +def extract_text(response: NeMoGymResponse) -> str: + """Extract all text content from a NeMoGymResponse.""" + parts = [] + for item in response.output: + if item.type == "message": + content = item.content + if isinstance(content, str): + parts.append(content) + else: + for c in content: + if c.type == "output_text": + parts.append(c.text) + return "".join(parts) + + +class GymnasiumServer(SimpleResourcesServer): + """Gymnasium-style base class. Pair with gymnasium_agent. + + step() returns (observation, reward, terminated, truncated, info). + """ + + def setup_webserver(self) -> FastAPI: + app = FastAPI() + self.setup_session_middleware(app) + app.post("/reset")(self._reset_endpoint) + app.post("/step")(self._step_endpoint) + app.post("/aggregate_metrics")(self.aggregate_metrics) + return app + + async def _reset_endpoint(self, body: EnvResetRequest, request: Request) -> EnvResetResponse: + session_id = request.session.get(SESSION_ID_KEY) + obs, info = await self.reset(body.model_extra or {}, session_id) + return EnvResetResponse(observation=obs, info=info) + + async def _step_endpoint(self, body: EnvStepRequest, request: Request) -> EnvStepResponse: + session_id = request.session.get(SESSION_ID_KEY) + obs, reward, terminated, truncated, info = await self.step(body.response, body.model_extra or {}, session_id) + return EnvStepResponse(observation=obs, reward=reward, terminated=terminated, truncated=truncated, info=info) + + async def reset(self, metadata: dict, session_id: Optional[str] = None) -> tuple[Optional[str], dict]: + return None, {} + + @abstractmethod + async def step( + self, action: NeMoGymResponse, metadata: dict, session_id: Optional[str] = None + ) -> tuple[Optional[str], float, bool, bool, dict]: ... + + async def verify(self, body: BaseVerifyRequest) -> None: # type: ignore[override] + raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Pair with gymnasium_agent.") diff --git a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml index a90b6a66f..2847ae1d1 100644 --- a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml +++ b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml @@ -1,7 +1,7 @@ base_gymnasium: resources_servers: base_gymnasium: - entrypoint: __init__.py + entrypoint: base.py domain: other verified: false description: Base class for Gymnasium-style servers. Not a standalone server. From 92a066a9f285cc559e555f33f1fc37856b7aa2f5 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Fri, 17 Apr 2026 00:29:26 -0700 Subject: [PATCH 06/16] readme Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources_servers/base_gymnasium/README.md b/resources_servers/base_gymnasium/README.md index 8007aa743..b87e71750 100644 --- a/resources_servers/base_gymnasium/README.md +++ b/resources_servers/base_gymnasium/README.md @@ -1,6 +1,6 @@ # Gymnasium -Base class (`GymnasiumServer`) for gymnasium-style resources servers. Not a standalone server. Import from here: +Base resources server (`GymnasiumServer`) for gymnasium-style environments. Not a standalone server. Import from here: ```python from resources_servers.base_gymnasium import GymnasiumServer From ae7f1723de8623b7808b9b10f349eda8bc4ed2e5 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Fri, 17 Apr 2026 00:47:29 -0700 Subject: [PATCH 07/16] nit Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources_servers/base_gymnasium/base.py b/resources_servers/base_gymnasium/base.py index 99734b299..05d8bda1d 100644 --- a/resources_servers/base_gymnasium/base.py +++ b/resources_servers/base_gymnasium/base.py @@ -96,4 +96,4 @@ async def step( ) -> tuple[Optional[str], float, bool, bool, dict]: ... async def verify(self, body: BaseVerifyRequest) -> None: # type: ignore[override] - raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Pair with gymnasium_agent.") + raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Use with gymnasium_agent.") From f0d5e2a0a3375cbc16ce7cb234f466c66a7c8b6e Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Mon, 20 Apr 2026 16:02:48 -0700 Subject: [PATCH 08/16] session state in base class wit hcleanup, accumulate tools proprly, parse stricter, validate max steps, more tests Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/base.py | 11 +- .../base_gymnasium/tests/test_app.py | 155 +++++++++++-- resources_servers/blackjack/app.py | 25 +- resources_servers/blackjack/tests/test_app.py | 154 ++++++++++++- responses_api_agents/gymnasium_agent/app.py | 48 ++-- .../gymnasium_agent/tests/test_app.py | 216 ++++++++++++++++-- 6 files changed, 531 insertions(+), 78 deletions(-) diff --git a/resources_servers/base_gymnasium/base.py b/resources_servers/base_gymnasium/base.py index 05d8bda1d..50117d40d 100644 --- a/resources_servers/base_gymnasium/base.py +++ b/resources_servers/base_gymnasium/base.py @@ -14,10 +14,10 @@ # limitations under the License. from abc import abstractmethod -from typing import Optional +from typing import Any, Dict, Optional from fastapi import FastAPI, Request -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, Field from nemo_gym.base_resources_server import BaseVerifyRequest, SimpleResourcesServer from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming @@ -69,6 +69,8 @@ class GymnasiumServer(SimpleResourcesServer): step() returns (observation, reward, terminated, truncated, info). """ + session_state: Dict[str, Any] = Field(default_factory=dict) + def setup_webserver(self) -> FastAPI: app = FastAPI() self.setup_session_middleware(app) @@ -85,6 +87,8 @@ async def _reset_endpoint(self, body: EnvResetRequest, request: Request) -> EnvR async def _step_endpoint(self, body: EnvStepRequest, request: Request) -> EnvStepResponse: session_id = request.session.get(SESSION_ID_KEY) obs, reward, terminated, truncated, info = await self.step(body.response, body.model_extra or {}, session_id) + if terminated or truncated: + await self.close_session(session_id) return EnvStepResponse(observation=obs, reward=reward, terminated=terminated, truncated=truncated, info=info) async def reset(self, metadata: dict, session_id: Optional[str] = None) -> tuple[Optional[str], dict]: @@ -95,5 +99,8 @@ async def step( self, action: NeMoGymResponse, metadata: dict, session_id: Optional[str] = None ) -> tuple[Optional[str], float, bool, bool, dict]: ... + async def close_session(self, session_id: Optional[str]) -> None: + self.session_state.pop(session_id, None) + async def verify(self, body: BaseVerifyRequest) -> None: # type: ignore[override] raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Use with gymnasium_agent.") diff --git a/resources_servers/base_gymnasium/tests/test_app.py b/resources_servers/base_gymnasium/tests/test_app.py index a2f0677e9..31b9bde10 100644 --- a/resources_servers/base_gymnasium/tests/test_app.py +++ b/resources_servers/base_gymnasium/tests/test_app.py @@ -12,22 +12,151 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from types import SimpleNamespace from unittest.mock import MagicMock +import pytest + from nemo_gym.base_resources_server import BaseResourcesServerConfig -from nemo_gym.server_utils import ServerClient -from resources_servers.base_gymnasium import GymnasiumServer +from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseOutputMessage, NeMoGymResponseOutputText +from nemo_gym.server_utils import SESSION_ID_KEY, ServerClient +from resources_servers.base_gymnasium import EnvResetRequest, EnvStepRequest, GymnasiumServer, extract_text + + +def _make_response(*parts: str) -> NeMoGymResponse: + return NeMoGymResponse( + id="r", + created_at=0.0, + model="m", + object="response", + output=[ + NeMoGymResponseOutputMessage( + id=f"msg_{i}", + content=[NeMoGymResponseOutputText(annotations=[], text=p, type="output_text")], + role="assistant", + status="completed", + type="message", + ) + for i, p in enumerate(parts) + ], + parallel_tool_calls=True, + tool_choice="auto", + tools=[], + ) + + +class _FakeRequest: + def __init__(self, session_id="sid-1"): + self.session = {SESSION_ID_KEY: session_id} + + +class _TerminatingEnv(GymnasiumServer): + async def step(self, action, metadata, session_id=None): + return None, 1.0, True, False, {} + + +class _OngoingEnv(GymnasiumServer): + async def step(self, action, metadata, session_id=None): + return "keep going", 0.0, False, False, {} + + +class _TruncatingEnv(GymnasiumServer): + async def step(self, action, metadata, session_id=None): + return None, 0.0, False, True, {} + + +_close_log: list = [] + + +class _CustomCloseEnv(GymnasiumServer): + async def step(self, action, metadata, session_id=None): + return None, 0.0, True, False, {} + + async def close_session(self, session_id): + _close_log.append(session_id) + await super().close_session(session_id) + + +def _make_env(cls): + config = BaseResourcesServerConfig(host="", port=0, entrypoint="", name="") + return cls(config=config, server_client=MagicMock(spec=ServerClient)) class TestGymnasiumServer: - def test_sanity(self) -> None: - class ConcreteEnv(GymnasiumServer): - async def step(self, action, metadata, session_id=None): - return None, 1.0, True, False, {} - - config = BaseResourcesServerConfig(host="", port=0, entrypoint="", name="") - env = ConcreteEnv(config=config, server_client=MagicMock(spec=ServerClient)) - app = env.setup_webserver() - routes = {r.path for r in app.routes} - assert "/reset" in routes - assert "/step" in routes + def test_routes_registered(self): + env = _make_env(_TerminatingEnv) + routes = {r.path for r in env.setup_webserver().routes} + assert {"/reset", "/step", "/aggregate_metrics"}.issubset(routes) + + def test_verify_raises(self): + env = _make_env(_TerminatingEnv) + with pytest.raises(NotImplementedError): + import asyncio + + asyncio.run(env.verify(SimpleNamespace())) + + @pytest.mark.asyncio + async def test_reset_default_returns_empty(self): + env = _make_env(_TerminatingEnv) + env.session_state["sid-1"] = {"x": 1} + body = EnvResetRequest(responses_create_params={"input": []}) + resp = await env._reset_endpoint(body, _FakeRequest()) + assert resp.observation is None + assert resp.info == {} + + @pytest.mark.asyncio + async def test_step_pops_on_terminated(self): + env = _make_env(_TerminatingEnv) + env.session_state["sid-1"] = {"x": 1} + body = EnvStepRequest(responses_create_params={"input": []}, response=_make_response("x")) + resp = await env._step_endpoint(body, _FakeRequest("sid-1")) + assert resp.terminated is True + assert "sid-1" not in env.session_state + + @pytest.mark.asyncio + async def test_step_pops_on_truncated(self): + env = _make_env(_TruncatingEnv) + env.session_state["sid-1"] = {"x": 1} + body = EnvStepRequest(responses_create_params={"input": []}, response=_make_response("x")) + resp = await env._step_endpoint(body, _FakeRequest("sid-1")) + assert resp.truncated is True + assert "sid-1" not in env.session_state + + @pytest.mark.asyncio + async def test_step_keeps_state_when_ongoing(self): + env = _make_env(_OngoingEnv) + env.session_state["sid-1"] = {"x": 1} + body = EnvStepRequest(responses_create_params={"input": []}, response=_make_response("x")) + resp = await env._step_endpoint(body, _FakeRequest("sid-1")) + assert resp.terminated is False + assert resp.truncated is False + assert "sid-1" in env.session_state + + @pytest.mark.asyncio + async def test_close_session_override_invoked(self): + _close_log.clear() + env = _make_env(_CustomCloseEnv) + env.session_state["sid-1"] = {"x": 1} + body = EnvStepRequest(responses_create_params={"input": []}, response=_make_response("x")) + await env._step_endpoint(body, _FakeRequest("sid-1")) + assert _close_log == ["sid-1"] + assert "sid-1" not in env.session_state + + +class TestExtractText: + def test_concats_output_text(self): + r = _make_response("hello ", "world") + assert extract_text(r) == "hello world" + + def test_empty_output(self): + r = NeMoGymResponse( + id="r", + created_at=0.0, + model="m", + object="response", + output=[], + parallel_tool_calls=True, + tool_choice="auto", + tools=[], + ) + assert extract_text(r) == "" diff --git a/resources_servers/blackjack/app.py b/resources_servers/blackjack/app.py index 049874492..d2f33af3f 100644 --- a/resources_servers/blackjack/app.py +++ b/resources_servers/blackjack/app.py @@ -22,9 +22,7 @@ import random import re -from typing import Dict, Optional - -from pydantic import Field +from typing import Optional from nemo_gym.openai_utils import NeMoGymResponse from resources_servers.base_gymnasium import GymnasiumServer, extract_text @@ -33,8 +31,8 @@ _RANKS = ["2", "3", "4", "5", "6", "7", "8", "9", "10", "J", "Q", "K", "A"] -def _deal(): - return random.choice(_RANKS) +def _deal(rng: random.Random) -> str: + return rng.choice(_RANKS) def _hand_value(hand: list[str]) -> int: @@ -51,12 +49,11 @@ def _fmt(hand: list[str]) -> str: class BlackjackEnv(GymnasiumServer): - session_state: Dict[str, dict] = Field(default_factory=dict) - async def reset(self, metadata: dict, session_id: Optional[str] = None) -> tuple[Optional[str], dict]: - player = [_deal(), _deal()] - dealer = [_deal(), _deal()] - self.session_state[session_id] = {"player": player, "dealer": dealer} + rng = random.Random() + player = [_deal(rng), _deal(rng)] + dealer = [_deal(rng), _deal(rng)] + self.session_state[session_id] = {"player": player, "dealer": dealer, "rng": rng} obs = ( f"Your hand: {_fmt(player)} = {_hand_value(player)}\n" f"Dealer shows: {dealer[0]}\n" @@ -70,12 +67,13 @@ async def step( state = self.session_state.get(session_id, {}) player = state.get("player", []) dealer = state.get("dealer", []) + rng = state.get("rng") or random.Random() text = extract_text(action) m = re.search(r"\s*(hit|stand)\s*", text, re.IGNORECASE) - decision = m.group(1).lower() if m else ("hit" if "hit" in text.lower() else "stand") + decision = m.group(1).lower() if m else "stand" if decision == "hit": - player.append(_deal()) + player.append(_deal(rng)) val = _hand_value(player) if val > 21: return None, -1.0, True, False, {"result": "bust", "player": _fmt(player), "value": val} @@ -86,9 +84,8 @@ async def step( ) return obs, 0.0, False, False, {} - # stand: dealer plays out while _hand_value(dealer) < 17: - dealer.append(_deal()) + dealer.append(_deal(rng)) pv, dv = _hand_value(player), _hand_value(dealer) if dv > 21 or pv > dv: diff --git a/resources_servers/blackjack/tests/test_app.py b/resources_servers/blackjack/tests/test_app.py index 4187b1544..2597b72d6 100644 --- a/resources_servers/blackjack/tests/test_app.py +++ b/resources_servers/blackjack/tests/test_app.py @@ -14,12 +14,156 @@ # limitations under the License. from unittest.mock import MagicMock +import pytest + from nemo_gym.base_resources_server import BaseResourcesServerConfig +from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseOutputMessage, NeMoGymResponseOutputText from nemo_gym.server_utils import ServerClient -from resources_servers.blackjack.app import BlackjackEnv +from resources_servers.blackjack.app import BlackjackEnv, _hand_value + + +def _make_env(): + config = BaseResourcesServerConfig(host="", port=0, entrypoint="", name="") + return BlackjackEnv(config=config, server_client=MagicMock(spec=ServerClient)) + + +def _response(text: str) -> NeMoGymResponse: + return NeMoGymResponse( + id="r", + created_at=0.0, + model="m", + object="response", + output=[ + NeMoGymResponseOutputMessage( + id="msg", + content=[NeMoGymResponseOutputText(annotations=[], text=text, type="output_text")], + role="assistant", + status="completed", + type="message", + ) + ], + parallel_tool_calls=True, + tool_choice="auto", + tools=[], + ) + + +class TestHandValue: + def test_basic(self): + assert _hand_value(["5", "7"]) == 12 + + def test_face_cards(self): + assert _hand_value(["K", "Q"]) == 20 + + def test_ace_low_when_busting(self): + assert _hand_value(["A", "K", "5"]) == 16 + + def test_ace_high_when_safe(self): + assert _hand_value(["A", "9"]) == 20 + + def test_two_aces(self): + assert _hand_value(["A", "A"]) == 12 + + +class TestReset: + @pytest.mark.asyncio + async def test_reset_populates_state(self): + env = _make_env() + obs, info = await env.reset({}, session_id="sid") + assert "sid" in env.session_state + state = env.session_state["sid"] + assert len(state["player"]) == 2 + assert len(state["dealer"]) == 2 + assert "rng" in state + assert "Your hand" in obs + assert "hit" in obs + + @pytest.mark.asyncio + async def test_per_session_rng_is_isolated(self): + # Two sessions get distinct RNG instances (not the module default). + env = _make_env() + await env.reset({}, session_id="a") + await env.reset({}, session_id="b") + assert env.session_state["a"]["rng"] is not env.session_state["b"]["rng"] + + +class TestStep: + @pytest.mark.asyncio + async def test_stand_finishes_game(self): + env = _make_env() + # Preload a known safe state so the result is deterministic. + env.session_state["sid"] = { + "player": ["K", "9"], # 19 + "dealer": ["10"], # dealer will draw until ≥ 17 + "rng": __import__("random").Random(0), + } + _, reward, term, trunc, info = await env.step(_response("stand"), {}, session_id="sid") + assert term is True + assert trunc is False + assert reward in (-1.0, 0.0, 1.0) + assert info["result"] in ("win", "draw", "loss") + + @pytest.mark.asyncio + async def test_hit_bust_ends_game(self): + env = _make_env() + # Force a bust on the next draw regardless of RNG: player already at 20, drawing any card except A busts. + # Use a seeded RNG that deals non-A. + rng = __import__("random").Random(0) + env.session_state["sid"] = {"player": ["K", "K"], "dealer": ["5"], "rng": rng} + _, reward, term, _, info = await env.step(_response("hit"), {}, session_id="sid") + # Player at 20 + any non-ace → bust. With rng seed 0, the draw is deterministic. + if term: + assert reward == -1.0 + assert info["result"] == "bust" + + @pytest.mark.asyncio + async def test_hit_continues_when_safe(self): + env = _make_env() + env.session_state["sid"] = { + "player": ["5", "3"], # 8 + "dealer": ["10"], + "rng": __import__("random").Random(0), + } + obs, reward, term, _, _ = await env.step(_response("hit"), {}, session_id="sid") + assert term is False + assert reward == 0.0 + assert "Your hand" in obs + + +class TestActionParser: + @pytest.mark.asyncio + async def _decide(self, text: str) -> str: + env = _make_env() + env.session_state["sid"] = { + "player": ["2", "2"], # 4 — can't bust on hit + "dealer": ["10"], + "rng": __import__("random").Random(0), + } + obs, _, term, _, info = await env.step(_response(text), {}, session_id="sid") + # Terminal → stood; non-terminal → hit. + return "stand" if term else "hit" + + @pytest.mark.asyncio + async def test_hit_tag(self): + assert await self._decide("hit") == "hit" + + @pytest.mark.asyncio + async def test_stand_tag(self): + assert await self._decide("stand") == "stand" + + @pytest.mark.asyncio + async def test_whitespace_tolerated(self): + assert await self._decide(" hit ") == "hit" + + @pytest.mark.asyncio + async def test_case_insensitive(self): + assert await self._decide("HIT") == "hit" + @pytest.mark.asyncio + async def test_no_tag_defaults_stand(self): + # Previously the fallback did a substring match, so "don't hit" would parse as hit. + assert await self._decide("i don't know, maybe hit?") == "stand" -class TestApp: - def test_sanity(self) -> None: - config = BaseResourcesServerConfig(host="", port=0, entrypoint="", name="") - BlackjackEnv(config=config, server_client=MagicMock(spec=ServerClient)) + @pytest.mark.asyncio + async def test_unknown_action_defaults_stand(self): + assert await self._decide("fold") == "stand" diff --git a/responses_api_agents/gymnasium_agent/app.py b/responses_api_agents/gymnasium_agent/app.py index a4e919f47..39982005d 100644 --- a/responses_api_agents/gymnasium_agent/app.py +++ b/responses_api_agents/gymnasium_agent/app.py @@ -16,7 +16,7 @@ """Agent for GymnasiumServer resources servers (resources_servers.base_gymnasium) which implements the Gymnasium API.""" from fastapi import Body, Request, Response -from pydantic import ConfigDict +from pydantic import ConfigDict, Field from nemo_gym.base_resources_server import ( BaseRunRequest, @@ -30,13 +30,13 @@ NeMoGymResponseCreateParamsNonStreaming, ) from nemo_gym.server_utils import get_response_json, raise_for_status -from resources_servers.base_gymnasium import EnvResetResponse, EnvStepResponse, extract_text +from resources_servers.base_gymnasium import EnvResetResponse, EnvStepResponse class GymnasiumAgentConfig(BaseResponsesAPIAgentConfig): env_server: ResourcesServerRef model_server: ModelServerRef - max_steps: int = 10 + max_steps: int = Field(10, ge=1) class GymnasiumAgentRunRequest(BaseRunRequest): @@ -85,32 +85,28 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi reset_data = EnvResetResponse.model_validate(await get_response_json(reset_resp)) cookies = reset_resp.cookies - current_input = body.responses_create_params.model_copy(deep=True) - if isinstance(current_input.input, str): - current_input = current_input.model_copy( - update={"input": [NeMoGymEasyInputMessage(role="user", content=current_input.input)]} - ) + base_body = body.responses_create_params.model_copy(deep=True) + if isinstance(base_body.input, str): + base_body.input = [NeMoGymEasyInputMessage(role="user", content=base_body.input)] if reset_data.observation: - current_input = current_input.model_copy( - update={ - "input": list(current_input.input) - + [NeMoGymEasyInputMessage(role="user", content=reset_data.observation)] - } - ) + base_body.input = list(base_body.input) + [ + NeMoGymEasyInputMessage(role="user", content=reset_data.observation) + ] - all_outputs = [] + new_outputs = [] total_reward = 0.0 usage = None model_server_cookies = None step_data = EnvStepResponse(terminated=False, truncated=True, reward=0.0) last_model_response = None - # step until done for _ in range(self.config.max_steps): + new_body = base_body.model_copy(update={"input": base_body.input + new_outputs}) + model_resp = await self.server_client.post( server_name=self.config.model_server.name, url_path="/v1/responses", - json=current_input, + json=new_body, cookies=model_server_cookies, ) await raise_for_status(model_resp) @@ -118,7 +114,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi model_server_cookies = model_resp.cookies last_model_response = model_response - all_outputs.extend(model_response.output) + new_outputs.extend(model_response.output) if model_response.usage: if usage is None: @@ -145,20 +141,12 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi break if step_data.observation: - current_input = current_input.model_copy( - update={ - "input": list(current_input.input) - + [ - NeMoGymEasyInputMessage(role="assistant", content=extract_text(model_response)), - NeMoGymEasyInputMessage(role="user", content=step_data.observation), - ] - } - ) - - else: # loop completed without break, meaning max_steps reached + new_outputs.append(NeMoGymEasyInputMessage(role="user", content=step_data.observation)) + + else: step_data = step_data.model_copy(update={"truncated": True}) - last_model_response.output = all_outputs + last_model_response.output = new_outputs last_model_response.usage = usage return GymnasiumRunResponse( diff --git a/responses_api_agents/gymnasium_agent/tests/test_app.py b/responses_api_agents/gymnasium_agent/tests/test_app.py index 31606af36..d2284ac7e 100644 --- a/responses_api_agents/gymnasium_agent/tests/test_app.py +++ b/responses_api_agents/gymnasium_agent/tests/test_app.py @@ -12,15 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import json +from unittest.mock import AsyncMock, MagicMock -from unittest.mock import MagicMock +import pytest from nemo_gym.config_types import ModelServerRef, ResourcesServerRef from nemo_gym.server_utils import ServerClient -from responses_api_agents.gymnasium_agent.app import GymnasiumAgent, GymnasiumAgentConfig +from responses_api_agents.gymnasium_agent.app import GymnasiumAgent, GymnasiumAgentConfig, GymnasiumAgentRunRequest -def _make_agent(): +def _make_agent(max_steps=10): config = GymnasiumAgentConfig( host="", port=0, @@ -28,21 +30,207 @@ def _make_agent(): name="test_gymnasium_agent", env_server=ResourcesServerRef(type="resources_servers", name="my_env"), model_server=ModelServerRef(type="responses_api_models", name="policy_model"), + max_steps=max_steps, ) return GymnasiumAgent(config=config, server_client=MagicMock(spec=ServerClient)) -class TestGymnasiumAgent: - def test_setup_webserver(self): - agent = _make_agent() - app = agent.setup_webserver() +def _model_response(text: str, input_toks=1, output_toks=1) -> dict: + return { + "id": "r", + "created_at": 0.0, + "model": "m", + "object": "response", + "output": [ + { + "id": "msg", + "content": [{"annotations": [], "text": text, "type": "output_text"}], + "role": "assistant", + "status": "completed", + "type": "message", + } + ], + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "usage": { + "input_tokens": input_toks, + "input_tokens_details": {"cached_tokens": 0}, + "output_tokens": output_toks, + "output_tokens_details": {"reasoning_tokens": 0}, + "total_tokens": input_toks + output_toks, + }, + } + + +class _FakeHttpResp: + def __init__(self, payload: dict): + self._payload = payload + self.cookies = {} + self.status = 200 + self.ok = True + + async def json(self): + return self._payload + + async def read(self): + return json.dumps(self._payload).encode() + + @property + def content(self): + class _Body: + async def read(inner): + return json.dumps(self._payload).encode() + + return _Body() + + def raise_for_status(self): + return None + + +def _wire_mock_client(agent, responses_per_url): + """Wire agent.server_client.post to return payloads keyed by url_path.""" + call_log = [] + + async def _post(server_name, url_path, json=None, cookies=None, **kw): + call_log.append((server_name, url_path, json)) + payload = responses_per_url[url_path].pop(0) + return _FakeHttpResp(payload) + + agent.server_client.post = AsyncMock(side_effect=_post) + return call_log + + +class TestRoutes: + def test_routes_registered(self): + app = _make_agent().setup_webserver() routes = {r.path for r in app.routes} - assert "/run" in routes - assert "/v1/responses" in routes - assert "/aggregate_metrics" in routes + assert {"/run", "/v1/responses", "/aggregate_metrics"}.issubset(routes) + + +class TestConfig: + def test_max_steps_validator_rejects_zero(self): + with pytest.raises(Exception): + GymnasiumAgentConfig( + host="", + port=0, + entrypoint="", + name="x", + env_server=ResourcesServerRef(type="resources_servers", name="e"), + model_server=ModelServerRef(type="responses_api_models", name="m"), + max_steps=0, + ) - def test_config(self): + def test_default_max_steps(self): + assert _make_agent().config.max_steps == 10 + + +class TestRun: + @pytest.mark.asyncio + async def test_terminates_on_first_step(self): agent = _make_agent() - assert agent.config.env_server.name == "my_env" - assert agent.config.model_server.name == "policy_model" - assert agent.config.max_steps == 10 + call_log = _wire_mock_client( + agent, + { + "/reset": [{"observation": "go", "info": {}}], + "/v1/responses": [_model_response("move A")], + "/step": [{"observation": None, "reward": 1.0, "terminated": True, "truncated": False, "info": {}}], + }, + ) + req = MagicMock() + req.cookies = {} + body = GymnasiumAgentRunRequest(responses_create_params={"input": [{"role": "user", "content": "play"}]}) + result = await agent.run(req, body) + assert result.terminated is True + assert result.reward == 1.0 + # reset + exactly 1 model call + 1 step + urls = [u for (_s, u, _) in call_log] + assert urls.count("/reset") == 1 + assert urls.count("/v1/responses") == 1 + assert urls.count("/step") == 1 + + @pytest.mark.asyncio + async def test_multi_step_preserves_output_items_in_history(self): + agent = _make_agent(max_steps=3) + call_log = _wire_mock_client( + agent, + { + "/reset": [{"observation": "start", "info": {}}], + "/v1/responses": [ + _model_response("turn-1", output_toks=10), + _model_response("turn-2", output_toks=20), + ], + "/step": [ + {"observation": "obs-1", "reward": 0.5, "terminated": False, "truncated": False, "info": {}}, + {"observation": None, "reward": 0.5, "terminated": True, "truncated": False, "info": {}}, + ], + }, + ) + req = MagicMock() + req.cookies = {} + body = GymnasiumAgentRunRequest(responses_create_params={"input": [{"role": "user", "content": "play"}]}) + result = await agent.run(req, body) + assert result.reward == 1.0 + assert result.terminated is True + # Inspect turn-2 model call body — its input must contain the full turn-1 output item, + # not a flattened string, and the obs-1 appended as user message. + turn2_body = [body for (s, u, body) in call_log if u == "/v1/responses"][1] + turn2_input = turn2_body.input + # turn-1 full output item preserved (with structured content list) + assistant_items = [m for m in turn2_input if getattr(m, "role", None) == "assistant"] + assert any( + isinstance(getattr(m, "content", None), list) + and any( + getattr(c, "type", None) == "output_text" and getattr(c, "text", "") == "turn-1" for c in m.content + ) + for m in assistant_items + ), f"turn-1 output not preserved in structured form: {assistant_items}" + # obs-1 appended as a user message after turn-1 + assert any(getattr(m, "role", None) == "user" and getattr(m, "content", "") == "obs-1" for m in turn2_input) + + @pytest.mark.asyncio + async def test_max_steps_sets_truncated(self): + agent = _make_agent(max_steps=2) + _wire_mock_client( + agent, + { + "/reset": [{"observation": None, "info": {}}], + "/v1/responses": [_model_response("a"), _model_response("b")], + "/step": [ + {"observation": "obs-1", "reward": 0.0, "terminated": False, "truncated": False, "info": {}}, + {"observation": "obs-2", "reward": 0.0, "terminated": False, "truncated": False, "info": {}}, + ], + }, + ) + req = MagicMock() + req.cookies = {} + body = GymnasiumAgentRunRequest(responses_create_params={"input": [{"role": "user", "content": "x"}]}) + result = await agent.run(req, body) + assert result.truncated is True + assert result.terminated is False + + @pytest.mark.asyncio + async def test_usage_accumulates_across_turns(self): + agent = _make_agent(max_steps=3) + _wire_mock_client( + agent, + { + "/reset": [{"observation": None, "info": {}}], + "/v1/responses": [ + _model_response("a", input_toks=5, output_toks=7), + _model_response("b", input_toks=11, output_toks=13), + ], + "/step": [ + {"observation": "o", "reward": 0.0, "terminated": False, "truncated": False, "info": {}}, + {"observation": None, "reward": 0.0, "terminated": True, "truncated": False, "info": {}}, + ], + }, + ) + req = MagicMock() + req.cookies = {} + body = GymnasiumAgentRunRequest(responses_create_params={"input": [{"role": "user", "content": "x"}]}) + result = await agent.run(req, body) + # usage summed across both turns + assert result.response.usage.input_tokens == 16 + assert result.response.usage.output_tokens == 20 + assert result.response.usage.total_tokens == 36 From b1e0f82393cd66912df63eaf2b7f94b375583a40 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Mon, 20 Apr 2026 16:09:30 -0700 Subject: [PATCH 09/16] doc Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources_servers/base_gymnasium/base.py b/resources_servers/base_gymnasium/base.py index 50117d40d..b0e898728 100644 --- a/resources_servers/base_gymnasium/base.py +++ b/resources_servers/base_gymnasium/base.py @@ -64,7 +64,7 @@ def extract_text(response: NeMoGymResponse) -> str: class GymnasiumServer(SimpleResourcesServer): - """Gymnasium-style base class. Pair with gymnasium_agent. + """Gymnasium-style base class. Used with gymnasium_agent. step() returns (observation, reward, terminated, truncated, info). """ From 83ef22140c7828e0e4e5e7454b233b1568a20a47 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 10:52:53 -0700 Subject: [PATCH 10/16] docs Signed-off-by: cmunley1 --- responses_api_agents/gymnasium_agent/app.py | 25 +++++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/responses_api_agents/gymnasium_agent/app.py b/responses_api_agents/gymnasium_agent/app.py index 39982005d..aa75799b4 100644 --- a/responses_api_agents/gymnasium_agent/app.py +++ b/responses_api_agents/gymnasium_agent/app.py @@ -26,6 +26,7 @@ from nemo_gym.config_types import AggregateMetrics, AggregateMetricsRequest, ModelServerRef, ResourcesServerRef from nemo_gym.openai_utils import ( NeMoGymEasyInputMessage, + NeMoGymFunctionCallOutput, NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming, ) @@ -72,18 +73,17 @@ async def responses( return result async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> GymnasiumRunResponse: - cookies = request.cookies + env_cookies = request.cookies - # call reset to initialize the environment reset_resp = await self.server_client.post( server_name=self.config.env_server.name, url_path="/reset", json=body.model_dump(), - cookies=cookies, + cookies=env_cookies, ) await raise_for_status(reset_resp) reset_data = EnvResetResponse.model_validate(await get_response_json(reset_resp)) - cookies = reset_resp.cookies + env_cookies = reset_resp.cookies base_body = body.responses_create_params.model_copy(deep=True) if isinstance(base_body.input, str): @@ -99,6 +99,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi model_server_cookies = None step_data = EnvStepResponse(terminated=False, truncated=True, reward=0.0) last_model_response = None + finished = False for _ in range(self.config.max_steps): new_body = base_body.model_copy(update={"input": base_body.input + new_outputs}) @@ -130,20 +131,30 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi server_name=self.config.env_server.name, url_path="/step", json=body.model_dump() | {"response": model_response.model_dump()}, - cookies=cookies, + cookies=env_cookies, ) await raise_for_status(step_resp) step_data = EnvStepResponse.model_validate(await get_response_json(step_resp)) total_reward += step_data.reward - cookies = step_resp.cookies + env_cookies = step_resp.cookies if step_data.terminated or step_data.truncated: + finished = True break + for tool_output in (step_data.info or {}).get("tool_outputs", []): + new_outputs.append( + NeMoGymFunctionCallOutput( + type="function_call_output", + call_id=tool_output["call_id"], + output=tool_output["output"], + ) + ) + if step_data.observation: new_outputs.append(NeMoGymEasyInputMessage(role="user", content=step_data.observation)) - else: + if not finished: step_data = step_data.model_copy(update={"truncated": True}) last_model_response.output = new_outputs From 6b0d4bb0d4e41450f4d443fa8b5be01cc1aff76e Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 11:24:37 -0700 Subject: [PATCH 11/16] readme Signed-off-by: cmunley1 --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 724086cc0..358ad32eb 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,7 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace). | Vlm Eval Kit | other | - | Measure VLM capabilities | - | ✓ | - | OCRBench.yaml | - | | Vlm Eval Kit | other | Run all supported VLMEvalKit benchmarks. | Measure VLM capabilities | - | ✓ | - | vlm_eval_kit.yaml | - | | Workplace Assistant | agent | Workplace assistant multi-step tool-using environment | Improve multi-step tool use capability | ✓ | ✓ | Apache 2.0 | workplace_assistant.yaml | Nemotron-RL-agent-workplace_assistant | +| Workplace Assistant Gym | agent | Workplace assistant ported to GymnasiumServer (step/reset). Tool dispatch happens inside step(). | Mirror of workplace_assistant exercised through the gymnasium agent loop. | ✓ | ✓ | Apache 2.0 | workplace_assistant_gym.yaml | Nemotron-RL-agent-workplace_assistant | | Xlam Fc | agent | Salesforce xlam-function-calling-60k tool calling tasks | Improve tool-calling capabilities | ✓ | ✓ | Apache 2.0 | xlam_fc.yaml | - | | Xstest | safety | XSTest safety benchmark - exaggerated safety (over-refusal) evaluation | Evaluate model safety calibration between helpfulness and harmlessness | - | - | - | xstest.yaml | - | From b583fcb762e11b1b69e27009d2108c2a4047020e Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 11:25:16 -0700 Subject: [PATCH 12/16] remove docs Signed-off-by: cmunley1 --- docs/resources-server/gymnasium-api.md | 238 ------------------------- 1 file changed, 238 deletions(-) delete mode 100644 docs/resources-server/gymnasium-api.md diff --git a/docs/resources-server/gymnasium-api.md b/docs/resources-server/gymnasium-api.md deleted file mode 100644 index 04532e303..000000000 --- a/docs/resources-server/gymnasium-api.md +++ /dev/null @@ -1,238 +0,0 @@ -(gymnasium-api)= - - -# Gymnasium API - -`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, and pair with `gymnasium_agent`. - -:::{button-ref} index -:color: secondary -:outline: -:ref-type: doc - -< Back to Resources Server -::: - ---- - -## Interface - -```python -from resources_servers.base_gymnasium import GymnasiumServer -from nemo_gym.openai_utils import NeMoGymResponse - -class MyEnv(GymnasiumServer): - async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: - # Called once at the start of each episode to initialize the environment - # Return (observation, info) - # None observation = use responses_create_params.input as-is. - return None, {} - - async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: - # Called after each model response, executes any actions taken or implements other env logic ( tags, tool calls, user turns, or just single-step verification), computes rewards, determines if done. - # Return (observation, reward, terminated, truncated, info) - # observation: next message to model (tool result, rendering of state, user message, and so on), or None if episode is over. - # reward: per-step reward, accumulated across all steps by the agent. - # terminated: True when the episode ends naturally (task solved, game over, etc). Tells the agent to stop. - # truncated: True when the episode is cut short (step limit, timeout, etc). - # info: arbitrary dict passed through to the final response. Use for diagnostics, scores, metadata. - ... -``` - -The main method to implement is `step()`. Implementing a custom `reset()` is optional — use it to initialize environment state or return an opening message from the environment. The default implementation returns `(None, {})`. - -`metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, and so on) that the environment needs for initialization or scoring. Access fields with `metadata.get("field_name")`. - -`session_id` is a unique string per rollout. Use it as a key to store per-episode state (such as game boards or conversation history) in a dict on your server. Stateless environments can ignore it. - -`action` is the full `NeMoGymResponse` from the model server. Your `step()` parses whatever it needs from it: text content, `` tags, function calls, and so on. Use `extract_text()` for text, or inspect `action.output` directly for structured output like tool calls. - -:::{tip} -For the full single-step and multi-step patterns using the standard `SimpleResourcesServer` API, see {doc}`/environment-tutorials/single-step-environment` and {doc}`/environment-tutorials/multi-step-environment`. -::: - ---- - -## Single-Step - -Single-step environments are the common non-agentic use case: one model call, then grade the output. Implement `step()` so it always returns `terminated=True`. - -```python -class MySingleStepEnv(GymnasiumServer): - async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None): - response_text = extract_text(action) - reward = 1.0 if metadata.get("answer") in response_text else 0.0 - return None, reward, True, False, {} -``` - ---- - -## Multi-Step with Action Tags - -Multiple model calls per episode without native tool calling. The model uses `` tags in its output, `step()` parses them and returns the next observation or terminates. - -```python -import re -from pydantic import Field - -class BlackjackEnv(GymnasiumServer): - session_state: dict = Field(default_factory=dict) - - async def reset(self, metadata, session_id=None): - hand = deal_hand() - self.session_state[session_id] = hand - return f"Your hand: {hand}. hit or stand?", {} - - async def step(self, action, metadata, session_id=None): - text = extract_text(action) - m = re.search(r"\s*(hit|stand)\s*", text, re.IGNORECASE) - decision = m.group(1).lower() if m else "stand" - - hand = self.session_state[session_id] - if decision == "hit": - hand = hit(hand) - if bust(hand): - return None, -1.0, True, False, {} - return f"Your hand: {hand}. hit or stand?", 0.0, False, False, {} - - reward = score_against_dealer(hand) - return None, reward, True, False, {} -``` - ---- - -## Tool Use - -For tool-calling environments, `step()` checks `action.output` for items with `type == "function_call"`, executes them, and returns the results as the next observation. Tool schemas go in `responses_create_params.tools` in your JSONL data so the model knows what tools are available. - -```python -import json -from pydantic import Field - -class MyToolEnv(GymnasiumServer): - session_state: dict = Field(default_factory=dict) - - async def reset(self, metadata, session_id=None): - self.session_state[session_id] = initialize(metadata) - return None, {} - - async def step(self, action, metadata, session_id=None): - tool_calls = [o for o in action.output if o.type == "function_call"] - - if tool_calls: - state = self.session_state[session_id] - results = [] - for call in tool_calls: - args = json.loads(call.arguments) - result = state["functions"][call.name](**args) - results.append(f"{call.name} -> {result}") - return "\n".join(results), 0.0, False, False, {} - - reward = self._grade(action, metadata) - return None, reward, True, False, {} -``` - ---- - -## Multi-Turn - -`step()` returns the next user message as the observation. The `gymnasium_agent` appends it to the conversation and calls the model again. Return `None` to end. - -```python -class MyMultiTurnEnv(GymnasiumServer): - session_turns: dict = Field(default_factory=dict) - - async def reset(self, metadata, session_id=None): - self.session_turns[session_id] = 0 - return None, {} - - async def step(self, action, metadata, session_id=None): - follow_ups = metadata.get("follow_ups", []) - turn = self.session_turns.get(session_id, 0) - - if turn < len(follow_ups): - self.session_turns[session_id] = turn + 1 - return follow_ups[turn], 0.0, False, False, {} - - reward = self._grade(action, metadata) - return None, reward, True, False, {} -``` - ---- - -## LLM-as-Judge - -Use `step()` to call a judge model through `self.server_client` and score the output. The judge model must be configured as a separate model server. See {doc}`/environment-tutorials/llm-as-judge-verification` for the full pattern. - -```python -class MyJudgeEnv(GymnasiumServer): - judge_server: str = "judge_model" # name of the model server in YAML - - async def step(self, action, metadata, session_id=None): - response_text = extract_text(action) - judge_input = f"Question: {metadata.get('question')}\nAnswer: {response_text}\nIs this correct? Say YES or NO." - judge_resp = await self.server_client.post( - server_name=self.judge_server, - url_path="/v1/responses", - json={"input": [{"role": "user", "content": judge_input}]}, - ) - judgment = await judge_resp.json() - reward = 1.0 if "YES" in str(judgment.get("output_text", "")).upper() else 0.0 - return None, reward, True, False, {} -``` - ---- - -## Reward Model - -Same pattern. Call a reward model endpoint and use its score directly. - -```python -class MyRewardModelEnv(GymnasiumServer): - rm_server: str = "reward_model" - - async def step(self, action, metadata, session_id=None): - resp = await self.server_client.post( - server_name=self.rm_server, - url_path="/v1/score", - json={"input": metadata.get("prompt"), "response": extract_text(action)}, - ) - score = (await resp.json()).get("score", 0.0) - return None, score, True, False, {} -``` - ---- - -## YAML Configuration - -`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. The agent config references the env server through `env_server`. - -```yaml -my_env_instance: - resources_servers: - my_env: - entrypoint: app.py - domain: knowledge - -my_gymnasium_agent_instance: - responses_api_agents: - gymnasium_agent: - entrypoint: app.py - env_server: - type: resources_servers - name: my_env - model_server: - type: responses_api_models - name: policy_model - max_steps: 10 - datasets: - - name: example - type: example - jsonl_fpath: resources_servers/my_env/data/example.jsonl -``` - ---- - -## Examples - -- [`blackjack`](https://github.com/NVIDIA-NeMo/Gym/tree/main/resources_servers/blackjack) — Multi-step game with action tags From e276f56f49b69653e55e39c48cfc9220d380c8e7 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 11:27:18 -0700 Subject: [PATCH 13/16] readme Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/README.md | 203 ++++++++++++++++++++- 1 file changed, 201 insertions(+), 2 deletions(-) diff --git a/resources_servers/base_gymnasium/README.md b/resources_servers/base_gymnasium/README.md index b87e71750..77e548fe9 100644 --- a/resources_servers/base_gymnasium/README.md +++ b/resources_servers/base_gymnasium/README.md @@ -1,9 +1,208 @@ # Gymnasium -Base resources server (`GymnasiumServer`) for gymnasium-style environments. Not a standalone server. Import from here: +`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, and pair with `gymnasium_agent`. Not a standalone server. ```python from resources_servers.base_gymnasium import GymnasiumServer ``` -See `docs/resources-server/gymnasium-api.md` for usage and `resources_servers/blackjack/` for a working example. +## Interface + +```python +from resources_servers.base_gymnasium import GymnasiumServer +from nemo_gym.openai_utils import NeMoGymResponse + +class MyEnv(GymnasiumServer): + async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: + # Called once at the start of each episode to initialize the environment. + # Return (observation, info). + # None observation = use responses_create_params.input as-is. + return None, {} + + async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: + # Called after each model response. Executes any actions taken or other env logic + # ( tags, tool calls, user turns, or single-step verification), computes + # rewards, determines if done. + # Return (observation, reward, terminated, truncated, info). + # observation: next message to the model (tool result, rendering of state, + # user message, etc.), or None if the episode is over. + # reward: per-step reward, accumulated across all steps by the agent. + # terminated: True when the episode ends naturally (task solved, game over). + # truncated: True when the episode is cut short (step limit, timeout). + # info: arbitrary dict passed through to the final response. Use for diagnostics, + # scores, metadata. Set info["tool_outputs"] = [{call_id, output}, ...] to have + # the agent surface tool results as proper function_call_output items instead of + # a plain user-message observation. + ... +``` + +The main method to implement is `step()`. Implementing a custom `reset()` is optional; use it to initialize environment state or return an opening message from the environment. The default implementation returns `(None, {})`. + +`metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, etc.) that the environment needs for initialization or scoring. Access fields with `metadata.get("field_name")`. + +`session_id` is a unique string per rollout. Use it as a key to store per-episode state (game boards, conversation history, etc.) in a dict on your server. Stateless environments can ignore it. + +`action` is the full `NeMoGymResponse` from the model server. Your `step()` parses whatever it needs from it: text content, `` tags, function calls, etc. Use `extract_text()` for text, or inspect `action.output` directly for structured output like tool calls. + +## Single-Step + +Single-step environments are the common non-agentic case: one model call, then grade the output. Implement `step()` so it always returns `terminated=True`. + +```python +class MySingleStepEnv(GymnasiumServer): + async def step(self, action, metadata, session_id=None): + response_text = extract_text(action) + reward = 1.0 if metadata.get("answer") in response_text else 0.0 + return None, reward, True, False, {} +``` + +## Multi-Step with Action Tags + +Multiple model calls per episode without native tool calling. The model uses `` tags in its output; `step()` parses them and returns the next observation or terminates. + +```python +import re + +class BlackjackEnv(GymnasiumServer): + async def reset(self, metadata, session_id=None): + hand = deal_hand() + self.session_state[session_id] = hand + return f"Your hand: {hand}. hit or stand?", {} + + async def step(self, action, metadata, session_id=None): + text = extract_text(action) + m = re.search(r"\s*(hit|stand)\s*", text, re.IGNORECASE) + decision = m.group(1).lower() if m else "stand" + + hand = self.session_state[session_id] + if decision == "hit": + hand = hit(hand) + if bust(hand): + return None, -1.0, True, False, {} + return f"Your hand: {hand}. hit or stand?", 0.0, False, False, {} + + reward = score_against_dealer(hand) + return None, reward, True, False, {} +``` + +## Tool Use + +For tool-calling environments, `step()` inspects `action.output` for items with `type == "function_call"`, executes them, and returns per-call outputs in `info["tool_outputs"]`. The agent synthesizes proper `function_call_output` items tied to each `call_id`, so the model sees the tool_call/response structure it was trained on. Tool schemas go in `responses_create_params.tools` in your JSONL so the model knows what tools are available. + +```python +import json + +class MyToolEnv(GymnasiumServer): + async def reset(self, metadata, session_id=None): + self.session_state[session_id] = initialize(metadata) + return None, {} + + async def step(self, action, metadata, session_id=None): + tool_calls = [o for o in action.output if o.type == "function_call"] + + if tool_calls: + state = self.session_state[session_id] + outputs = [] + for call in tool_calls: + args = json.loads(call.arguments) + result = state["functions"][call.name](**args) + outputs.append({"call_id": call.call_id, "output": json.dumps(result, default=str)}) + return None, 0.0, False, False, {"tool_outputs": outputs} + + reward = self._grade(action, metadata) + return None, reward, True, False, {} +``` + +## Multi-Turn + +`step()` returns the next user message as the observation. The `gymnasium_agent` appends it to the conversation and calls the model again. Return `None` as the observation to end. + +```python +class MyMultiTurnEnv(GymnasiumServer): + async def reset(self, metadata, session_id=None): + self.session_state[session_id] = {"turn": 0} + return None, {} + + async def step(self, action, metadata, session_id=None): + follow_ups = metadata.get("follow_ups", []) + state = self.session_state[session_id] + + if state["turn"] < len(follow_ups): + msg = follow_ups[state["turn"]] + state["turn"] += 1 + return msg, 0.0, False, False, {} + + reward = self._grade(action, metadata) + return None, reward, True, False, {} +``` + +## LLM-as-Judge + +Use `step()` to call a judge model through `self.server_client` and score the output. The judge model must be configured as a separate model server. + +```python +class MyJudgeEnv(GymnasiumServer): + judge_server: str = "judge_model" # name of the model server in YAML + + async def step(self, action, metadata, session_id=None): + response_text = extract_text(action) + judge_input = f"Question: {metadata.get('question')}\nAnswer: {response_text}\nIs this correct? Say YES or NO." + judge_resp = await self.server_client.post( + server_name=self.judge_server, + url_path="/v1/responses", + json={"input": [{"role": "user", "content": judge_input}]}, + ) + judgment = await judge_resp.json() + reward = 1.0 if "YES" in str(judgment.get("output_text", "")).upper() else 0.0 + return None, reward, True, False, {} +``` + +## Reward Model + +Same pattern. Call a reward model endpoint and use its score directly. + +```python +class MyRewardModelEnv(GymnasiumServer): + rm_server: str = "reward_model" + + async def step(self, action, metadata, session_id=None): + resp = await self.server_client.post( + server_name=self.rm_server, + url_path="/v1/score", + json={"input": metadata.get("prompt"), "response": extract_text(action)}, + ) + score = (await resp.json()).get("score", 0.0) + return None, score, True, False, {} +``` + +## YAML Configuration + +`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. The agent config references the env server through `env_server`. + +```yaml +my_env_instance: + resources_servers: + my_env: + entrypoint: app.py + domain: knowledge + +my_gymnasium_agent_instance: + responses_api_agents: + gymnasium_agent: + entrypoint: app.py + env_server: + type: resources_servers + name: my_env + model_server: + type: responses_api_models + name: policy_model + max_steps: 10 + datasets: + - name: example + type: example + jsonl_fpath: resources_servers/my_env/data/example.jsonl +``` + +## Examples + +- [`blackjack`](https://github.com/NVIDIA-NeMo/Gym/tree/main/resources_servers/blackjack) — Multi-step game with action tags. From 00d8b388c45b0a52991f78e7c9700c03adcd95ca Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 11:29:27 -0700 Subject: [PATCH 14/16] readme Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/README.md | 32 ++++++++-------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/resources_servers/base_gymnasium/README.md b/resources_servers/base_gymnasium/README.md index 77e548fe9..67ea751b1 100644 --- a/resources_servers/base_gymnasium/README.md +++ b/resources_servers/base_gymnasium/README.md @@ -1,6 +1,6 @@ # Gymnasium -`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, and pair with `gymnasium_agent`. Not a standalone server. +`GymnasiumServer` is a [Gymnasium](https://gymnasium.farama.org/)-style base class for resources servers. Implement `step()`, optionally `reset()`, and use with `gymnasium_agent`. Not a standalone server. ```python from resources_servers.base_gymnasium import GymnasiumServer @@ -14,29 +14,21 @@ from nemo_gym.openai_utils import NeMoGymResponse class MyEnv(GymnasiumServer): async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: - # Called once at the start of each episode to initialize the environment. - # Return (observation, info). - # None observation = use responses_create_params.input as-is. - return None, {} + return None, {} # (observation, info); None obs = use input as-is async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: - # Called after each model response. Executes any actions taken or other env logic - # ( tags, tool calls, user turns, or single-step verification), computes - # rewards, determines if done. - # Return (observation, reward, terminated, truncated, info). - # observation: next message to the model (tool result, rendering of state, - # user message, etc.), or None if the episode is over. - # reward: per-step reward, accumulated across all steps by the agent. - # terminated: True when the episode ends naturally (task solved, game over). - # truncated: True when the episode is cut short (step limit, timeout). - # info: arbitrary dict passed through to the final response. Use for diagnostics, - # scores, metadata. Set info["tool_outputs"] = [{call_id, output}, ...] to have - # the agent surface tool results as proper function_call_output items instead of - # a plain user-message observation. - ... + ... # (observation, reward, terminated, truncated, info) ``` -The main method to implement is `step()`. Implementing a custom `reset()` is optional; use it to initialize environment state or return an opening message from the environment. The default implementation returns `(None, {})`. +`reset()` runs once per episode. `step()` runs after each model response and returns the 5-tuple: + +- **observation**: next message to the model, or `None` to end the episode. +- **reward**: per-step reward; the agent sums across steps. +- **terminated**: episode ended naturally (task solved, game over). +- **truncated**: episode cut short (step limit, timeout). +- **info**: passthrough dict. Set `info["tool_outputs"] = [{call_id, output}, ...]` to have the agent feed tool results back as proper `function_call_output` items instead of a user-message observation. + +Only `step()` is required. The default `reset()` returns `(None, {})`. `metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, etc.) that the environment needs for initialization or scoring. Access fields with `metadata.get("field_name")`. From 7017efd0cc7ab38ba6c62d93b5f1813e97cc940c Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 11:42:57 -0700 Subject: [PATCH 15/16] renaming Signed-off-by: cmunley1 --- resources_servers/base_gymnasium/README.md | 32 +++++++++---------- resources_servers/base_gymnasium/base.py | 11 ++++++- .../configs/base_gymnasium.yaml | 2 +- .../blackjack/configs/blackjack.yaml | 2 +- resources_servers/blackjack/tests/test_app.py | 2 +- responses_api_agents/gymnasium_agent/app.py | 8 ++--- .../gymnasium_agent/tests/test_app.py | 6 ++-- 7 files changed, 35 insertions(+), 28 deletions(-) diff --git a/resources_servers/base_gymnasium/README.md b/resources_servers/base_gymnasium/README.md index 67ea751b1..0cef836b5 100644 --- a/resources_servers/base_gymnasium/README.md +++ b/resources_servers/base_gymnasium/README.md @@ -14,7 +14,7 @@ from nemo_gym.openai_utils import NeMoGymResponse class MyEnv(GymnasiumServer): async def reset(self, metadata: dict, session_id=None) -> tuple[str | None, dict]: - return None, {} # (observation, info); None obs = use input as-is + return None, {} # (observation, info); observation (if set) is appended to input async def step(self, action: NeMoGymResponse, metadata: dict, session_id=None) -> tuple[str | None, float, bool, bool, dict]: ... # (observation, reward, terminated, truncated, info) @@ -23,18 +23,18 @@ class MyEnv(GymnasiumServer): `reset()` runs once per episode. `step()` runs after each model response and returns the 5-tuple: - **observation**: next message to the model, or `None` to end the episode. -- **reward**: per-step reward; the agent sums across steps. +- **reward**: per-step reward; the agent sums across steps by default. - **terminated**: episode ended naturally (task solved, game over). - **truncated**: episode cut short (step limit, timeout). -- **info**: passthrough dict. Set `info["tool_outputs"] = [{call_id, output}, ...]` to have the agent feed tool results back as proper `function_call_output` items instead of a user-message observation. +- **info**: extra metadata the env wants to return (debug info, scores, stats). Also how the env sends `tool_outputs` to the agent (see Tool Use). -Only `step()` is required. The default `reset()` returns `(None, {})`. +Only `step()` is required. The default `reset()` returns `(None, {})`, meaning the prompt from the dataset is used as-is. A non-`None` observation from `reset()` is appended to the input as a user message before the first model call. -`metadata` is the `verifier_metadata` dict from your input JSONL, passed through unchanged. This is where you put task-specific data (expected answers, test cases, board configurations, etc.) that the environment needs for initialization or scoring. Access fields with `metadata.get("field_name")`. +The three arguments shown in the signatures above: -`session_id` is a unique string per rollout. Use it as a key to store per-episode state (game boards, conversation history, etc.) in a dict on your server. Stateless environments can ignore it. - -`action` is the full `NeMoGymResponse` from the model server. Your `step()` parses whatever it needs from it: text content, `` tags, function calls, etc. Use `extract_text()` for text, or inspect `action.output` directly for structured output like tool calls. +- **`metadata`**: any extra fields from the JSONL row (e.g. `ground_truth`, `category`). Use for task config or scoring references. Access via `metadata.get("field")`. +- **`session_id`**: unique string per rollout. Use as a key into `self.session_state` for per-episode state (game boards, conversation history, etc.). Stateless envs can ignore it. +- **`action`**: the model's `NeMoGymResponse` for the current turn. Use `extract_text(action)` for text or iterate `action.output` for structured items like `function_call`. ## Single-Step @@ -93,18 +93,16 @@ class MyToolEnv(GymnasiumServer): tool_calls = [o for o in action.output if o.type == "function_call"] if tool_calls: - state = self.session_state[session_id] - outputs = [] - for call in tool_calls: - args = json.loads(call.arguments) - result = state["functions"][call.name](**args) - outputs.append({"call_id": call.call_id, "output": json.dumps(result, default=str)}) + fns = self.session_state[session_id]["functions"] + outputs = [self.tool_output(c, fns[c.name](**json.loads(c.arguments))) for c in tool_calls] return None, 0.0, False, False, {"tool_outputs": outputs} reward = self._grade(action, metadata) return None, reward, True, False, {} ``` +`self.tool_output(call, result)` is a helper on `GymnasiumServer` that builds the `{"call_id", "output"}` dict the agent expects (JSON-serializes the result for you). + ## Multi-Turn `step()` returns the next user message as the observation. The `gymnasium_agent` appends it to the conversation and calls the model again. Return `None` as the observation to end. @@ -169,7 +167,7 @@ class MyRewardModelEnv(GymnasiumServer): ## YAML Configuration -`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. The agent config references the env server through `env_server`. +`GymnasiumServer` pairs with `gymnasium_agent` instead of `simple_agent`. Same shape as the `simple_agent` config, with the agent referencing the environment through `resources_server`. ```yaml my_env_instance: @@ -182,7 +180,7 @@ my_gymnasium_agent_instance: responses_api_agents: gymnasium_agent: entrypoint: app.py - env_server: + resources_server: type: resources_servers name: my_env model_server: @@ -197,4 +195,4 @@ my_gymnasium_agent_instance: ## Examples -- [`blackjack`](https://github.com/NVIDIA-NeMo/Gym/tree/main/resources_servers/blackjack) — Multi-step game with action tags. +- [`blackjack`](https://github.com/NVIDIA-NeMo/Gym/tree/main/resources_servers/blackjack): multi-step game with action tags. diff --git a/resources_servers/base_gymnasium/base.py b/resources_servers/base_gymnasium/base.py index b0e898728..211e30d03 100644 --- a/resources_servers/base_gymnasium/base.py +++ b/resources_servers/base_gymnasium/base.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json from abc import abstractmethod from typing import Any, Dict, Optional @@ -20,7 +21,11 @@ from pydantic import BaseModel, ConfigDict, Field from nemo_gym.base_resources_server import BaseVerifyRequest, SimpleResourcesServer -from nemo_gym.openai_utils import NeMoGymResponse, NeMoGymResponseCreateParamsNonStreaming +from nemo_gym.openai_utils import ( + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, + NeMoGymResponseFunctionToolCall, +) from nemo_gym.server_utils import SESSION_ID_KEY @@ -102,5 +107,9 @@ async def step( async def close_session(self, session_id: Optional[str]) -> None: self.session_state.pop(session_id, None) + @staticmethod + def tool_output(call: NeMoGymResponseFunctionToolCall, result: Any) -> dict: + return {"call_id": call.call_id, "output": json.dumps(result, default=str)} + async def verify(self, body: BaseVerifyRequest) -> None: # type: ignore[override] raise NotImplementedError("GymnasiumServer uses /step instead of /verify. Use with gymnasium_agent.") diff --git a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml index 2847ae1d1..b24e8f386 100644 --- a/resources_servers/base_gymnasium/configs/base_gymnasium.yaml +++ b/resources_servers/base_gymnasium/configs/base_gymnasium.yaml @@ -10,7 +10,7 @@ base_gymnasium_agent: responses_api_agents: gymnasium_agent: entrypoint: app.py - env_server: + resources_server: type: resources_servers name: base_gymnasium model_server: diff --git a/resources_servers/blackjack/configs/blackjack.yaml b/resources_servers/blackjack/configs/blackjack.yaml index 9d2e81131..0dae1e2d8 100644 --- a/resources_servers/blackjack/configs/blackjack.yaml +++ b/resources_servers/blackjack/configs/blackjack.yaml @@ -10,7 +10,7 @@ blackjack_gymnasium_agent: responses_api_agents: gymnasium_agent: entrypoint: app.py - env_server: + resources_server: type: resources_servers name: blackjack model_server: diff --git a/resources_servers/blackjack/tests/test_app.py b/resources_servers/blackjack/tests/test_app.py index 2597b72d6..0c0b2e6c6 100644 --- a/resources_servers/blackjack/tests/test_app.py +++ b/resources_servers/blackjack/tests/test_app.py @@ -135,7 +135,7 @@ class TestActionParser: async def _decide(self, text: str) -> str: env = _make_env() env.session_state["sid"] = { - "player": ["2", "2"], # 4 — can't bust on hit + "player": ["2", "2"], # 4, can't bust on hit "dealer": ["10"], "rng": __import__("random").Random(0), } diff --git a/responses_api_agents/gymnasium_agent/app.py b/responses_api_agents/gymnasium_agent/app.py index aa75799b4..0312f2fcf 100644 --- a/responses_api_agents/gymnasium_agent/app.py +++ b/responses_api_agents/gymnasium_agent/app.py @@ -35,7 +35,7 @@ class GymnasiumAgentConfig(BaseResponsesAPIAgentConfig): - env_server: ResourcesServerRef + resources_server: ResourcesServerRef model_server: ModelServerRef max_steps: int = Field(10, ge=1) @@ -76,7 +76,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi env_cookies = request.cookies reset_resp = await self.server_client.post( - server_name=self.config.env_server.name, + server_name=self.config.resources_server.name, url_path="/reset", json=body.model_dump(), cookies=env_cookies, @@ -128,7 +128,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi usage.output_tokens_details.reasoning_tokens = 0 step_resp = await self.server_client.post( - server_name=self.config.env_server.name, + server_name=self.config.resources_server.name, url_path="/step", json=body.model_dump() | {"response": model_response.model_dump()}, cookies=env_cookies, @@ -171,7 +171,7 @@ async def run(self, request: Request, body: GymnasiumAgentRunRequest) -> Gymnasi async def aggregate_metrics(self, body: AggregateMetricsRequest = Body()) -> AggregateMetrics: response = await self.server_client.post( - server_name=self.config.env_server.name, + server_name=self.config.resources_server.name, url_path="/aggregate_metrics", json=body, ) diff --git a/responses_api_agents/gymnasium_agent/tests/test_app.py b/responses_api_agents/gymnasium_agent/tests/test_app.py index d2284ac7e..68ed67b4e 100644 --- a/responses_api_agents/gymnasium_agent/tests/test_app.py +++ b/responses_api_agents/gymnasium_agent/tests/test_app.py @@ -28,7 +28,7 @@ def _make_agent(max_steps=10): port=0, entrypoint="", name="test_gymnasium_agent", - env_server=ResourcesServerRef(type="resources_servers", name="my_env"), + resources_server=ResourcesServerRef(type="resources_servers", name="my_env"), model_server=ModelServerRef(type="responses_api_models", name="policy_model"), max_steps=max_steps, ) @@ -116,7 +116,7 @@ def test_max_steps_validator_rejects_zero(self): port=0, entrypoint="", name="x", - env_server=ResourcesServerRef(type="resources_servers", name="e"), + resources_server=ResourcesServerRef(type="resources_servers", name="e"), model_server=ModelServerRef(type="responses_api_models", name="m"), max_steps=0, ) @@ -172,7 +172,7 @@ async def test_multi_step_preserves_output_items_in_history(self): result = await agent.run(req, body) assert result.reward == 1.0 assert result.terminated is True - # Inspect turn-2 model call body — its input must contain the full turn-1 output item, + # Inspect turn-2 model call body: its input must contain the full turn-1 output item, # not a flattened string, and the obs-1 appended as user message. turn2_body = [body for (s, u, body) in call_log if u == "/v1/responses"][1] turn2_input = turn2_body.input From f38c60f09522ebab654d2b1678fd5c1ccf46e697 Mon Sep 17 00:00:00 2001 From: cmunley1 Date: Thu, 23 Apr 2026 11:55:38 -0700 Subject: [PATCH 16/16] readme Signed-off-by: cmunley1 --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index f2d938043..d9913d7b5 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,6 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace). | Vlm Eval Kit | other | - | Measure VLM capabilities | - | ✓ | - | OCRBench.yaml | - | | Vlm Eval Kit | other | Run all supported VLMEvalKit benchmarks. | Measure VLM capabilities | - | ✓ | - | vlm_eval_kit.yaml | - | | Workplace Assistant | agent | Workplace assistant multi-step tool-using environment | Improve multi-step tool use capability | ✓ | ✓ | Apache 2.0 | workplace_assistant.yaml | Nemotron-RL-agent-workplace_assistant | -| Workplace Assistant Gym | agent | Workplace assistant ported to GymnasiumServer (step/reset). Tool dispatch happens inside step(). | Mirror of workplace_assistant exercised through the gymnasium agent loop. | ✓ | ✓ | Apache 2.0 | workplace_assistant_gym.yaml | Nemotron-RL-agent-workplace_assistant | | Xlam Fc | agent | Salesforce xlam-function-calling-60k tool calling tasks | Improve tool-calling capabilities | ✓ | ✓ | Apache 2.0 | xlam_fc.yaml | - | | Xstest | safety | XSTest safety benchmark - exaggerated safety (over-refusal) evaluation | Evaluate model safety calibration between helpfulness and harmlessness | - | - | - | xstest.yaml | - |