NVIDIA-NeMo
diff --git a/benchmarks/finance_sec_search/README.md b/benchmarks/finance_sec_search/README.md
Lines changed: 62 additions & 0 deletions
diff --git a/benchmarks/finance_sec_search/__init__.py b/benchmarks/finance_sec_search/__init__.py
Lines changed: 14 additions & 0 deletions
diff --git a/benchmarks/finance_sec_search/config_no_web_search.yaml b/benchmarks/finance_sec_search/config_no_web_search.yaml
Lines changed: 28 additions & 0 deletions
diff --git a/benchmarks/finance_sec_search/config_web_search.yaml b/benchmarks/finance_sec_search/config_web_search.yaml
Lines changed: 29 additions & 0 deletions
diff --git a/benchmarks/finance_sec_search/data/.gitignore b/benchmarks/finance_sec_search/data/.gitignore
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1,62 @@
+# Finance SEC Search
+
+50-question financial information retrieval benchmark from the
+[Vals AI finance-agent](https://github.com/vals-ai/finance-agent) public
+dataset. Questions cover SEC EDGAR filings, financial metrics, and
+company analysis.
+
+## Verification
+
+Uses an LLM-as-judge with a financial grading rubric (0/1/2 scale).
+Only answers the judge grades as fully correct (`[[2]]`) receive reward
+1.0. The judge prompt and rubric are defined in the `finance_sec_search`
+resource server's `/prompt_templates`.
+
+## Tools
+
+| Tool | Description |
+|------|-------------|
+| `sec_filing_search` | Search SEC EDGAR for filing metadata by stock ticker symbol |
+| `parse_html_page` | Fetch and parse any HTML page (SEC URLs use a disk cache) and store the result under a key |
+| `retrieve_information` | Query stored documents via LLM prompt with `{{key}}` placeholders |
+| `submit_final_result` | Submit the final answer (required to receive a reward) |
+| `web_search` | Internet search via Tavily API (optional — requires `tavily_api_key` in `env.yaml`) |
+
+## Data preparation
+
+Without web search:
+
+```bash
+ng_prepare_benchmark '+config_paths=[benchmarks/finance_sec_search/config_no_web_search.yaml]'
+```
+
+With web search (requires `tavily_api_key` in `env.yaml`):
+
+```bash
+ng_prepare_benchmark '+config_paths=[benchmarks/finance_sec_search/config_web_search.yaml]'
+```
+
+Either command downloads `public.csv` from the Vals AI GitHub repo and
+writes the benchmark JSONL to `benchmarks/finance_sec_search/data/`:
+
+| Config | Output file |
+|--------|-------------|
+| `config_no_web_search.yaml` | `data/finance_sec_search_benchmark.jsonl` |
+| `config_web_search.yaml` | `data/finance_sec_search_benchmark_web_search.jsonl` |
+
+## Running servers
+
+```bash
+config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
+benchmarks/finance_sec_search/config_no_web_search.yaml"
+ng_run "+config_paths=[$config_paths]"
+```
+
+## Collecting rollouts
+
+```bash
+ng_collect_rollouts \
+    +agent_name=finance_sec_search_benchmark_agent \
+    +input_jsonl_fpath=benchmarks/finance_sec_search/data/finance_sec_search_benchmark.jsonl \
+    +output_jsonl_fpath=results/finance_sec_search_rollouts.jsonl
+```
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,28 @@
+# Chain to existing finance_sec_search resource server + agent config
+config_paths:
+  - resources_servers/finance_sec_search/configs/finance_sec_search.yaml
+
+# Defaults for judge model interpolation variables from finance_sec_search.yaml.
+# Override via env.yaml or CLI when running actual evaluations.
+search_judge_model_base_url: https://api.openai.com/v1
+search_judge_model_api_key: ""
+search_judge_model_name: gpt-4o
+
+# Isolated copy of the resource server for this benchmark
+finance_sec_search_benchmark_resources_server:
+  _inherit_from: finance_sec_search_resources_server
+
+# Benchmark agent — inherits from finance_agent; overrides the resources server and datasets
+finance_sec_search_benchmark_agent:
+  _inherit_from: finance_agent
+  responses_api_agents:
+    finance_agent:
+      resources_server:
+        name: finance_sec_search_benchmark_resources_server
+      datasets:
+      - name: finance_sec_search
+        type: benchmark
+        jsonl_fpath: benchmarks/finance_sec_search/data/finance_sec_search_benchmark.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/finance_sec_search/prepare.py
+        num_repeats: 1
@@ -0,0 +1,29 @@
+# Chain to existing finance_sec_search resource server + agent config
+config_paths:
+  - resources_servers/finance_sec_search/configs/finance_sec_search.yaml
+
+# Defaults for judge model interpolation variables from finance_sec_search.yaml.
+# Override via env.yaml or CLI when running actual evaluations.
+search_judge_model_base_url: https://api.openai.com/v1
+search_judge_model_api_key: ""
+search_judge_model_name: gpt-4o
+
+# Isolated copy of the resource server for this benchmark
+finance_sec_search_web_search_benchmark_resources_server:
+  _inherit_from: finance_sec_search_resources_server
+
+# Benchmark agent — inherits from finance_agent; overrides the resources server and datasets
+# Uses web_search variant (requires tavily_api_key in env.yaml)
+finance_sec_search_web_search_benchmark_agent:
+  _inherit_from: finance_agent
+  responses_api_agents:
+    finance_agent:
+      resources_server:
+        name: finance_sec_search_web_search_benchmark_resources_server
+      datasets:
+      - name: finance_sec_search
+        type: benchmark
+        jsonl_fpath: benchmarks/finance_sec_search/data/finance_sec_search_benchmark_web_search.jsonl
+        prompt_config: null
+        prepare_script: benchmarks/finance_sec_search/prepare_web_search.py
+        num_repeats: 1
@@ -0,0 +1 @@
+*.jsonl