Skip to content

Commit 894cee3

Browse files
authored
Merge branch 'main' into wedu/nemo-rl-update
2 parents 0feb908 + 04284cd commit 894cee3

30 files changed

Lines changed: 1712 additions & 554 deletions

File tree

docs/basics/code-packaging.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,21 @@ running commands from it.
4040
```
4141
In all cases, uncommitted code will not be used.
4242

43+
!!! note
44+
45+
You can override the default packaging behavior with the following environment variables:
46+
47+
- `NEMO_SKILLS_FORCE_PATTERN_PACKAGER=1` — Skip git-based packaging entirely and always use the installed
48+
`nemo_skills` package tree (PatternPackager). Useful when you have an editable install and don't want
49+
packaging tied to the git state of your current directory.
50+
- `NEMO_SKILLS_FORCE_INSTALLED_PACKAGE=1` — When running from a git repo, use the installed `nemo_skills`
51+
package instead of the repo's `nemo_skills/` directory. The git repo is still packaged, but `nemo_skills`
52+
is picked up from the installed location. Useful when your repo checkout has extra files you don't want
53+
uploaded.
54+
55+
Note that `NEMO_SKILLS_FORCE_INSTALLED_PACKAGE` has no effect when `NEMO_SKILLS_FORCE_PATTERN_PACKAGER`
56+
is also set, since the latter bypasses the git repo branch entirely.
57+
4358

4459
Finally, it's important to keep in mind that whenever you submit a new experiment, NeMo-Run will create a copy of your
4560
code package both locally (inside `~/.nemo_run`) and on cluster (inside `ssh_tunnel/job_dir` path in your cluster config).

nemo_skills/code_execution/local_sandbox/local_sandbox_server.py

Lines changed: 76 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,38 @@ def stop_shell(self, shell_id):
212212
proc.terminate()
213213
proc.join(timeout=2.0)
214214

215+
def _finish_restart(self, shell_id):
    """Spawn a replacement shell for `shell_id` and flag it for session restore.

    IMPORTANT: call this only after leaving any `except` block. start_shell()
    goes through multiprocessing.Process.start(), which forks. A child forked
    while the parent is still handling an exception inherits the parent's
    sys.exc_info(), so Python's implicit exception chaining would attach that
    stale parent exception as context to every later exception raised in the
    child.

    Args:
        shell_id: Identifier of the shell to recreate.
    """
    self.start_shell(shell_id)
    with self.manager_lock:
        # The entry can be missing if the new shell was not registered;
        # in that case there is nothing to flag.
        fresh_entry = self.shells.get(shell_id)
        if fresh_entry is not None:
            fresh_entry["restart_pending"] = True
229+
def _cleanup_shell_resources(self, proc, conn):
    """Best-effort teardown of a shell's worker process and its pipe.

    Every step swallows its own errors so that a failure in one step never
    prevents the remaining steps from running, and so that callers can use
    this on shells that are already dead or half torn down.

    Args:
        proc: The shell's process handle (presumably a
            multiprocessing.Process; only terminate/kill/join/is_alive and
            pid are used).
        conn: The pipe connection to that process; only close() is used.
    """

    def _force_kill():
        # Hard kill; fall back to a raw SIGKILL if the handle refuses.
        try:
            proc.kill()
        except Exception:
            try:
                os.kill(proc.pid, signal.SIGKILL)
            except Exception:
                pass

    def _reap():
        # Bounded wait so a stuck child can never hang the caller.
        try:
            proc.join(timeout=2.0)
        except Exception:
            pass

    try:
        proc.terminate()
    except Exception:
        _force_kill()
    _reap()

    # terminate() only *requests* termination and join() returns silently on
    # timeout, so a child that ignores or blocks the request would otherwise
    # be left running after its shell entry is dropped. Escalate in that case.
    try:
        still_running = proc.is_alive()
    except Exception:
        still_running = False
    if still_running:
        _force_kill()
        _reap()

    try:
        conn.close()
    except Exception:
        pass
246+
215247
def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="Plain"):
216248
"""
217249
Execute `code` on shell `shell_id`.
@@ -252,18 +284,35 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
252284

253285
exec_id = time.time_ns()
254286
with lock:
287+
_need_restart = False
255288
# send execution request
256289
try:
257290
conn.send({"cmd": "exec", "id": exec_id, "code": code, "traceback_verbosity": traceback_verbosity})
258291
except Exception as exc:
259-
return {
292+
logging.warning(f"Shell process for {shell_id} failed before execution request was sent, restarting")
293+
# clean up the shell process and connection - best-effort
294+
self._cleanup_shell_resources(proc, conn)
295+
296+
# remove the old shell entry
297+
with self.manager_lock:
298+
self.shells.pop(shell_id, None)
299+
300+
_need_restart = True
301+
_exc_msg = str(exc)
302+
_restart_result = {
260303
"status": "error",
261-
"msg": f"send failed: {exc}",
304+
"msg": f"send failed: {_exc_msg}",
262305
"shell_was_created": shell_was_created,
306+
"shell_was_restarted": True,
263307
"shell_was_recently_restarted": shell_was_recently_restarted,
264308
}
265309

310+
if _need_restart:
311+
self._finish_restart(shell_id)
312+
return _restart_result
313+
266314
# wait for the result up to `timeout`
315+
_need_restart = False
267316
if conn.poll(timeout):
268317
try:
269318
result = conn.recv()
@@ -273,23 +322,27 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
273322
except EOFError:
274323
# Connection closed - shell process died, need to restart
275324
logging.warning(f"Shell process for {shell_id} died during execution, restarting")
276-
with self.manager_lock:
277-
self.shells.pop(shell_id, None)
278-
self.start_shell(shell_id)
279325

280-
# Mark the new shell as having a restart pending
326+
# clean up the shell process and connection - best-effort
327+
self._cleanup_shell_resources(proc, conn)
328+
329+
# remove the old shell entry
281330
with self.manager_lock:
282-
if shell_id in self.shells:
283-
self.shells[shell_id]["restart_pending"] = True
331+
self.shells.pop(shell_id, None)
284332

285-
return {
333+
_need_restart = True
334+
_restart_result = {
286335
"status": "error",
287336
"msg": "connection closed",
288337
"shell_was_created": shell_was_created,
289338
"shell_was_restarted": True,
290339
"shell_was_recently_restarted": shell_was_recently_restarted,
291340
}
292341

342+
if _need_restart:
343+
self._finish_restart(shell_id)
344+
return _restart_result
345+
293346
# no reply yet -> try gentle interrupt (SIGINT)
294347
try:
295348
# Process.send_signal exists on Unix; fallback to os.kill if necessary
@@ -302,6 +355,7 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
302355
pass
303356

304357
# wait short grace period for the shell to handle the interrupt
358+
_need_restart = False
305359
if conn.poll(grace):
306360
try:
307361
result = conn.recv()
@@ -311,48 +365,34 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
311365
except EOFError:
312366
# Connection closed - shell process died, need to restart
313367
logging.warning(f"Shell process for {shell_id} died during interrupt, restarting")
314-
with self.manager_lock:
315-
self.shells.pop(shell_id, None)
316-
self.start_shell(shell_id)
368+
# clean up the shell process and connection - best-effort
369+
self._cleanup_shell_resources(proc, conn)
317370

318-
# Mark the new shell as having a restart pending
371+
# remove the old shell entry
319372
with self.manager_lock:
320-
if shell_id in self.shells:
321-
self.shells[shell_id]["restart_pending"] = True
373+
self.shells.pop(shell_id, None)
322374

323-
return {
375+
_need_restart = True
376+
_restart_result = {
324377
"status": "interrupted",
325378
"msg": "connection closed after interrupt",
326379
"shell_was_created": shell_was_created,
327380
"shell_was_restarted": True,
328381
"shell_was_recently_restarted": shell_was_recently_restarted,
329382
}
330383

331-
# still stuck -> terminate the shell and restart it (drop memory)
332-
try:
333-
proc.terminate()
334-
except Exception:
335-
try:
336-
os.kill(proc.pid, signal.SIGKILL)
337-
except Exception:
338-
pass
339-
proc.join(timeout=2.0)
384+
if _need_restart:
385+
self._finish_restart(shell_id)
386+
return _restart_result
340387

341-
# close old connection (best-effort)
342-
try:
343-
conn.close()
344-
except Exception:
345-
pass
388+
# still stuck -> terminate the shell and restart it (drop memory)
389+
# clean up the shell process and connection - best-effort
390+
self._cleanup_shell_resources(proc, conn)
346391

347392
# remove and restart a fresh shell for this id
348393
with self.manager_lock:
349394
self.shells.pop(shell_id, None)
350-
self.start_shell(shell_id)
351-
352-
# Mark the new shell as having a restart pending
353-
with self.manager_lock:
354-
if shell_id in self.shells:
355-
self.shells[shell_id]["restart_pending"] = True
395+
self._finish_restart(shell_id)
356396

357397
return {
358398
"status": "timeout_killed",

nemo_skills/dataset/arena-hard/prepare.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,9 @@
5050
data = json.loads(line)
5151
data["question"] = data.pop("prompt")
5252
data["baseline_answer"] = baseline_answers[data["uid"]]
53+
# The upstream lmarena/arena-hard-auto source stores a dataset-version
54+
# string ("arena-hard-v0.1") in the category field. v1 has no real
55+
# sub-categories, so drop it to let ArenaJudgeTask.fill_prompt fall
56+
# through to the default prompt via its `if not category` branch.
57+
data.pop("category", None)
5358
fout.write(json.dumps(data) + "\n")

nemo_skills/dataset/livecodebench-cpp/prepare.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,16 @@
2020

2121

2222
class PromptConstants:
23-
# reference: https://github.com/LiveCodeBench/LiveCodeBench/blob/main/lcb_runner/prompts/code_generation.py#L35C5-L38C1
24-
FORMATTING_MESSAGE_WITH_STARTER_CODE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
25-
FORMATTING_WITHOUT_STARTER_CODE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the c++ program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
23+
# reference: https://huggingface.co/nvidia/Nemotron-Cascade-8B/blob/main/evaluation/data/benchmark.py#L1166
24+
FORMATTING_WITHOUT_STARTER_CODE = (
25+
"Write a C++14 or C++17 code to solve the problem. "
26+
"Please place the solution code in the following format:\n"
27+
"```cpp\n// Your solution code here\n```"
28+
)
29+
FORMATTING_MESSAGE_WITH_STARTER_CODE = (
30+
"Please place the solution code in C++14 or C++17 language in the following format:\n"
31+
"```cpp\n// Your solution code here\n```"
32+
)
2633

2734

2835
def parse_data(split):
@@ -46,11 +53,17 @@ def parse_data(split):
4653
def clean_data(dataset, keep_all_columns=False):
4754
def map_fn(data):
4855
if data["starter_code"]:
56+
data["starter_code"] = (
57+
"\n\n"
58+
+ "Solve the problem starting with the provided function header.\n\nFunction header:\n"
59+
+ "```\n"
60+
+ data["starter_code"]
61+
+ "\n```"
62+
)
4963
data["formatting_message"] = PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE
50-
data["starter_code"] = f"```cpp\n{data['starter_code']}\n```"
5164
else:
65+
data["starter_code"] = ""
5266
data["formatting_message"] = PromptConstants.FORMATTING_WITHOUT_STARTER_CODE
53-
data["starter_code"] = "```cpp\n// YOUR CODE HERE\n```"
5467

5568
data["task_id"] = data["question_id"]
5669
return data

nemo_skills/dataset/livecodebench-pro/prepare.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@
2828
("25q3", "quater_2025_7_9", 144),
2929
]
3030

31-
FORMATTING_MESSAGE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the c++ program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
31+
FORMATTING_MESSAGE = (
32+
"Write a self-contained solution in C++14 or C++17. "
33+
"Include all necessary headers. Please place the solution code in the following format:\n"
34+
"```cpp\n// Your solution code here\n```"
35+
)
3236

3337

3438
def download_testcases(local_dir, token):
@@ -65,7 +69,7 @@ def process_problem_splits(output_dir, token):
6569
output_record = dict(row)
6670
output_record["question_content"] = row["problem_statement"]
6771
output_record["formatting_message"] = FORMATTING_MESSAGE
68-
output_record["starter_code"] = "```cpp\n// YOUR CODE HERE\n```"
72+
output_record["starter_code"] = ""
6973
output_record["subset_for_metrics"] = row["difficulty"]
7074
f.write(json.dumps(output_record) + "\n")
7175

nemo_skills/evaluation/evaluator/livecodebench.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ async def eval_livecodebench_async(eval_config: LiveCodeBenchEvaluatorConfig):
211211
"""Evaluation running within a sandbox."""
212212
async with sandbox_context(eval_config.sandbox) as sandbox:
213213
if not await _install_packages_in_sandbox(sandbox, eval_config):
214-
return
214+
raise RuntimeError("Failed to install livecodebench packages in sandbox.")
215215

216216
jsonl_file = eval_config.input_file
217217
LOG.info(f"Processing file: {jsonl_file} in sandbox")
@@ -299,6 +299,10 @@ def eval_livecodebench(cfg):
299299
raise RuntimeError("The 'pypy3' interpreter requires a running sandbox, but the service was unreachable.")
300300

301301
if sandbox_is_ready:
302-
asyncio.run(eval_livecodebench_async(eval_config))
302+
try:
303+
asyncio.run(eval_livecodebench_async(eval_config))
304+
except Exception as e:
305+
LOG.warning(f"Sandbox evaluation failed: {e}. Falling back to local evaluation.")
306+
eval_livecodebench_without_sandbox(eval_config)
303307
else:
304308
eval_livecodebench_without_sandbox(eval_config)

nemo_skills/evaluation/evaluator/scicode.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,18 @@ async def _execute_single_test(args):
5757
code += line + "\n"
5858

5959
sandbox = get_sandbox(**eval_config.sandbox)
60-
output_dict, _ = await sandbox.execute_code(code, timeout=eval_config.timeout, max_output_characters=100000)
61-
62-
return elem_idx, output_dict
60+
session_id = None
61+
try:
62+
output_dict, session_id = await sandbox.execute_code(
63+
code, timeout=eval_config.timeout, max_output_characters=100000
64+
)
65+
return elem_idx, output_dict
66+
finally:
67+
try:
68+
if session_id is not None:
69+
await sandbox.delete_session(str(session_id))
70+
finally:
71+
await sandbox.close()
6372

6473

6574
def test_code(eval_config, scicode_data):

nemo_skills/inference/eval/arena_judge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def setup_prompt(self):
8484
if self.cfg.prompt_format == "openai":
8585
return None
8686

87-
# Load the default prompt (used for most categories including hard_prompt, arena-hard-v0.1, etc.)
87+
# Load the default prompt (used for any category not explicitly mapped below, including hard_prompt)
8888
default_prompt = get_prompt(
8989
prompt_config=self.cfg.prompt_config,
9090
tokenizer=self.tokenizer,

nemo_skills/inference/eval/swebench.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,7 @@ async def _execute_container_command(self, data_point, command, expected_file_pa
509509
apptainer_cmd = (
510510
f"apptainer exec --writable-tmpfs --cleanenv --no-mount home,tmp,bind-paths "
511511
f"--mount type=bind,src=/nemo_run/code,dst=/nemo_run/code "
512+
f"--mount type=bind,src={Path(self.cfg.input_file).parent},dst=/input_mount,ro "
512513
f"--mount type=bind,src=/root,dst=/root_mount,ro "
513514
f"--mount type=bind,src={self.output_dir},dst=/trajectories_mount "
514515
f"{extra_apptainer_args} "
@@ -818,7 +819,7 @@ async def _run_openhands(self, data_point, api_base):
818819
openhands_cmd = (
819820
# make sure /workspace isn't mounted as a safety precaution
820821
# (mounting it in the nemo-skills cluster config is ok, just not inside of apptainer specifically)
821-
"if [ -d /workspace ]; then "
822+
"if awk '{print $2}' /proc/mounts | grep -qE '^/workspace(/|$)'; then "
822823
" echo 'Exiting because /workspace is mounted.' && "
823824
" echo 'Please make sure /workspace is not mounted inside of Apptainer before running OpenHands.' && "
824825
" echo 'This is because OpenHands DELETES EVERYTHING in the /workspace folder if it exists.' && "
@@ -838,7 +839,7 @@ async def _run_openhands(self, data_point, api_base):
838839
"source /root/OpenHands/.venv/bin/activate && "
839840
# copy dataset
840841
f"mkdir {data_dir} && "
841-
f"cp {self.cfg.input_file} {data_dir}/dataset.jsonl && "
842+
f"cp /input_mount/{Path(self.cfg.input_file).name} {data_dir}/dataset.jsonl && "
842843
# set up config files
843844
f"echo {shlex.quote(config_str)} >config.toml && "
844845
f"echo \"selected_ids = ['{data_point['instance_id']}']\" >evaluation/benchmarks/{benchmark_name}/config.toml && "
@@ -969,7 +970,7 @@ async def _process_single_datapoint_impl(self, data_point, data):
969970
"cd /root/SWE-bench && "
970971
# run the evaluation with streaming output
971972
f"/root/SWE-bench/venv/bin/python -m swebench.harness.run_local_evaluation "
972-
f" --raw_sample_path {self.cfg.input_file} "
973+
f" --raw_sample_path /input_mount/{Path(self.cfg.input_file).name} "
973974
f" --patch_path {pred_mounted_path} "
974975
f" --output_dir eval-outputs "
975976
f" --scripts_dir /root/SWE-bench/run_scripts && "
@@ -987,7 +988,7 @@ async def _process_single_datapoint_impl(self, data_point, data):
987988
f" --instance_ids {data_point['instance_id']} "
988989
f" --run_id eval-outputs "
989990
f" --timeout {self.cfg.swebench_tests_timeout} "
990-
f" --dataset_name {self.cfg.input_file} && "
991+
f" --dataset_name /input_mount/{Path(self.cfg.input_file).name} && "
991992
f"cp -r logs/run_evaluation/eval-outputs /trajectories_mount/"
992993
)
993994

nemo_skills/inference/model/context_retry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,4 +528,5 @@ def return_empty_generation_with_error(detailed_error: str, error_reason: str =
528528
"num_generated_tokens": 0,
529529
"error": error_reason,
530530
"detailed_error": detailed_error,
531+
"finish_reason": "error",
531532
}

0 commit comments

Comments
 (0)