NVIDIA-NeMo
diff --git a/docs/basics/code-packaging.md b/docs/basics/code-packaging.md
Lines changed: 15 additions & 0 deletions
diff --git a/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py b/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py
Lines changed: 76 additions & 36 deletions
diff --git a/nemo_skills/dataset/arena-hard/prepare.py b/nemo_skills/dataset/arena-hard/prepare.py
Lines changed: 5 additions & 0 deletions
diff --git a/nemo_skills/dataset/livecodebench-cpp/prepare.py b/nemo_skills/dataset/livecodebench-cpp/prepare.py
Lines changed: 18 additions & 5 deletions
diff --git a/nemo_skills/dataset/livecodebench-pro/prepare.py b/nemo_skills/dataset/livecodebench-pro/prepare.py
Lines changed: 6 additions & 2 deletions
diff --git a/nemo_skills/evaluation/evaluator/livecodebench.py b/nemo_skills/evaluation/evaluator/livecodebench.py
Lines changed: 6 additions & 2 deletions
diff --git a/nemo_skills/evaluation/evaluator/scicode.py b/nemo_skills/evaluation/evaluator/scicode.py
Lines changed: 12 additions & 3 deletions
diff --git a/nemo_skills/inference/eval/arena_judge.py b/nemo_skills/inference/eval/arena_judge.py
Lines changed: 1 addition & 1 deletion
diff --git a/nemo_skills/inference/eval/swebench.py b/nemo_skills/inference/eval/swebench.py
Lines changed: 5 additions & 4 deletions
diff --git a/nemo_skills/inference/model/context_retry.py b/nemo_skills/inference/model/context_retry.py
Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,21 @@ running commands from it.
     ```
     In all cases, uncommitted code will not be used.
 
+!!! note
+
+    You can override the default packaging behavior with the following environment variables:
+
+    - `NEMO_SKILLS_FORCE_PATTERN_PACKAGER=1` — Skip git-based packaging entirely and always use the installed
+      `nemo_skills` package tree (PatternPackager). Useful when you have an editable install and don't want
+      packaging tied to the git state of your current directory.
+    - `NEMO_SKILLS_FORCE_INSTALLED_PACKAGE=1` — When running from a git repo, use the installed `nemo_skills`
+      package instead of the repo's `nemo_skills/` directory. The git repo is still packaged, but `nemo_skills`
+      is picked up from the installed location. Useful when your repo checkout has extra files you don't want
+      uploaded.
+
+    Note that `NEMO_SKILLS_FORCE_INSTALLED_PACKAGE` has no effect when `NEMO_SKILLS_FORCE_PATTERN_PACKAGER`
+    is also set, since the latter skips the git-based packaging path entirely.
+
 
 Finally, it's important to keep in mind that whenever you submit a new experiment, NeMo-Run will create a copy of your
 code package both locally (inside `~/.nemo_run`) and on cluster (inside `ssh_tunnel/job_dir` path in your cluster config).
 
@@ -212,6 +212,38 @@ def stop_shell(self, shell_id):
         proc.terminate()
         proc.join(timeout=2.0)
 
+    def _finish_restart(self, shell_id):
+        """Start a fresh shell and mark it as needing session restore.
+
+        IMPORTANT: Must be called OUTSIDE except handlers. start_shell() uses
+        multiprocessing.Process.start() → os.fork(). A child forked inside an
+        except block inherits sys.exc_info() from the parent, causing Python's
+        implicit exception chaining to attach the parent's exception context to
+        every subsequent exception in the child process.
+        """
+        self.start_shell(shell_id)
+        with self.manager_lock:
+            if shell_id in self.shells:
+                self.shells[shell_id]["restart_pending"] = True
+
+    def _cleanup_shell_resources(self, proc, conn):
+        # Best-effort teardown for the current shell process and pipe.
+        try:
+            proc.terminate()
+        except Exception:
+            try:
+                os.kill(proc.pid, signal.SIGKILL)
+            except Exception:
+                pass
+        try:
+            proc.join(timeout=2.0)
+        except Exception:
+            pass
+        try:
+            conn.close()
+        except Exception:
+            pass
+
     def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="Plain"):
         """
         Execute `code` on shell `shell_id`.
@@ -252,18 +284,35 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
 
         exec_id = time.time_ns()
         with lock:
+            _need_restart = False
             # send execution request
             try:
                 conn.send({"cmd": "exec", "id": exec_id, "code": code, "traceback_verbosity": traceback_verbosity})
             except Exception as exc:
-                return {
+                logging.warning(f"Shell process for {shell_id} failed before execution request was sent, restarting")
+                # clean up the shell process and connection - best-effort
+                self._cleanup_shell_resources(proc, conn)
+
+                # remove the old shell entry
+                with self.manager_lock:
+                    self.shells.pop(shell_id, None)
+
+                _need_restart = True
+                _exc_msg = str(exc)
+                _restart_result = {
                     "status": "error",
-                    "msg": f"send failed: {exc}",
+                    "msg": f"send failed: {_exc_msg}",
                     "shell_was_created": shell_was_created,
+                    "shell_was_restarted": True,
                     "shell_was_recently_restarted": shell_was_recently_restarted,
                 }
 
+            if _need_restart:
+                self._finish_restart(shell_id)
+                return _restart_result
+
             # wait for the result up to `timeout`
+            _need_restart = False
             if conn.poll(timeout):
                 try:
                     result = conn.recv()
@@ -273,23 +322,27 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
                 except EOFError:
                     # Connection closed - shell process died, need to restart
                     logging.warning(f"Shell process for {shell_id} died during execution, restarting")
-                    with self.manager_lock:
-                        self.shells.pop(shell_id, None)
-                    self.start_shell(shell_id)
 
-                    # Mark the new shell as having a restart pending
+                    # clean up the shell process and connection - best-effort
+                    self._cleanup_shell_resources(proc, conn)
+
+                    # remove the old shell entry
                     with self.manager_lock:
-                        if shell_id in self.shells:
-                            self.shells[shell_id]["restart_pending"] = True
+                        self.shells.pop(shell_id, None)
 
-                    return {
+                    _need_restart = True
+                    _restart_result = {
                         "status": "error",
                         "msg": "connection closed",
                         "shell_was_created": shell_was_created,
                         "shell_was_restarted": True,
                         "shell_was_recently_restarted": shell_was_recently_restarted,
                     }
 
+            if _need_restart:
+                self._finish_restart(shell_id)
+                return _restart_result
+
             # no reply yet -> try gentle interrupt (SIGINT)
             try:
                 # Process.send_signal exists on Unix; fallback to os.kill if necessary
@@ -302,6 +355,7 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
                 pass
 
             # wait short grace period for the shell to handle the interrupt
+            _need_restart = False
             if conn.poll(grace):
                 try:
                     result = conn.recv()
@@ -311,48 +365,34 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
                 except EOFError:
                     # Connection closed - shell process died, need to restart
                     logging.warning(f"Shell process for {shell_id} died during interrupt, restarting")
-                    with self.manager_lock:
-                        self.shells.pop(shell_id, None)
-                    self.start_shell(shell_id)
+                    # clean up the shell process and connection - best-effort
+                    self._cleanup_shell_resources(proc, conn)
 
-                    # Mark the new shell as having a restart pending
+                    # remove the old shell entry
                     with self.manager_lock:
-                        if shell_id in self.shells:
-                            self.shells[shell_id]["restart_pending"] = True
+                        self.shells.pop(shell_id, None)
 
-                    return {
+                    _need_restart = True
+                    _restart_result = {
                         "status": "interrupted",
                         "msg": "connection closed after interrupt",
                         "shell_was_created": shell_was_created,
                         "shell_was_restarted": True,
                         "shell_was_recently_restarted": shell_was_recently_restarted,
                     }
 
-            # still stuck -> terminate the shell and restart it (drop memory)
-            try:
-                proc.terminate()
-            except Exception:
-                try:
-                    os.kill(proc.pid, signal.SIGKILL)
-                except Exception:
-                    pass
-            proc.join(timeout=2.0)
+            if _need_restart:
+                self._finish_restart(shell_id)
+                return _restart_result
 
-            # close old connection (best-effort)
-            try:
-                conn.close()
-            except Exception:
-                pass
+            # still stuck -> terminate the shell and restart it (drop memory)
+            # clean up the shell process and connection - best-effort
+            self._cleanup_shell_resources(proc, conn)
 
             # remove and restart a fresh shell for this id
             with self.manager_lock:
                 self.shells.pop(shell_id, None)
-            self.start_shell(shell_id)
-
-            # Mark the new shell as having a restart pending
-            with self.manager_lock:
-                if shell_id in self.shells:
-                    self.shells[shell_id]["restart_pending"] = True
+            self._finish_restart(shell_id)
 
             return {
                 "status": "timeout_killed",
 
@@ -50,4 +50,9 @@
             data = json.loads(line)
             data["question"] = data.pop("prompt")
             data["baseline_answer"] = baseline_answers[data["uid"]]
+            # The upstream lmarena/arena-hard-auto source stores a dataset-version
+            # string ("arena-hard-v0.1") in the category field. v1 has no real
+            # sub-categories, so drop it to let ArenaJudgeTask.fill_prompt fall
+            # through to the default prompt via its `if not category` branch.
+            data.pop("category", None)
             fout.write(json.dumps(data) + "\n")
@@ -20,9 +20,16 @@
 
 
 class PromptConstants:
-    # reference: https://github.com/LiveCodeBench/LiveCodeBench/blob/main/lcb_runner/prompts/code_generation.py#L35C5-L38C1
-    FORMATTING_MESSAGE_WITH_STARTER_CODE = "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
-    FORMATTING_WITHOUT_STARTER_CODE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the c++ program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
+    # reference: https://huggingface.co/nvidia/Nemotron-Cascade-8B/blob/main/evaluation/data/benchmark.py#L1166
+    FORMATTING_WITHOUT_STARTER_CODE = (
+        "Write a C++14 or C++17 code to solve the problem. "
+        "Please place the solution code in the following format:\n"
+        "```cpp\n// Your solution code here\n```"
+    )
+    FORMATTING_MESSAGE_WITH_STARTER_CODE = (
+        "Please place the solution code in C++14 or C++17 language in the following format:\n"
+        "```cpp\n// Your solution code here\n```"
+    )
 
 
 def parse_data(split):
@@ -46,11 +53,17 @@ def parse_data(split):
 def clean_data(dataset, keep_all_columns=False):
     def map_fn(data):
         if data["starter_code"]:
+            data["starter_code"] = (
+                "\n\n"
+                + "Solve the problem starting with the provided function header.\n\nFunction header:\n"
+                + "```\n"
+                + data["starter_code"]
+                + "\n```"
+            )
             data["formatting_message"] = PromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE
-            data["starter_code"] = f"```cpp\n{data['starter_code']}\n```"
         else:
+            data["starter_code"] = ""
             data["formatting_message"] = PromptConstants.FORMATTING_WITHOUT_STARTER_CODE
-            data["starter_code"] = "```cpp\n// YOUR CODE HERE\n```"
 
         data["task_id"] = data["question_id"]
         return data
 
@@ -28,7 +28,11 @@
     ("25q3", "quater_2025_7_9", 144),
 ]
 
-FORMATTING_MESSAGE = "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows. Ensure that when the c++ program runs, it reads the inputs, runs the algorithm and writes output to STDOUT."
+FORMATTING_MESSAGE = (
+    "Write a self-contained solution in C++14 or C++17. "
+    "Include all necessary headers. Please place the solution code in the following format:\n"
+    "```cpp\n// Your solution code here\n```"
+)
 
 
 def download_testcases(local_dir, token):
@@ -65,7 +69,7 @@ def process_problem_splits(output_dir, token):
                     output_record = dict(row)
                     output_record["question_content"] = row["problem_statement"]
                     output_record["formatting_message"] = FORMATTING_MESSAGE
-                    output_record["starter_code"] = "```cpp\n// YOUR CODE HERE\n```"
+                    output_record["starter_code"] = ""
                     output_record["subset_for_metrics"] = row["difficulty"]
                     f.write(json.dumps(output_record) + "\n")
 
 
@@ -211,7 +211,7 @@ async def eval_livecodebench_async(eval_config: LiveCodeBenchEvaluatorConfig):
     """Evaluation running within a sandbox."""
     async with sandbox_context(eval_config.sandbox) as sandbox:
         if not await _install_packages_in_sandbox(sandbox, eval_config):
-            return
+            raise RuntimeError("Failed to install livecodebench packages in sandbox.")
 
         jsonl_file = eval_config.input_file
         LOG.info(f"Processing file: {jsonl_file} in sandbox")
@@ -299,6 +299,10 @@ def eval_livecodebench(cfg):
         raise RuntimeError("The 'pypy3' interpreter requires a running sandbox, but the service was unreachable.")
 
     if sandbox_is_ready:
-        asyncio.run(eval_livecodebench_async(eval_config))
+        try:
+            asyncio.run(eval_livecodebench_async(eval_config))
+        except Exception as e:
+            LOG.warning(f"Sandbox evaluation failed: {e}. Falling back to local evaluation.")
+            eval_livecodebench_without_sandbox(eval_config)
     else:
         eval_livecodebench_without_sandbox(eval_config)
@@ -57,9 +57,18 @@ async def _execute_single_test(args):
             code += line + "\n"
 
     sandbox = get_sandbox(**eval_config.sandbox)
-    output_dict, _ = await sandbox.execute_code(code, timeout=eval_config.timeout, max_output_characters=100000)
-
-    return elem_idx, output_dict
+    session_id = None
+    try:
+        output_dict, session_id = await sandbox.execute_code(
+            code, timeout=eval_config.timeout, max_output_characters=100000
+        )
+        return elem_idx, output_dict
+    finally:
+        try:
+            if session_id is not None:
+                await sandbox.delete_session(str(session_id))
+        finally:
+            await sandbox.close()
 
 
 def test_code(eval_config, scicode_data):
 
@@ -84,7 +84,7 @@ def setup_prompt(self):
         if self.cfg.prompt_format == "openai":
             return None
 
-        # Load the default prompt (used for most categories including hard_prompt, arena-hard-v0.1, etc.)
+        # Load the default prompt (used for any category not explicitly mapped below, including hard_prompt)
         default_prompt = get_prompt(
             prompt_config=self.cfg.prompt_config,
             tokenizer=self.tokenizer,
 
@@ -509,6 +509,7 @@ async def _execute_container_command(self, data_point, command, expected_file_pa
         apptainer_cmd = (
             f"apptainer exec --writable-tmpfs --cleanenv --no-mount home,tmp,bind-paths "
             f"--mount type=bind,src=/nemo_run/code,dst=/nemo_run/code "
+            f"--mount type=bind,src={Path(self.cfg.input_file).parent},dst=/input_mount,ro "
             f"--mount type=bind,src=/root,dst=/root_mount,ro "
             f"--mount type=bind,src={self.output_dir},dst=/trajectories_mount "
             f"{extra_apptainer_args} "
@@ -818,7 +819,7 @@ async def _run_openhands(self, data_point, api_base):
         openhands_cmd = (
             # make sure /workspace isn't mounted as a safety precaution
             # (mounting it in the nemo-skills cluster config is ok, just not inside of apptainer specifically)
-            "if [ -d /workspace ]; then "
+            "if awk '{print $2}' /proc/mounts | grep -qE '^/workspace(/|$)'; then "
             "    echo 'Exiting because /workspace is mounted.' && "
             "    echo 'Please make sure /workspace is not mounted inside of Apptainer before running OpenHands.' && "
             "    echo 'This is because OpenHands DELETES EVERYTHING in the /workspace folder if it exists.' && "
@@ -838,7 +839,7 @@ async def _run_openhands(self, data_point, api_base):
             "source /root/OpenHands/.venv/bin/activate && "
             # copy dataset
             f"mkdir {data_dir} && "
-            f"cp {self.cfg.input_file} {data_dir}/dataset.jsonl && "
+            f"cp /input_mount/{Path(self.cfg.input_file).name} {data_dir}/dataset.jsonl && "
             # set up config files
             f"echo {shlex.quote(config_str)} >config.toml && "
             f"echo \"selected_ids = ['{data_point['instance_id']}']\" >evaluation/benchmarks/{benchmark_name}/config.toml && "
@@ -969,7 +970,7 @@ async def _process_single_datapoint_impl(self, data_point, data):
                     "cd /root/SWE-bench && "
                     # run the evaluation with streaming output
                     f"/root/SWE-bench/venv/bin/python -m swebench.harness.run_local_evaluation "
-                    f"    --raw_sample_path {self.cfg.input_file} "
+                    f"    --raw_sample_path /input_mount/{Path(self.cfg.input_file).name} "
                     f"    --patch_path {pred_mounted_path} "
                     f"    --output_dir eval-outputs "
                     f"    --scripts_dir /root/SWE-bench/run_scripts && "
@@ -987,7 +988,7 @@ async def _process_single_datapoint_impl(self, data_point, data):
                     f"    --instance_ids {data_point['instance_id']} "
                     f"    --run_id eval-outputs "
                     f"    --timeout {self.cfg.swebench_tests_timeout} "
-                    f"    --dataset_name {self.cfg.input_file} && "
+                    f"    --dataset_name /input_mount/{Path(self.cfg.input_file).name} && "
                     f"cp -r logs/run_evaluation/eval-outputs /trajectories_mount/"
                 )
 
 
@@ -528,4 +528,5 @@ def return_empty_generation_with_error(detailed_error: str, error_reason: str =
         "num_generated_tokens": 0,
         "error": error_reason,
         "detailed_error": detailed_error,
+        "finish_reason": "error",
     }
Original file line number	Diff line number	Diff line change
`@@ -528,4 +528,5 @@ def return_empty_generation_with_error(detailed_error: str, error_reason: str =`
`528`	`528`	`"num_generated_tokens": 0,`
`529`	`529`	`"error": error_reason,`
`530`	`530`	`"detailed_error": detailed_error,`
	`531`	`+ "finish_reason": "error",`
`531`	`532`	`}`