Skip to content

Commit 9e406f3

Browse files
committed
Update action
1 parent 5a9d113 commit 9e406f3

2 files changed

Lines changed: 78 additions & 9 deletions

File tree

.github/actions/collect_info/s_getInfo.py

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import json
2+
import subprocess
13
import numpy as np
24
import pandas as pd
35
from concurrent.futures import ThreadPoolExecutor, TimeoutError
@@ -57,6 +59,20 @@ def append_to_txt(file_path, value):
5759
except Exception as e:
5860
print(f"Error appending to {file_path}: {e}")
5961

62+
63+
def _to_json_serializable(obj):
    """Recursively convert numpy values in ``obj`` to native Python for JSON.

    Containers are walked recursively so that numpy scalars nested at any
    depth are converted, not only those stored directly as dict values.

    Args:
        obj: Any value; typically the ``info_single`` dict of one problem.

    Returns:
        A structure of the same shape in which numpy integers, floats and
        booleans are replaced by ``int``, ``float`` and ``bool``, and numpy
        arrays are replaced by (nested) lists. Lists stay lists and tuples
        stay tuples. Any other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {k: _to_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        items = [_to_json_serializable(v) for v in obj]
        return items if isinstance(obj, list) else tuple(items)
    if isinstance(obj, np.ndarray):
        # tolist() yields native scalars for numeric dtypes, but object
        # arrays may still hold numpy scalars, so convert the result again.
        return _to_json_serializable(obj.tolist())
    if isinstance(obj, np.bool_):
        # np.bool_ is not a subclass of bool and is rejected by json.dump.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj
74+
75+
6076
# Find problems that are parametric
6177
filename = os.path.join(cwd, 'list_of_parametric_problems_with_parameters_python.txt')
6278
# Scan each line, each line only has one problem name, which ends before the first comma
@@ -397,10 +413,26 @@ def process_arg(problem_name, arg, fixed_argins):
397413
return info_single
398414

399415
if __name__ == "__main__":
416+
# --single mode: run one problem in subprocess isolation. Crashes (segfault/OOM) only kill
417+
# the child; parent excludes the problem and continues. Output written to result_single.json.
418+
if len(sys.argv) >= 2 and sys.argv[1] == "--single":
419+
try:
420+
name = sys.argv[2]
421+
args = json.loads(sys.argv[3]) if len(sys.argv) > 3 else None
422+
info = get_problem_info(name, known_feasibility, args)
423+
out = os.path.join(saving_path, "result_single.json")
424+
with open(out, "w") as f:
425+
json.dump(_to_json_serializable(info), f, indent=None)
426+
sys.exit(0)
427+
except Exception as e:
428+
print(f"[--single] Error processing {sys.argv[2] if len(sys.argv) > 2 else '?'}: {e}")
429+
sys.exit(1)
430+
400431
csv_file = os.path.join(saving_path, 'probinfo_python.csv')
401432
csv_file_temp = os.path.join(saving_path, 'probinfo_python_temp.csv')
402433
current_prob_file = os.path.join(saving_path, 'current_problem.txt')
403434
exclude_file = os.path.join(saving_path, 'exclude_python.txt')
435+
result_single_path = os.path.join(saving_path, 'result_single.json')
404436

405437
# 1. Crash Detection: If current_problem.txt exists, the previous run crashed on that problem.
406438
if os.path.exists(current_prob_file):
@@ -450,16 +482,45 @@ def process_arg(problem_name, arg, fixed_argins):
450482
# Write current problem name before processing it to detect crashes
451483
with open(current_prob_file, 'w') as f:
452484
f.write(name)
453-
454-
info = get_problem_info(name, known_feasibility, args)
455-
456-
# Original logic to filter out 'unknown' values
485+
486+
# Run each problem in a subprocess so that segfault/OOM during load only kills the
487+
# child; we exclude the problem and continue. No need to restart the whole script.
488+
args_json = json.dumps(args) if args is not None else "null"
489+
cmd = [sys.executable, os.path.abspath(__file__), "--single", name, args_json]
490+
try:
491+
ret = subprocess.run(cmd, cwd=cwd, timeout=None)
492+
except subprocess.TimeoutExpired:
493+
ret = subprocess.CompletedProcess(cmd, returncode=-1)
494+
495+
if ret.returncode != 0:
496+
print(f"Problem {name} crashed or failed (exit {ret.returncode}). Excluding and continuing.")
497+
append_to_txt(exclude_file, name)
498+
if name not in problem_exclude:
499+
problem_exclude.append(name)
500+
if os.path.exists(current_prob_file):
501+
os.remove(current_prob_file)
502+
if os.path.exists(result_single_path):
503+
try:
504+
os.remove(result_single_path)
505+
except Exception:
506+
pass
507+
sys.stdout.flush()
508+
sys.stderr.flush()
509+
continue
510+
511+
with open(result_single_path, "r") as f:
512+
info = json.load(f)
513+
try:
514+
os.remove(result_single_path)
515+
except Exception:
516+
pass
517+
457518
def has_unknown_values(info_dict):
    """Return True if any value in ``info_dict`` reads as 'unknown'.

    Each value is converted to a string, stripped of surrounding
    whitespace and compared case-insensitively against 'unknown'.
    """
    return any(
        str(entry).strip().lower() == 'unknown'
        for entry in info_dict.values()
    )
462-
523+
463524
if not has_unknown_values(info):
464525
df_single = pd.DataFrame([info])
465526
if not os.path.exists(csv_file_temp):
@@ -468,11 +529,10 @@ def has_unknown_values(info_dict):
468529
df_single.to_csv(csv_file_temp, mode='a', header=False, index=False, na_rep='nan')
469530
else:
470531
print(f"Filtered out problem {name} due to 'unknown' values.")
471-
472-
# Clear crash detection file after successful processing
532+
473533
if os.path.exists(current_prob_file):
474534
os.remove(current_prob_file)
475-
535+
476536
sys.stdout.flush()
477537
sys.stderr.flush()
478538

.github/workflows/collect_info.yml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
name: Collect Info
22

3+
# Prevent new runs from canceling in-progress runs (e.g. workflow_run re-trigger).
4+
# When Sync completes again, we avoid "The operation was canceled" on the previous collect run.
5+
concurrency:
6+
group: collect-info
7+
cancel-in-progress: false
8+
39
# This workflow does the following:
410
# 1. Checkout the repository
511
# 2. Checkout the optiprofiler repository
@@ -89,6 +95,10 @@ jobs:
8995
pip install -e .
9096
cd ..
9197
98+
# Retry loop only handles Python script crashes (non-zero exit). It does NOT handle
99+
# "The operation was canceled" — that is job-level cancellation (timeout/concurrency);
100+
# the whole step is killed, so no retry runs. Use concurrency.cancel-in-progress: false
101+
# to avoid new workflow runs canceling this one. Job timeout is 6h (timeout-minutes).
92102
- name: Run Python script
93103
if: matrix.language == 'python'
94104
run: |
@@ -103,7 +113,6 @@ jobs:
103113
fi
104114
echo "Script crashed with exit code $EXIT_CODE. Restarting ($((COUNT+1))/$MAX_RETRIES)..."
105115
COUNT=$((COUNT+1))
106-
# Small sleep to avoid rapid fire restarts
107116
sleep 2
108117
done
109118
if [ $COUNT -eq $MAX_RETRIES ]; then

0 commit comments

Comments
 (0)