Skip to content

Commit ddfa452

Browse files
committed
diagnose multiprocessing stalling issue
1 parent a4bc9d0 commit ddfa452

File tree

7 files changed

+333
-16
lines changed

7 files changed

+333
-16
lines changed
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
name: Debug multiprocessing spawn
2+
3+
# Temporary diagnostic workflow to investigate multiprocessing spawn hang on macOS/Windows.
4+
# Remove this workflow once the root cause is identified and fixed.
5+
6+
env:
7+
RUSTFLAGS: -C debuginfo=0
8+
RUST_BACKTRACE: 1
9+
PYTHONUTF8: 1
10+
11+
on:
12+
push:
13+
branches: [ "docs" ]
14+
workflow_dispatch:
15+
16+
concurrency:
17+
group: ${{ github.workflow }}-${{ github.ref }}
18+
cancel-in-progress: true
19+
20+
defaults:
21+
run:
22+
shell: bash
23+
24+
jobs:
25+
debug-multiprocessing:
26+
runs-on: ${{ matrix.os }}
27+
timeout-minutes: 30
28+
strategy:
29+
fail-fast: false
30+
matrix:
31+
os: [macOS-latest, windows-latest]
32+
python-version: ["3.10", "3.14"]
33+
34+
steps:
35+
- uses: actions/checkout@v4
36+
37+
- name: Set up Python ${{ matrix.python-version }}
38+
uses: actions/setup-python@v5
39+
with:
40+
python-version: ${{ matrix.python-version }}
41+
42+
- name: Set up Visual Studio environment on Windows
43+
if: runner.os == 'Windows'
44+
uses: ilammy/msvc-dev-cmd@v1
45+
with:
46+
arch: x64
47+
48+
- name: Install the latest version of uv
49+
uses: astral-sh/setup-uv@v4
50+
with:
51+
enable-cache: true
52+
53+
- name: Set up Rust
54+
run: rustup show
55+
56+
- name: Install just
57+
uses: extractions/setup-just@v2
58+
59+
- name: Cache Rust
60+
uses: Swatinem/rust-cache@v2
61+
with:
62+
workspaces: python/pecos-rslib
63+
64+
- name: Install LLVM 14.0.6 using pecos-llvm (Unix)
65+
if: runner.os != 'Windows'
66+
run: |
67+
echo "Installing LLVM using pecos..."
68+
cargo run -p pecos --features cli --release -- llvm install
69+
70+
echo "Setting LLVM environment variables..."
71+
export PECOS_LLVM=$(cargo run -p pecos --features cli --release -- llvm find 2>/dev/null)
72+
export LLVM_SYS_140_PREFIX="$PECOS_LLVM"
73+
74+
echo "PECOS_LLVM=$PECOS_LLVM" >> $GITHUB_ENV
75+
echo "LLVM_SYS_140_PREFIX=$LLVM_SYS_140_PREFIX" >> $GITHUB_ENV
76+
77+
echo "Verifying LLVM installation..."
78+
cargo run -p pecos --features cli --release -- llvm check
79+
80+
- name: Install LLVM 14.0.6 using pecos-llvm (Windows)
81+
if: runner.os == 'Windows'
82+
shell: pwsh
83+
run: |
84+
Write-Host "Installing LLVM using pecos..."
85+
cargo run -p pecos --features cli --release -- llvm install
86+
87+
Write-Host "Setting LLVM environment variables..."
88+
$env:PECOS_LLVM = (cargo run -p pecos --features cli --release -- llvm find 2>$null)
89+
$env:LLVM_SYS_140_PREFIX = $env:PECOS_LLVM
90+
91+
"PECOS_LLVM=$env:PECOS_LLVM" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
92+
"LLVM_SYS_140_PREFIX=$env:LLVM_SYS_140_PREFIX" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
93+
94+
Write-Host "Verifying LLVM installation..."
95+
cargo run -p pecos --features cli --release -- llvm check
96+
97+
- name: Configure MSVC linker (Windows)
98+
if: runner.os == 'Windows'
99+
shell: pwsh
100+
run: |
101+
# Find MSVC link.exe and create cargo config
102+
$vsWhere = "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe"
103+
$vsPath = & $vsWhere -latest -property installationPath
104+
$linkPath = Get-ChildItem -Path "$vsPath\VC\Tools\MSVC" -Recurse -Filter "link.exe" |
105+
Where-Object { $_.FullName -like "*\bin\Hostx64\x64\*" } |
106+
Select-Object -First 1 -ExpandProperty FullName
107+
108+
if ($linkPath) {
109+
New-Item -ItemType Directory -Force -Path .cargo | Out-Null
110+
$escapedPath = $linkPath.Replace('\', '/')
111+
$escapedLLVMPath = $env:LLVM_SYS_140_PREFIX.Replace('\', '/')
112+
$configContent = "[target.x86_64-pc-windows-msvc]`nlinker = `"$escapedPath`"`n`n[env]`nLLVM_SYS_140_PREFIX = `"$escapedLLVMPath`""
113+
$configContent | Out-File -FilePath ".cargo\config.toml" -Encoding UTF8
114+
New-Item -ItemType Directory -Force -Path "python\pecos-rslib\.cargo" | Out-Null
115+
$configContent | Out-File -FilePath "python\pecos-rslib\.cargo\config.toml" -Encoding UTF8
116+
New-Item -ItemType Directory -Force -Path "python\pecos-rslib\rust\.cargo" | Out-Null
117+
$configContent | Out-File -FilePath "python\pecos-rslib\rust\.cargo\config.toml" -Encoding UTF8
118+
[System.Environment]::SetEnvironmentVariable("LLVM_SYS_140_PREFIX", $env:LLVM_SYS_140_PREFIX, "User")
119+
[System.Environment]::SetEnvironmentVariable("LLVM_SYS_140_PREFIX", $env:LLVM_SYS_140_PREFIX, "Process")
120+
} else {
121+
Write-Error "Could not find MSVC link.exe"
122+
exit 1
123+
}
124+
125+
- name: Configure macOS environment
126+
if: runner.os == 'macOS'
127+
run: |
128+
unset LIBRARY_PATH
129+
unset LD_LIBRARY_PATH
130+
unset DYLD_LIBRARY_PATH
131+
unset DYLD_FALLBACK_LIBRARY_PATH
132+
unset PKG_CONFIG_PATH
133+
export LIBRARY_PATH=/usr/lib
134+
echo "LIBRARY_PATH=/usr/lib" >> $GITHUB_ENV
135+
136+
- name: Build PECOS (debug)
137+
run: just build-debug
138+
139+
- name: Run diagnostic script
140+
run: uv run python scripts/debug_multiprocessing_spawn.py
141+
142+
- name: Run multiprocessing test with timeout
143+
run: |
144+
uv run pytest python/quantum-pecos/tests/pecos/integration/test_pickle_multiprocessing.py -v --timeout=120 -x

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ dev = [
4040
test = [ # pinning testing environment
4141
"pytest==8.3.3", # 8.3.4 seems to be causing errors
4242
"pytest-cov==6.0.0",
43+
"pytest-timeout>=2.3.1",
4344
"hypothesis==6.122.3",
4445
]
4546
numpy-compat = [ # NumPy/SciPy compatibility tests - verify compatibility with scientific Python stack

python/quantum-pecos/tests/pecos/integration/test_pickle_multiprocessing.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,10 @@ def _pauliprop_worker(sim_bytes: bytes) -> int:
4747
# Use fork context on Linux (fast, avoids spawn serialization issues with test files).
4848
# On macOS/Windows where fork is unavailable or unsafe, use spawn.
4949
_MP_CONTEXT = "fork" if sys.platform == "linux" else "spawn"
50+
_POOL_TIMEOUT = 60 # seconds -- fail fast instead of hanging CI
5051

5152

53+
@pytest.mark.timeout(120)
5254
class TestMultiprocessingStateVec:
5355
"""Tests for multiprocessing StateVec simulators via pickle."""
5456

@@ -59,10 +61,13 @@ def test_pool_map(self) -> None:
5961
sim_bytes = pickle.dumps(sim)
6062
ctx = multiprocessing.get_context(_MP_CONTEXT)
6163
with ctx.Pool(processes=2) as pool:
62-
results = pool.map(_statevec_worker, [sim_bytes, sim_bytes])
64+
results = pool.map_async(_statevec_worker, [sim_bytes, sim_bytes]).get(
65+
timeout=_POOL_TIMEOUT,
66+
)
6367
assert results == [3, 3]
6468

6569

70+
@pytest.mark.timeout(120)
6671
class TestMultiprocessingSparseSim:
6772
"""Tests for multiprocessing SparseSim simulators via pickle."""
6873

@@ -74,10 +79,13 @@ def test_pool_map(self) -> None:
7479
sim_bytes = pickle.dumps(sim)
7580
ctx = multiprocessing.get_context(_MP_CONTEXT)
7681
with ctx.Pool(processes=2) as pool:
77-
results = pool.map(_sparsesim_worker, [sim_bytes, sim_bytes])
82+
results = pool.map_async(_sparsesim_worker, [sim_bytes, sim_bytes]).get(
83+
timeout=_POOL_TIMEOUT,
84+
)
7885
assert results == [4, 4]
7986

8087

88+
@pytest.mark.timeout(120)
8189
class TestMultiprocessingCoinToss:
8290
"""Tests for multiprocessing CoinToss simulators via pickle."""
8391

@@ -87,10 +95,13 @@ def test_pool_map(self) -> None:
8795
sim_bytes = pickle.dumps(sim)
8896
ctx = multiprocessing.get_context(_MP_CONTEXT)
8997
with ctx.Pool(processes=2) as pool:
90-
results = pool.map(_cointoss_worker, [sim_bytes, sim_bytes])
98+
results = pool.map_async(_cointoss_worker, [sim_bytes, sim_bytes]).get(
99+
timeout=_POOL_TIMEOUT,
100+
)
91101
assert results == [5, 5]
92102

93103

104+
@pytest.mark.timeout(120)
94105
class TestMultiprocessingPauliProp:
95106
"""Tests for multiprocessing PauliProp simulators via pickle."""
96107

@@ -101,6 +112,8 @@ def test_pool_map(self) -> None:
101112
sim_bytes = pickle.dumps(sim)
102113
ctx = multiprocessing.get_context(_MP_CONTEXT)
103114
with ctx.Pool(processes=2) as pool:
104-
results = pool.map(_pauliprop_worker, [sim_bytes, sim_bytes])
115+
results = pool.map_async(_pauliprop_worker, [sim_bytes, sim_bytes]).get(
116+
timeout=_POOL_TIMEOUT,
117+
)
105118
# After H on qubit 0: X->Z, so weight should still be 1
106119
assert all(r == 1 for r in results)

ruff.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ ignore = [
144144
"python/pecos-rslib/tests/*.py" = ["INP001"] # Test files don't need __init__.py
145145

146146
# Scripts and examples - not packages
147-
"scripts/**/*.py" = ["INP001", "S603"] # Script files don't need __init__.py and may run subprocess calls for testing
147+
"scripts/**/*.py" = ["INP001", "S603", "PLC0415", "S301", "BLE001"] # Script files: no __init__.py, subprocess calls, lazy imports, pickle, broad except
148148
"examples/**/*.py" = ["INP001", "BLE001"] # Example files don't need __init__.py and can use broad exception handling
149149

150150

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2026 The PECOS Developers
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5+
# in compliance with the License. You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under the License
10+
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11+
# or implied. See the License for the specific language governing permissions and limitations under
12+
# the License.
13+
14+
"""Diagnostic script for multiprocessing spawn hangs on macOS/Windows.
15+
16+
This script tests multiprocessing with spawn context in multiple stages,
17+
each with a timeout, to pinpoint exactly where the hang occurs.
18+
19+
Run with: uv run python scripts/debug_multiprocessing_spawn.py
20+
"""
21+
22+
from __future__ import annotations
23+
24+
import multiprocessing
25+
import pickle
26+
import sys
27+
import time
28+
29+
from pecos_rslib import StateVec
30+
31+
TIMEOUT = 60 # seconds per stage
32+
33+
34+
def _worker_basic(_: object) -> str:
    """Return the literal ``"ok"`` regardless of the argument.

    The body performs no imports and no other work; the single positional
    argument exists only so the function can be used with ``Pool.map_async``.
    """
    return "ok"
37+
38+
39+
def _worker_import(_: object) -> str:
    """Import ``pecos_rslib`` from inside the worker and report success.

    The argument is ignored. ``"import_ok"`` is returned once the import
    statement has completed.
    """
    # Imported here, at call time, so the import happens in the process that runs the task.
    import pecos_rslib  # noqa: F401

    return "import_ok"
44+
45+
46+
def _worker_pickle_statevec(data: bytes) -> str:
    """Unpickle ``data`` and report the ``num_qubits`` of the resulting object.

    Args:
        data: Pickle bytes; in this script they are produced by the parent
            process from a ``StateVec``.

    Returns:
        The string ``"unpickled_statevec_qubits=<n>"`` where ``<n>`` is the
        unpickled object's ``num_qubits`` attribute.
    """
    qubit_count = pickle.loads(data).num_qubits
    return f"unpickled_statevec_qubits={qubit_count}"
50+
51+
52+
def _worker_full_pattern(data: bytes) -> int:
    """Unpickle a simulator, call ``run_1q_gate("H", 0)`` on it, and return its qubit count.

    Args:
        data: Pickle bytes of an object exposing ``run_1q_gate`` and ``num_qubits``.

    Returns:
        The ``num_qubits`` attribute of the unpickled object, read after the gate call.
    """
    simulator = pickle.loads(data)
    simulator.run_1q_gate("H", 0)
    return simulator.num_qubits
57+
58+
59+
def _log(msg: str) -> None:
    """Write ``msg`` to stderr, prefixed with the local ``[HH:MM:SS]`` time, and flush."""
    stamp = time.strftime("%H:%M:%S")
    sys.stderr.write(f"[{stamp}] {msg}\n")
    # Flush after every line so output already written is not lost if the process later hangs.
    sys.stderr.flush()
62+
63+
64+
def _run_stage(
    name: str,
    worker: object,
    args: list,
    ctx: multiprocessing.context.BaseContext,
) -> bool:
    """Run a single diagnostic stage in a two-process pool, bounded by ``TIMEOUT``.

    Args:
        name: Label for the stage; used only in log lines.
        worker: Module-level callable applied to each element of ``args``
            inside the pool's worker processes.
        args: One input per task, handed to ``Pool.map_async``.
        ctx: Multiprocessing context that supplies the ``Pool``.

    Returns:
        True if all results arrived within ``TIMEOUT`` seconds. False if the
        wait timed out, or if creating the pool or running a worker raised
        any ``Exception``.
    """
    _log(f"--- Stage: {name} ---")
    _log(f"  Creating Pool(processes=2) with context={ctx.get_start_method()!r}")

    try:
        # Leaving the ``with`` block terminates the pool, so workers that are
        # still stuck after a timeout do not outlive the stage.
        with ctx.Pool(processes=2) as pool:
            _log("  Pool created. Submitting work via map_async...")
            async_result = pool.map_async(worker, args)
            _log(f"  Work submitted. Waiting up to {TIMEOUT}s for results...")
            results = async_result.get(timeout=TIMEOUT)
            _log(f"  Results: {results}")
            _log(f"  Stage '{name}' PASSED")
            return True
    except multiprocessing.TimeoutError:
        _log(f"  TIMEOUT after {TIMEOUT}s -- stage '{name}' HUNG")
        return False
    except Exception as exc:
        # ``AsyncResult.get`` re-raises whatever the worker raised, which can be
        # any exception type (AttributeError, TypeError, ...). Catch broadly so
        # the stage is reported as failed and the caller can exit with the
        # stage's code instead of dying with a traceback. The type name is
        # logged because ``str(exc)`` can be empty.
        _log(f"  EXCEPTION in stage '{name}': {type(exc).__name__}: {exc}")
        return False
92+
93+
94+
def _main() -> None:
    """Run the diagnostic stages in order, stopping at the first failure.

    Exits the process with status 0 when every stage passes; otherwise with
    the number of the first failing stage: 1 (basic spawn), 2 (import of
    ``pecos_rslib`` in the worker), 3 (unpickling a ``StateVec`` in the
    worker) or 4 (unpickle followed by a gate call).
    """
    _log(f"Platform: {sys.platform}")
    _log(f"Python: {sys.version}")
    _log(f"Executable: {sys.executable}")

    method = "fork" if sys.platform == "linux" else "spawn"
    _log(f"Multiprocessing start method: {method}")
    ctx = multiprocessing.get_context(method)

    # Stage 1: Basic spawn -- no imports in worker.
    # NOTE: with the "spawn" start method every child re-imports this script
    # (as ``__mp_main__``) before it runs any task, which executes the
    # module-level ``from pecos_rslib import StateVec``. A hang in that import
    # therefore surfaces here rather than in stage 2, so a stage 1 failure does
    # not by itself prove that multiprocessing is at fault.
    if not _run_stage("basic_spawn", _worker_basic, [None, None], ctx):
        _log("FAILED at basic spawn -- multiprocessing itself is broken")
        if method == "spawn":
            _log(
                "  (or a module-level import of this script hangs when the "
                "spawned child re-imports it)",
            )
        sys.exit(1)

    # Stage 2: Import pecos_rslib in worker
    if not _run_stage("import_pecos_rslib", _worker_import, [None, None], ctx):
        _log("FAILED at import -- pecos_rslib import hangs in spawned child")
        sys.exit(2)

    # Stage 3: Pickle/unpickle StateVec in worker
    _log("Preparing StateVec for stage 3...")
    sim = StateVec(3, seed=42)
    sim.run_1q_gate("H", 0)
    sim_bytes = pickle.dumps(sim)
    _log(f"  Pickled StateVec: {len(sim_bytes)} bytes")

    if not _run_stage(
        "pickle_statevec",
        _worker_pickle_statevec,
        [sim_bytes, sim_bytes],
        ctx,
    ):
        _log("FAILED at pickle -- StateVec unpickling hangs in spawned child")
        sys.exit(3)

    # Stage 4: Full test pattern (unpickle + operate); reuses the stage 3 payload.
    if not _run_stage(
        "full_pattern",
        _worker_full_pattern,
        [sim_bytes, sim_bytes],
        ctx,
    ):
        _log("FAILED at full pattern -- operation on unpickled StateVec hangs")
        sys.exit(4)

    _log("ALL STAGES PASSED")
    sys.exit(0)
142+
143+
144+
if __name__ == "__main__":
145+
_main()

0 commit comments

Comments
 (0)