From 2573d470bab7a9c8a56230400bd74d15882f46db Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 9 Sep 2025 23:23:01 -0700 Subject: [PATCH 01/14] Acquire lock in separate thread --- Cargo.lock | 4 ++-- src/python_spy.rs | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 57775988..d1becc1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,9 +713,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.159" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libloading" diff --git a/src/python_spy.rs b/src/python_spy.rs index 14ea63ae..2d797178 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -214,10 +214,41 @@ impl PythonSpy { // Lock the process if appropriate. Note we have to lock AFTER getting the thread // activity status from the OS (otherwise each thread would report being inactive always). + // // This has the potential for race conditions (in that the thread activity could change - // between getting the status and locking the thread, but seems unavoidable right now + // between getting the status and locking the thread), in which case self.process.lock() + // will hang. This is the reason we do this in a separate thread; after a small timeout, + // we just silently avoid taking any samples on this process. + // + // The approach here follows https://stackoverflow.com/a/36182336/8100451, but with + // recv_timeout. let _lock = if self.config.blocking == LockingStrategy::Lock { - Some(self.process.lock().context("Failed to suspend process")?) + let (tx, rx) = std::sync::mpsc::channel(); + + // Clone the process so it can be moved to the child thread. remoteprocess doesn't + // derive the clone trait, so we do it by hand here. + let process = remoteprocess::Process::new(self.process.pid) + .unwrap_or_else(|_| { + unreachable!( + "Process::new doesn't do anything with the PID upon initialization" + ) + }); + + let join_handle = std::thread::spawn(move || { + // This can fail if the receiver has timed out; silently ignore this, because we + // don't care (the worker thread has already exceeded the timeout) + let _ = tx.send(process.lock()); + }); + + Some( + rx.recv_timeout(std::time::Duration::from_millis(5000)) + .context(format!("Timeout acquiring lock on process {}", self.process.pid)) + .inspect_err(|_| { + drop(join_handle); + drop(rx); + })? + .context(format!("Failed to suspend process {}", self.process.pid))? + ) } else { None }; From f7d8518eb9c7beae6c7e71f659d529f653bf48bc Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 9 Sep 2025 23:38:36 -0700 Subject: [PATCH 02/14] Make the lock timeout configurable --- src/config.rs | 3 +++ src/python_spy.rs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index f861d3b1..11e38177 100644 --- a/src/config.rs +++ b/src/config.rs @@ -59,6 +59,8 @@ pub struct Config { pub refresh_seconds: f64, #[doc(hidden)] pub core_filename: Option, + #[doc(hidden)] + pub lock_timeout_ms: u64, } #[allow(non_camel_case_types)] @@ -139,6 +141,7 @@ impl Default for Config { lineno: LineNo::LastInstruction, refresh_seconds: 1.0, core_filename: None, + lock_timeout_ms: 1000, } } } diff --git a/src/python_spy.rs b/src/python_spy.rs index 2d797178..cf520632 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -241,7 +241,7 @@ impl PythonSpy { }); Some( - rx.recv_timeout(std::time::Duration::from_millis(5000)) + rx.recv_timeout(std::time::Duration::from_millis(self.config.lock_timeout_ms)) .context(format!("Timeout acquiring lock on process {}", self.process.pid)) .inspect_err(|_| { drop(join_handle); From 4ac8daa9369f8867faae4f6681d9428d2969b14e Mon Sep 17 00:00:00 2001 From: pdmurray Date: Wed, 10 Sep 2025 06:35:37 -0700 Subject: [PATCH 03/14] Update readme with note about locking strategy; run lint --- README.md | 25 +++++++++++++------------ src/python_spy.rs | 28 +++++++++++++++------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index cfff6a3a..b850b4ca 100755 --- a/README.md +++ b/README.md @@ -23,10 +23,10 @@ You can also download prebuilt binaries from the [GitHub Releases Page](https://github.com/benfred/py-spy/releases). If you're a Rust user, py-spy can also be installed with: ```cargo install py-spy```. Note this -builds py-spy from source and requires `libunwind` on Linux and Window, e.g., +builds py-spy from source and requires `libunwind` on Linux and Window, e.g., `apt install libunwind-dev`. -On macOS, [py-spy is in Homebrew](https://formulae.brew.sh/formula/py-spy#default) and +On macOS, [py-spy is in Homebrew](https://formulae.brew.sh/formula/py-spy#default) and can be installed with ```brew install py-spy```. On Arch Linux, [py-spy is in AUR](https://aur.archlinux.org/packages/py-spy/) and can be @@ -181,23 +181,24 @@ There are some limitations with this approach though that may cause idle threads marked as active. First off, we have to get this thread activity information before pausing the program, because getting this from a paused program will cause it to always return that this is idle. This means there is a potential race condition, where we get the thread activity and -then the thread is in a different state when we get the stack trace. Querying the OS for thread -activity also isn't implemented yet for FreeBSD and i686/ARM processors on Linux. On Windows, -calls that are blocked on IO also won't be marked as idle yet, for instance when reading input -from stdin. Finally, on some Linux calls the ptrace attach that we are using may cause idle threads -to wake up momentarily, causing false positives when reading from procfs. For these reasons, -we also have a heuristic fallback that marks known certain known calls in -python as being idle. +then the thread is in a different state when we get the stack trace. In cases where the process +exits before the program can be paused, the call to pause the program will time out, and py-spy +will gracefully exit. Querying the OS for thread activity also isn't implemented yet for +FreeBSD and i686/ARM processors on Linux. On Windows, calls that are blocked on IO also won't +be marked as idle yet, for instance when reading input from stdin. Finally, on some Linux calls +the ptrace attach that we are using may cause idle threads to wake up momentarily, causing +false positives when reading from procfs. For these reasons, we also have a heuristic fallback +that marks known certain known calls in python as being idle. You can disable this functionality by setting the ```--idle``` flag, which -will include frames that py-spy considers idle. +will include frames that py-spy considers idle. ### How does GIL detection work? We get GIL activity by looking at the threadid value pointed to by the ```_PyThreadState_Current``` symbol for Python 3.6 and earlier and by figuring out the equivalent from the ```_PyRuntime``` struct in Python 3.7 and later. These symbols might not be included in your python distribution, which will -cause resolving which thread holds on to the GIL to fail. Current GIL usage is also shown in the +cause resolving which thread holds on to the GIL to fail. Current GIL usage is also shown in the ```top``` view as %GIL. Passing the ```--gil``` flag will only include traces for threads that are holding on to the @@ -233,7 +234,7 @@ your_service: Note that you'll need to restart the docker container in order for this setting to take effect. You can also use py-spy from the Host OS to profile a running process running inside the docker -container. +container. ### How do I run py-spy in Kubernetes? diff --git a/src/python_spy.rs b/src/python_spy.rs index cf520632..ac88cecd 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -227,12 +227,9 @@ impl PythonSpy { // Clone the process so it can be moved to the child thread. remoteprocess doesn't // derive the clone trait, so we do it by hand here. - let process = remoteprocess::Process::new(self.process.pid) - .unwrap_or_else(|_| { - unreachable!( - "Process::new doesn't do anything with the PID upon initialization" - ) - }); + let process = remoteprocess::Process::new(self.process.pid).unwrap_or_else(|_| { + unreachable!("Process::new doesn't do anything with the PID upon initialization") + }); let join_handle = std::thread::spawn(move || { // This can fail if the receiver has timed out; silently ignore this, because we @@ -241,13 +238,18 @@ impl PythonSpy { }); Some( - rx.recv_timeout(std::time::Duration::from_millis(self.config.lock_timeout_ms)) - .context(format!("Timeout acquiring lock on process {}", self.process.pid)) - .inspect_err(|_| { - drop(join_handle); - drop(rx); - })? - .context(format!("Failed to suspend process {}", self.process.pid))? + rx.recv_timeout(std::time::Duration::from_millis( + self.config.lock_timeout_ms, + )) + .context(format!( + "Timeout acquiring lock on process {}", + self.process.pid + )) + .inspect_err(|_| { + drop(join_handle); + drop(rx); + })? + .context(format!("Failed to suspend process {}", self.process.pid))?, ) } else { None From 0d5e30dca23db4c1db6030e7d8ea079d9d9744a6 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Mon, 29 Sep 2025 11:05:08 -0700 Subject: [PATCH 04/14] Add better comment; don't return Result (fails on Windows) --- src/python_spy.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/python_spy.rs b/src/python_spy.rs index ac88cecd..c1fb95e3 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -232,9 +232,13 @@ impl PythonSpy { }); let join_handle = std::thread::spawn(move || { - // This can fail if the receiver has timed out; silently ignore this, because we - // don't care (the worker thread has already exceeded the timeout) - let _ = tx.send(process.lock()); + // This can fail if the receiver in the main thread has timed out; silently ignore + // this, because we don't care (this only happens when the worker thread has + // already exceeded the timeout). + match process.lock() { + Ok(_) => tx.send(Ok(())), + Err(error) => tx.send(Err(error)), + } }); Some( From 93116be55c0ef084c2a2a8986d26400019260da9 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Wed, 10 Sep 2025 07:27:27 -0700 Subject: [PATCH 05/14] Add a test for 732 --- Cargo.toml | 6 +++ src/python_spy.rs | 79 ++++++++++++++++++--------------- tests/hanging_lock_test/main.rs | 0 tests/integration_test.rs | 10 +++++ 4 files changed, 58 insertions(+), 37 deletions(-) create mode 100644 tests/hanging_lock_test/main.rs diff --git a/Cargo.toml b/Cargo.toml index ba931d29..d5e9b764 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,6 @@ [features] unwind = ["remoteprocess/unwind"] +test_artifacts = [] [package] name = "py-spy" @@ -51,3 +52,8 @@ termios = "0.3.3" [target.'cfg(windows)'.dependencies] winapi = {version = "0.3", features = ["errhandlingapi", "winbase", "consoleapi", "wincon", "handleapi", "timeapi", "processenv" ]} + +[[bin]] +name = "hanging_lock_test" +path = "tests/hanging_lock_test/main.rs" +required-features = ["test_artifacts"] diff --git a/src/python_spy.rs b/src/python_spy.rs index c1fb95e3..be1d6d44 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -6,7 +6,7 @@ use std::iter::FromIterator; use std::path::Path; use anyhow::{Context, Error, Result}; -use remoteprocess::{Pid, Process, ProcessMemory, Tid}; +use remoteprocess::{Lock, Pid, Process, ProcessMemory, Tid}; use crate::config::{Config, LockingStrategy}; #[cfg(feature = "unwind")] @@ -193,6 +193,45 @@ impl PythonSpy { } } + /// Lock the process with the given PID; if the process doesn't lock in `lock_timeout_ms`, + /// error out gracefully. + /// + /// Since `remoteprocess::Process::lock` can hang if the process has exited, it is called in + /// a separate thread which times out (and results in an error) if the thread doesn't lock in + /// the given `lock_timeout_ms`. + /// + /// The approach here follows https://stackoverflow.com/a/36182336/8100451, but with + /// recv_timeout. + /// + /// * `pid`: ID of the process to lock + /// * `lock_timeout_ms`: Length of time to wait before erroring out + pub fn lock_process_with_timeout(pid: Pid, lock_timeout_ms: u64) -> Result<()> { + let (tx, rx) = std::sync::mpsc::channel(); + + // Generate a Process instance so it can be moved to the child thread + let process = remoteprocess::Process::new(pid).unwrap_or_else(|_| { + unreachable!("Process::new doesn't do anything with the PID upon initialization") + }); + + let join_handle = std::thread::spawn(move || { + // This can fail if the receiver in the main thread has timed out; silently ignore + // this, because we don't care (this only happens when the worker thread has + // already exceeded the timeout). + match process.lock() { + Ok(_) => tx.send(Ok(())), + Err(error) => tx.send(Err(error)), + } + }); + + rx.recv_timeout(std::time::Duration::from_millis(lock_timeout_ms)) + .context(format!("Timeout acquiring lock on process {pid}")) + .inspect_err(|_| { + drop(join_handle); + drop(rx); + })? + .context(format!("Failed to suspend process {pid}")) + } + // implementation of get_stack_traces, where we have a type for the InterpreterState fn _get_stack_traces(&mut self) -> Result, Error> { // Query the OS to get if each thread in the process is running or not @@ -218,43 +257,9 @@ impl PythonSpy { // This has the potential for race conditions (in that the thread activity could change // between getting the status and locking the thread), in which case self.process.lock() // will hang. This is the reason we do this in a separate thread; after a small timeout, - // we just silently avoid taking any samples on this process. - // - // The approach here follows https://stackoverflow.com/a/36182336/8100451, but with - // recv_timeout. + // we just error out. let _lock = if self.config.blocking == LockingStrategy::Lock { - let (tx, rx) = std::sync::mpsc::channel(); - - // Clone the process so it can be moved to the child thread. remoteprocess doesn't - // derive the clone trait, so we do it by hand here. - let process = remoteprocess::Process::new(self.process.pid).unwrap_or_else(|_| { - unreachable!("Process::new doesn't do anything with the PID upon initialization") - }); - - let join_handle = std::thread::spawn(move || { - // This can fail if the receiver in the main thread has timed out; silently ignore - // this, because we don't care (this only happens when the worker thread has - // already exceeded the timeout). - match process.lock() { - Ok(_) => tx.send(Ok(())), - Err(error) => tx.send(Err(error)), - } - }); - - Some( - rx.recv_timeout(std::time::Duration::from_millis( - self.config.lock_timeout_ms, - )) - .context(format!( - "Timeout acquiring lock on process {}", - self.process.pid - )) - .inspect_err(|_| { - drop(join_handle); - drop(rx); - })? - .context(format!("Failed to suspend process {}", self.process.pid))?, - ) + Some(Self::lock_process_with_timeout(self.process.pid, self.config.lock_timeout_ms)?) } else { None }; diff --git a/tests/hanging_lock_test/main.rs b/tests/hanging_lock_test/main.rs new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 1052ea40..5fd4628d 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -553,3 +553,13 @@ fn test_delayed_subprocess() { break; } } + +#[cfg(target_os = "linux")] +#[test] +fn test_hanging_lock() { + let child = std::process::Command::new(env!("CARGO_BIN_EXE_hanging_lock_test")) + .spawn() + .unwrap(); + + let result = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); +} From bd0db8dac4ff11a95c63519c493a901d9ab0d2c0 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Mon, 29 Sep 2025 15:37:32 -0700 Subject: [PATCH 06/14] Added tests for locking success and failure --- Cargo.toml | 4 +-- src/bin/hanging_lock_test.rs | 16 +++++++++++ src/python_spy.rs | 4 +-- tests/hanging_lock_test/main.rs | 0 tests/integration_test.rs | 49 +++++++++++++++++++++++++++++++-- 5 files changed, 67 insertions(+), 6 deletions(-) create mode 100644 src/bin/hanging_lock_test.rs delete mode 100644 tests/hanging_lock_test/main.rs diff --git a/Cargo.toml b/Cargo.toml index d5e9b764..837f9a4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [features] -unwind = ["remoteprocess/unwind"] test_artifacts = [] +unwind = ["remoteprocess/unwind"] [package] name = "py-spy" @@ -55,5 +55,5 @@ winapi = {version = "0.3", features = ["errhandlingapi", "winbase", "consoleapi" [[bin]] name = "hanging_lock_test" -path = "tests/hanging_lock_test/main.rs" +path = "src/bin/hanging_lock_test.rs" required-features = ["test_artifacts"] diff --git a/src/bin/hanging_lock_test.rs b/src/bin/hanging_lock_test.rs new file mode 100644 index 00000000..7e3602bb --- /dev/null +++ b/src/bin/hanging_lock_test.rs @@ -0,0 +1,16 @@ +// Helper program for the hanging lock test. +// +// Waits until it receives input on stdin before exiting. +use std::io::{self, Write}; + +fn main() -> io::Result<()> { + println!("awaiting input"); + io::stdout().flush()?; + + let mut buffer = String::new(); + let stdin = io::stdin(); // We get `Stdin` here. + stdin.read_line(&mut buffer)?; + println!("Read buffer: {buffer}. Exiting..."); + io::stdout().flush()?; + Ok(()) +} diff --git a/src/python_spy.rs b/src/python_spy.rs index be1d6d44..a85a4610 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -6,7 +6,7 @@ use std::iter::FromIterator; use std::path::Path; use anyhow::{Context, Error, Result}; -use remoteprocess::{Lock, Pid, Process, ProcessMemory, Tid}; +use remoteprocess::{Pid, Process, ProcessMemory, Tid}; use crate::config::{Config, LockingStrategy}; #[cfg(feature = "unwind")] @@ -201,7 +201,7 @@ impl PythonSpy { /// the given `lock_timeout_ms`. /// /// The approach here follows https://stackoverflow.com/a/36182336/8100451, but with - /// recv_timeout. + /// `recv_timeout`. /// /// * `pid`: ID of the process to lock /// * `lock_timeout_ms`: Length of time to wait before erroring out diff --git a/tests/hanging_lock_test/main.rs b/tests/hanging_lock_test/main.rs deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 5fd4628d..69fff829 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -1,6 +1,9 @@ extern crate py_spy; use py_spy::{Config, Pid, PythonSpy}; use std::collections::HashSet; +use std::io::{BufRead, BufReader, Write}; +use std::process::Stdio; + struct ScriptRunner { #[allow(dead_code)] @@ -556,10 +559,52 @@ fn test_delayed_subprocess() { #[cfg(target_os = "linux")] #[test] -fn test_hanging_lock() { - let child = std::process::Command::new(env!("CARGO_BIN_EXE_hanging_lock_test")) +fn test_hanging_lock_successful() { + let mut child = std::process::Command::new(env!("CARGO_BIN_EXE_hanging_lock_test")) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let child_stdout = child.stdout.take().expect("Child process doesn't have stdout handle"); + let mut stdout_reader = BufReader::new(child_stdout); + let mut buffer = String::new(); + + stdout_reader.read_line(&mut buffer).expect("Should be able to read"); + assert_eq!(buffer, "awaiting input\n"); + + let result = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); + assert!(result.is_ok()); + + let mut child_stdin = child.stdin.take().expect("Child process doesn't have stdin handle"); + let _ = child_stdin.write_all(b"continue\n"); + let ecode = child.wait().expect("Child process should exit"); + assert!(ecode.success()); +} + +#[cfg(target_os = "linux")] +#[test] +fn test_hanging_lock_failure() { + let mut child = std::process::Command::new(env!("CARGO_BIN_EXE_hanging_lock_test")) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) .spawn() .unwrap(); + let child_stdout = child.stdout.take().expect("Child process doesn't have stdout handle"); + let mut stdout_reader = BufReader::new(child_stdout); + let mut buffer = String::new(); + + stdout_reader.read_line(&mut buffer).expect("Should be able to read"); + assert_eq!(buffer, "awaiting input\n"); + + let mut child_stdin = child.stdin.take().expect("Child process doesn't have stdin handle"); + let _ = child_stdin.write_all(b"continue\n"); + let result = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("Failed to suspend process")); + + let ecode = child.wait().expect("Child process should exit"); + assert!(ecode.success()); } From b34a62e11249dadbb1d67bbc189e73dbf60cb072 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 9 Sep 2025 23:23:01 -0700 Subject: [PATCH 07/14] Acquire lock in separate thread --- Cargo.lock | 4 ++-- src/python_spy.rs | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 57775988..d1becc1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,9 +713,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.159" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libloading" diff --git a/src/python_spy.rs b/src/python_spy.rs index 14ea63ae..2d797178 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -214,10 +214,41 @@ impl PythonSpy { // Lock the process if appropriate. Note we have to lock AFTER getting the thread // activity status from the OS (otherwise each thread would report being inactive always). + // // This has the potential for race conditions (in that the thread activity could change - // between getting the status and locking the thread, but seems unavoidable right now + // between getting the status and locking the thread), in which case self.process.lock() + // will hang. This is the reason we do this in a separate thread; after a small timeout, + // we just silently avoid taking any samples on this process. + // + // The approach here follows https://stackoverflow.com/a/36182336/8100451, but with + // recv_timeout. let _lock = if self.config.blocking == LockingStrategy::Lock { - Some(self.process.lock().context("Failed to suspend process")?) + let (tx, rx) = std::sync::mpsc::channel(); + + // Clone the process so it can be moved to the child thread. remoteprocess doesn't + // derive the clone trait, so we do it by hand here. + let process = remoteprocess::Process::new(self.process.pid) + .unwrap_or_else(|_| { + unreachable!( + "Process::new doesn't do anything with the PID upon initialization" + ) + }); + + let join_handle = std::thread::spawn(move || { + // This can fail if the receiver has timed out; silently ignore this, because we + // don't care (the worker thread has already exceeded the timeout) + let _ = tx.send(process.lock()); + }); + + Some( + rx.recv_timeout(std::time::Duration::from_millis(5000)) + .context(format!("Timeout acquiring lock on process {}", self.process.pid)) + .inspect_err(|_| { + drop(join_handle); + drop(rx); + })? + .context(format!("Failed to suspend process {}", self.process.pid))? + ) } else { None }; From cbcc41264a8876c0d682f087cc3a90d42fb461c8 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 9 Sep 2025 23:38:36 -0700 Subject: [PATCH 08/14] Make the lock timeout configurable --- src/config.rs | 3 +++ src/python_spy.rs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/config.rs b/src/config.rs index f861d3b1..11e38177 100644 --- a/src/config.rs +++ b/src/config.rs @@ -59,6 +59,8 @@ pub struct Config { pub refresh_seconds: f64, #[doc(hidden)] pub core_filename: Option, + #[doc(hidden)] + pub lock_timeout_ms: u64, } #[allow(non_camel_case_types)] @@ -139,6 +141,7 @@ impl Default for Config { lineno: LineNo::LastInstruction, refresh_seconds: 1.0, core_filename: None, + lock_timeout_ms: 1000, } } } diff --git a/src/python_spy.rs b/src/python_spy.rs index 2d797178..cf520632 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -241,7 +241,7 @@ impl PythonSpy { }); Some( - rx.recv_timeout(std::time::Duration::from_millis(5000)) + rx.recv_timeout(std::time::Duration::from_millis(self.config.lock_timeout_ms)) .context(format!("Timeout acquiring lock on process {}", self.process.pid)) .inspect_err(|_| { drop(join_handle); From 9e7fa49c62dd18c7d0a4be00da3c9d78422b49b2 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Wed, 10 Sep 2025 06:35:37 -0700 Subject: [PATCH 09/14] Update readme with note about locking strategy; run lint --- README.md | 25 +++++++++++++------------ src/python_spy.rs | 28 +++++++++++++++------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index cfff6a3a..b850b4ca 100755 --- a/README.md +++ b/README.md @@ -23,10 +23,10 @@ You can also download prebuilt binaries from the [GitHub Releases Page](https://github.com/benfred/py-spy/releases). If you're a Rust user, py-spy can also be installed with: ```cargo install py-spy```. Note this -builds py-spy from source and requires `libunwind` on Linux and Window, e.g., +builds py-spy from source and requires `libunwind` on Linux and Window, e.g., `apt install libunwind-dev`. -On macOS, [py-spy is in Homebrew](https://formulae.brew.sh/formula/py-spy#default) and +On macOS, [py-spy is in Homebrew](https://formulae.brew.sh/formula/py-spy#default) and can be installed with ```brew install py-spy```. On Arch Linux, [py-spy is in AUR](https://aur.archlinux.org/packages/py-spy/) and can be @@ -181,23 +181,24 @@ There are some limitations with this approach though that may cause idle threads marked as active. First off, we have to get this thread activity information before pausing the program, because getting this from a paused program will cause it to always return that this is idle. This means there is a potential race condition, where we get the thread activity and -then the thread is in a different state when we get the stack trace. Querying the OS for thread -activity also isn't implemented yet for FreeBSD and i686/ARM processors on Linux. On Windows, -calls that are blocked on IO also won't be marked as idle yet, for instance when reading input -from stdin. Finally, on some Linux calls the ptrace attach that we are using may cause idle threads -to wake up momentarily, causing false positives when reading from procfs. For these reasons, -we also have a heuristic fallback that marks known certain known calls in -python as being idle. +then the thread is in a different state when we get the stack trace. In cases where the process +exits before the program can be paused, the call to pause the program will time out, and py-spy +will gracefully exit. Querying the OS for thread activity also isn't implemented yet for +FreeBSD and i686/ARM processors on Linux. On Windows, calls that are blocked on IO also won't +be marked as idle yet, for instance when reading input from stdin. Finally, on some Linux calls +the ptrace attach that we are using may cause idle threads to wake up momentarily, causing +false positives when reading from procfs. For these reasons, we also have a heuristic fallback +that marks known certain known calls in python as being idle. You can disable this functionality by setting the ```--idle``` flag, which -will include frames that py-spy considers idle. +will include frames that py-spy considers idle. ### How does GIL detection work? We get GIL activity by looking at the threadid value pointed to by the ```_PyThreadState_Current``` symbol for Python 3.6 and earlier and by figuring out the equivalent from the ```_PyRuntime``` struct in Python 3.7 and later. These symbols might not be included in your python distribution, which will -cause resolving which thread holds on to the GIL to fail. Current GIL usage is also shown in the +cause resolving which thread holds on to the GIL to fail. Current GIL usage is also shown in the ```top``` view as %GIL. Passing the ```--gil``` flag will only include traces for threads that are holding on to the @@ -233,7 +234,7 @@ your_service: Note that you'll need to restart the docker container in order for this setting to take effect. You can also use py-spy from the Host OS to profile a running process running inside the docker -container. +container. ### How do I run py-spy in Kubernetes? diff --git a/src/python_spy.rs b/src/python_spy.rs index cf520632..ac88cecd 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -227,12 +227,9 @@ impl PythonSpy { // Clone the process so it can be moved to the child thread. remoteprocess doesn't // derive the clone trait, so we do it by hand here. - let process = remoteprocess::Process::new(self.process.pid) - .unwrap_or_else(|_| { - unreachable!( - "Process::new doesn't do anything with the PID upon initialization" - ) - }); + let process = remoteprocess::Process::new(self.process.pid).unwrap_or_else(|_| { + unreachable!("Process::new doesn't do anything with the PID upon initialization") + }); let join_handle = std::thread::spawn(move || { // This can fail if the receiver has timed out; silently ignore this, because we @@ -241,13 +238,18 @@ impl PythonSpy { }); Some( - rx.recv_timeout(std::time::Duration::from_millis(self.config.lock_timeout_ms)) - .context(format!("Timeout acquiring lock on process {}", self.process.pid)) - .inspect_err(|_| { - drop(join_handle); - drop(rx); - })? - .context(format!("Failed to suspend process {}", self.process.pid))? + rx.recv_timeout(std::time::Duration::from_millis( + self.config.lock_timeout_ms, + )) + .context(format!( + "Timeout acquiring lock on process {}", + self.process.pid + )) + .inspect_err(|_| { + drop(join_handle); + drop(rx); + })? + .context(format!("Failed to suspend process {}", self.process.pid))?, ) } else { None From e55e340ba210a82fdd4c44092d794553da62d8de Mon Sep 17 00:00:00 2001 From: pdmurray Date: Mon, 29 Sep 2025 11:05:08 -0700 Subject: [PATCH 10/14] Add better comment; don't return Result (fails on Windows) --- src/python_spy.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/python_spy.rs b/src/python_spy.rs index ac88cecd..c1fb95e3 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -232,9 +232,13 @@ impl PythonSpy { }); let join_handle = std::thread::spawn(move || { - // This can fail if the receiver has timed out; silently ignore this, because we - // don't care (the worker thread has already exceeded the timeout) - let _ = tx.send(process.lock()); + // This can fail if the receiver in the main thread has timed out; silently ignore + // this, because we don't care (this only happens when the worker thread has + // already exceeded the timeout). + match process.lock() { + Ok(_) => tx.send(Ok(())), + Err(error) => tx.send(Err(error)), + } }); Some( From 7ad20dcb6b52f1c59664e6b68ff45666d987be0b Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 30 Sep 2025 10:23:01 -0700 Subject: [PATCH 11/14] Enable extra feature needed to test hanging locks --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a3f47cbb..452089cd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,7 +48,7 @@ jobs: - name: Test id: test continue-on-error: true - run: cargo test --release + run: cargo test --release --features test_artifacts - name: Test (retry#1) id: test1 run: cargo test --release From 294467f3826a4b3f9e358aff3affb76ce09f2630 Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 30 Sep 2025 13:05:47 -0700 Subject: [PATCH 12/14] Add tmate to debug test_recursive --- .github/workflows/build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 452089cd..3495132e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,6 +45,7 @@ jobs: python-version: 3.9 - name: Install optional numpy dependency run: pip install numpy>=2 + - uses: mxschmitt/action-tmate@v3 - name: Test id: test continue-on-error: true From 92fd140d854f53233673a873c1006f8ae53c90df Mon Sep 17 00:00:00 2001 From: pdmurray Date: Tue, 30 Sep 2025 15:02:18 -0700 Subject: [PATCH 13/14] Don't exit the locker thread immediately; fix "numpy>=2" in build.yml --- .github/workflows/build.yml | 3 +-- src/python_spy.rs | 48 ++++++++++++++++++++++++++++++------- tests/integration_test.rs | 13 ++++++---- 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3495132e..9a73b9bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -44,8 +44,7 @@ jobs: with: python-version: 3.9 - name: Install optional numpy dependency - run: pip install numpy>=2 - - uses: mxschmitt/action-tmate@v3 + run: pip install "numpy>=2" - name: Test id: test continue-on-error: true diff --git a/src/python_spy.rs b/src/python_spy.rs index ae450a41..c28be9b9 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -4,6 +4,7 @@ use std::collections::HashSet; #[cfg(all(target_os = "linux", feature = "unwind"))] use std::iter::FromIterator; use std::path::Path; +use std::sync::mpsc::Sender; use anyhow::{Context, Error, Result}; use remoteprocess::{Pid, Process, ProcessMemory, Tid}; @@ -40,6 +41,20 @@ pub struct PythonSpy { pub dockerized: bool, } +/// A small helper which automatically sends a message on an mpsc channel when it is dropped. +/// +/// * `release_lock_tx`: An mpsc sender which can send an empty () message. +#[derive(Debug)] +pub struct ProcessLocker { + release_lock_tx: Sender<()>, +} + +impl Drop for ProcessLocker { + fn drop(&mut self) { + let _ = self.release_lock_tx.send(()); + } +} + impl PythonSpy { /// Constructs a new PythonSpy object. pub fn new(pid: Pid, config: &Config) -> Result { @@ -197,16 +212,21 @@ impl PythonSpy { /// error out gracefully. /// /// Since `remoteprocess::Process::lock` can hang if the process has exited, it is called in - /// a separate thread which times out (and results in an error) if the thread doesn't lock in - /// the given `lock_timeout_ms`. + /// a separate locker thread which times out (and results in an error) if the target thread + /// doesn't lock in the given `lock_timeout_ms`. The locker thread then waits to exit, holding + /// the lock in memory. When the `ProcessLocker` object returned by this function is dropped, + /// it sends a signal to the locker thread using an mpsc channel which causes the locker thread + /// to finish executing (causing the lock held in that thread to be dropped, unpausing the + /// target thread). /// /// The approach here follows https://stackoverflow.com/a/36182336/8100451, but with /// `recv_timeout`. /// /// * `pid`: ID of the process to lock /// * `lock_timeout_ms`: Length of time to wait before erroring out - pub fn lock_process_with_timeout(pid: Pid, lock_timeout_ms: u64) -> Result<()> { - let (tx, rx) = std::sync::mpsc::channel(); + pub fn lock_process_with_timeout(pid: Pid, lock_timeout_ms: u64) -> Result { + let (acquire_lock_tx, acquire_lock_rx) = std::sync::mpsc::channel(); + let (release_lock_tx, release_lock_rx) = std::sync::mpsc::channel::<()>(); // Generate a Process instance so it can be moved to the child thread let process = remoteprocess::Process::new(pid).unwrap_or_else(|_| { @@ -218,18 +238,28 @@ impl PythonSpy { // this, because we don't care (this only happens when the worker thread has // already exceeded the timeout). match process.lock() { - Ok(_) => tx.send(Ok(())), - Err(error) => tx.send(Err(error)), + Ok(_lock) => { + let _ = acquire_lock_tx.send(Ok(())); + + // Wait until instructed to finish execution (and drop the lock, which + // unlocks the process) + let _ = release_lock_rx.recv(); + } + Err(error) => { + let _ = acquire_lock_tx.send(Err(error)); + } } }); - rx.recv_timeout(std::time::Duration::from_millis(lock_timeout_ms)) + acquire_lock_rx + .recv_timeout(std::time::Duration::from_millis(lock_timeout_ms)) .context(format!("Timeout acquiring lock on process {pid}")) .inspect_err(|_| { drop(join_handle); - drop(rx); + drop(acquire_lock_rx); })? - .context(format!("Failed to suspend process {pid}")) + .context(format!("Failed to suspend process {pid}"))?; + Ok(ProcessLocker { release_lock_tx }) } // implementation of get_stack_traces, where we have a type for the InterpreterState diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 5b27bc1c..255511bb 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -577,8 +577,11 @@ fn test_hanging_lock_successful() { .expect("Should be able to read"); assert_eq!(buffer, "awaiting input\n"); - let result = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); - assert!(result.is_ok()); + let locker = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); + assert!(locker.is_ok()); + + // Need to drop the locker in order to unpause the child process + drop(locker); let mut child_stdin = child .stdin @@ -616,9 +619,9 @@ fn test_hanging_lock_failure() { .expect("Child process doesn't have stdin handle"); let _ = child_stdin.write_all(b"continue\n"); - let result = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); - assert!(result.is_err()); - assert!(result + let locker = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); + assert!(locker.is_err()); + assert!(locker .unwrap_err() .to_string() .contains("Failed to suspend process")); From 3aca085a2b88aa65b02b41b21feaf6f3fad78dce Mon Sep 17 00:00:00 2001 From: M Bussonnier Date: Tue, 10 Feb 2026 16:00:24 +0100 Subject: [PATCH 14/14] empty