diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a3f47cbb..9a73b9bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -44,11 +44,11 @@ jobs: with: python-version: 3.9 - name: Install optional numpy dependency - run: pip install numpy>=2 + run: pip install "numpy>=2" - name: Test id: test continue-on-error: true - run: cargo test --release + run: cargo test --release --features test_artifacts - name: Test (retry#1) id: test1 run: cargo test --release diff --git a/Cargo.lock b/Cargo.lock index 57775988..d1becc1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,9 +713,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.159" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libloading" diff --git a/Cargo.toml b/Cargo.toml index ba931d29..837f9a4e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,4 +1,5 @@ [features] +test_artifacts = [] unwind = ["remoteprocess/unwind"] [package] @@ -51,3 +52,8 @@ termios = "0.3.3" [target.'cfg(windows)'.dependencies] winapi = {version = "0.3", features = ["errhandlingapi", "winbase", "consoleapi", "wincon", "handleapi", "timeapi", "processenv" ]} + +[[bin]] +name = "hanging_lock_test" +path = "src/bin/hanging_lock_test.rs" +required-features = ["test_artifacts"] diff --git a/README.md b/README.md index cfff6a3a..b850b4ca 100755 --- a/README.md +++ b/README.md @@ -23,10 +23,10 @@ You can also download prebuilt binaries from the [GitHub Releases Page](https://github.com/benfred/py-spy/releases). If you're a Rust user, py-spy can also be installed with: ```cargo install py-spy```. Note this -builds py-spy from source and requires `libunwind` on Linux and Window, e.g., +builds py-spy from source and requires `libunwind` on Linux and Window, e.g., `apt install libunwind-dev`. -On macOS, [py-spy is in Homebrew](https://formulae.brew.sh/formula/py-spy#default) and +On macOS, [py-spy is in Homebrew](https://formulae.brew.sh/formula/py-spy#default) and can be installed with ```brew install py-spy```. On Arch Linux, [py-spy is in AUR](https://aur.archlinux.org/packages/py-spy/) and can be @@ -181,23 +181,24 @@ There are some limitations with this approach though that may cause idle threads marked as active. First off, we have to get this thread activity information before pausing the program, because getting this from a paused program will cause it to always return that this is idle. This means there is a potential race condition, where we get the thread activity and -then the thread is in a different state when we get the stack trace. Querying the OS for thread -activity also isn't implemented yet for FreeBSD and i686/ARM processors on Linux. On Windows, -calls that are blocked on IO also won't be marked as idle yet, for instance when reading input -from stdin. Finally, on some Linux calls the ptrace attach that we are using may cause idle threads -to wake up momentarily, causing false positives when reading from procfs. For these reasons, -we also have a heuristic fallback that marks known certain known calls in -python as being idle. +then the thread is in a different state when we get the stack trace. In cases where the process +exits before the program can be paused, the call to pause the program will time out, and py-spy +will gracefully exit. Querying the OS for thread activity also isn't implemented yet for +FreeBSD and i686/ARM processors on Linux. On Windows, calls that are blocked on IO also won't +be marked as idle yet, for instance when reading input from stdin. Finally, on some Linux calls +the ptrace attach that we are using may cause idle threads to wake up momentarily, causing +false positives when reading from procfs. For these reasons, we also have a heuristic fallback +that marks known certain known calls in python as being idle. You can disable this functionality by setting the ```--idle``` flag, which -will include frames that py-spy considers idle. +will include frames that py-spy considers idle. ### How does GIL detection work? We get GIL activity by looking at the threadid value pointed to by the ```_PyThreadState_Current``` symbol for Python 3.6 and earlier and by figuring out the equivalent from the ```_PyRuntime``` struct in Python 3.7 and later. These symbols might not be included in your python distribution, which will -cause resolving which thread holds on to the GIL to fail. Current GIL usage is also shown in the +cause resolving which thread holds on to the GIL to fail. Current GIL usage is also shown in the ```top``` view as %GIL. Passing the ```--gil``` flag will only include traces for threads that are holding on to the @@ -233,7 +234,7 @@ your_service: Note that you'll need to restart the docker container in order for this setting to take effect. You can also use py-spy from the Host OS to profile a running process running inside the docker -container. +container. ### How do I run py-spy in Kubernetes? diff --git a/src/bin/hanging_lock_test.rs b/src/bin/hanging_lock_test.rs new file mode 100644 index 00000000..7e3602bb --- /dev/null +++ b/src/bin/hanging_lock_test.rs @@ -0,0 +1,16 @@ +// Helper program for the hanging lock test. +// +// Waits until it receives input on stdin before exiting. +use std::io::{self, Write}; + +fn main() -> io::Result<()> { + println!("awaiting input"); + io::stdout().flush()?; + + let mut buffer = String::new(); + let stdin = io::stdin(); // We get `Stdin` here. + stdin.read_line(&mut buffer)?; + println!("Read buffer: {buffer}. Exiting..."); + io::stdout().flush()?; + Ok(()) +} diff --git a/src/config.rs b/src/config.rs index f861d3b1..11e38177 100644 --- a/src/config.rs +++ b/src/config.rs @@ -59,6 +59,8 @@ pub struct Config { pub refresh_seconds: f64, #[doc(hidden)] pub core_filename: Option, + #[doc(hidden)] + pub lock_timeout_ms: u64, } #[allow(non_camel_case_types)] @@ -139,6 +141,7 @@ impl Default for Config { lineno: LineNo::LastInstruction, refresh_seconds: 1.0, core_filename: None, + lock_timeout_ms: 1000, } } } diff --git a/src/python_spy.rs b/src/python_spy.rs index 14ea63ae..c28be9b9 100644 --- a/src/python_spy.rs +++ b/src/python_spy.rs @@ -4,6 +4,7 @@ use std::collections::HashSet; #[cfg(all(target_os = "linux", feature = "unwind"))] use std::iter::FromIterator; use std::path::Path; +use std::sync::mpsc::Sender; use anyhow::{Context, Error, Result}; use remoteprocess::{Pid, Process, ProcessMemory, Tid}; @@ -40,6 +41,20 @@ pub struct PythonSpy { pub dockerized: bool, } +/// A small helper which automatically sends a message on an mpsc channel when it is dropped. +/// +/// * `release_lock_tx`: An mpsc sender which can send an empty () message. +#[derive(Debug)] +pub struct ProcessLocker { + release_lock_tx: Sender<()>, +} + +impl Drop for ProcessLocker { + fn drop(&mut self) { + let _ = self.release_lock_tx.send(()); + } +} + impl PythonSpy { /// Constructs a new PythonSpy object. pub fn new(pid: Pid, config: &Config) -> Result { @@ -193,6 +208,60 @@ impl PythonSpy { } } + /// Lock the process with the given PID; if the process doesn't lock in `lock_timeout_ms`, + /// error out gracefully. + /// + /// Since `remoteprocess::Process::lock` can hang if the process has exited, it is called in + /// a separate locker thread which times out (and results in an error) if the target thread + /// doesn't lock in the given `lock_timeout_ms`. The locker thread then waits to exit, holding + /// the lock in memory. When the `ProcessLocker` object returned by this function is dropped, + /// it sends a signal to the locker thread using an mpsc channel which causes the locker thread + /// to finish executing (causing the lock held in that thread to be dropped, unpausing the + /// target thread). + /// + /// The approach here follows https://stackoverflow.com/a/36182336/8100451, but with + /// `recv_timeout`. + /// + /// * `pid`: ID of the process to lock + /// * `lock_timeout_ms`: Length of time to wait before erroring out + pub fn lock_process_with_timeout(pid: Pid, lock_timeout_ms: u64) -> Result { + let (acquire_lock_tx, acquire_lock_rx) = std::sync::mpsc::channel(); + let (release_lock_tx, release_lock_rx) = std::sync::mpsc::channel::<()>(); + + // Generate a Process instance so it can be moved to the child thread + let process = remoteprocess::Process::new(pid).unwrap_or_else(|_| { + unreachable!("Process::new doesn't do anything with the PID upon initialization") + }); + + let join_handle = std::thread::spawn(move || { + // This can fail if the receiver in the main thread has timed out; silently ignore + // this, because we don't care (this only happens when the worker thread has + // already exceeded the timeout). + match process.lock() { + Ok(_lock) => { + let _ = acquire_lock_tx.send(Ok(())); + + // Wait until instructed to finish execution (and drop the lock, which + // unlocks the process) + let _ = release_lock_rx.recv(); + } + Err(error) => { + let _ = acquire_lock_tx.send(Err(error)); + } + } + }); + + acquire_lock_rx + .recv_timeout(std::time::Duration::from_millis(lock_timeout_ms)) + .context(format!("Timeout acquiring lock on process {pid}")) + .inspect_err(|_| { + drop(join_handle); + drop(acquire_lock_rx); + })? + .context(format!("Failed to suspend process {pid}"))?; + Ok(ProcessLocker { release_lock_tx }) + } + // implementation of get_stack_traces, where we have a type for the InterpreterState fn _get_stack_traces(&mut self) -> Result, Error> { // Query the OS to get if each thread in the process is running or not @@ -214,10 +283,19 @@ impl PythonSpy { // Lock the process if appropriate. Note we have to lock AFTER getting the thread // activity status from the OS (otherwise each thread would report being inactive always). + // // This has the potential for race conditions (in that the thread activity could change - // between getting the status and locking the thread, but seems unavoidable right now + // between getting the status and locking the thread), in which case self.process.lock() + // will hang. This is the reason we do this in a separate thread; after a small timeout, + // we just error out. + // + // The approach here follows https://stackoverflow.com/a/36182336/8100451, but with + // recv_timeout. let _lock = if self.config.blocking == LockingStrategy::Lock { - Some(self.process.lock().context("Failed to suspend process")?) + Some(Self::lock_process_with_timeout( + self.process.pid, + self.config.lock_timeout_ms, + )?) } else { None }; diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 1052ea40..255511bb 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -1,6 +1,8 @@ extern crate py_spy; use py_spy::{Config, Pid, PythonSpy}; use std::collections::HashSet; +use std::io::{BufRead, BufReader, Write}; +use std::process::Stdio; struct ScriptRunner { #[allow(dead_code)] @@ -553,3 +555,77 @@ fn test_delayed_subprocess() { break; } } + +#[cfg(target_os = "linux")] +#[test] +fn test_hanging_lock_successful() { + let mut child = std::process::Command::new(env!("CARGO_BIN_EXE_hanging_lock_test")) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let child_stdout = child + .stdout + .take() + .expect("Child process doesn't have stdout handle"); + let mut stdout_reader = BufReader::new(child_stdout); + let mut buffer = String::new(); + + stdout_reader + .read_line(&mut buffer) + .expect("Should be able to read"); + assert_eq!(buffer, "awaiting input\n"); + + let locker = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); + assert!(locker.is_ok()); + + // Need to drop the locker in order to unpause the child process + drop(locker); + + let mut child_stdin = child + .stdin + .take() + .expect("Child process doesn't have stdin handle"); + let _ = child_stdin.write_all(b"continue\n"); + let ecode = child.wait().expect("Child process should exit"); + assert!(ecode.success()); +} + +#[cfg(target_os = "linux")] +#[test] +fn test_hanging_lock_failure() { + let mut child = std::process::Command::new(env!("CARGO_BIN_EXE_hanging_lock_test")) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .unwrap(); + + let child_stdout = child + .stdout + .take() + .expect("Child process doesn't have stdout handle"); + let mut stdout_reader = BufReader::new(child_stdout); + let mut buffer = String::new(); + + stdout_reader + .read_line(&mut buffer) + .expect("Should be able to read"); + assert_eq!(buffer, "awaiting input\n"); + + let mut child_stdin = child + .stdin + .take() + .expect("Child process doesn't have stdin handle"); + let _ = child_stdin.write_all(b"continue\n"); + + let locker = PythonSpy::lock_process_with_timeout(child.id().try_into().unwrap(), 1000); + assert!(locker.is_err()); + assert!(locker + .unwrap_err() + .to_string() + .contains("Failed to suspend process")); + + let ecode = child.wait().expect("Child process should exit"); + assert!(ecode.success()); +}