diff --git a/Makefile b/Makefile index c2c089448..498832a48 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ ENVOY_TESTS = bazel-bin/tests/*_test BUILD_DEP_FILES = ENVOY_VERSION WORKSPACE .bazelrc envoy.bazelrc bazel/toolchains/BUILD bazel/toolchains/cc_toolchain_config.bzl SHELL=/bin/bash -o pipefail -BAZEL ?= $(QUIET) bazel +BAZEL ?= $(QUIET) tools/bazel_adaptive.py BAZEL_FILTER ?= BAZEL_OPTS ?= BAZEL_BUILD_OPTS ?= @@ -31,7 +31,7 @@ ifdef BAZEL_REMOTE_CACHE BAZEL_BUILD_OPTS += --remote_cache=$(BAZEL_REMOTE_CACHE) endif -BAZEL_TEST_OPTS ?= --jobs=HOST_RAM*.0002 --test_timeout=100 --local_test_jobs=1 --flaky_test_attempts=3 +BAZEL_TEST_OPTS ?= --test_timeout=100 BAZEL_TEST_OPTS += --test_output=errors BUILDARCH := $(subst aarch64,arm64,$(subst x86_64,amd64,$(shell uname -m))) diff --git a/tools/bazel_adaptive.py b/tools/bazel_adaptive.py new file mode 100755 index 000000000..72683b45a --- /dev/null +++ b/tools/bazel_adaptive.py @@ -0,0 +1,3640 @@ +#!/usr/bin/env python3 +"""Adaptive Bazel wrapper. + +This is a Bazel wrapper for adaptive building. This is intentionally self-contained and uses only +the Python 3 standard library. Both command line and environment are passed through, except that +--jobs is converted to a concrete integer and may be adapted between Bazel retries. + +Interface and argument handling: +- Resolve the real Bazel binary from BAZEL, otherwise find bazel on PATH without + recursing into this wrapper. If BAZEL points at this wrapper, ignore it and + fall back to PATH lookup. +- Pass stdin, stdout, stderr, command arguments, and environment through to the + real Bazel command. Forward Bazel output as-is while also decoding a copy for + monitoring; bytes are written to the terminal immediately even when Bazel is + under stress and emits partial lines without CR/LF. Wrapper diagnostics go to + stderr with a [bazel-adaptive/<seconds>s] prefix, where seconds is elapsed + monotonic time since the wrapper started. +- Parse --jobs=N and --jobs N. 
Integer values are used directly. Values of + HOST_CPUS, HOST_CPUS*MULTIPLIER, HOST_RAM, and HOST_RAM*MULTIPLIER are + resolved once at startup and used as the maximum adaptive jobs cap; for + example HOST_CPUS*.5 or HOST_RAM*.0002. If --jobs is absent or invalid, start + from the host CPU count. Every real Bazel invocation receives an integer + --jobs value. The wrapper does not keep a built-in table of Bazel startup + options. It only scans arguments before Bazel's "--" delimiter for --jobs, + and when --jobs is absent inserts the adaptive --jobs value immediately before + that delimiter or at the end of the argument list. +- Only adapt commands that accept --jobs: build, test, run, coverage, fetch, + cquery, and aquery. Other Bazel commands are exec'd directly without adding or + rewriting --jobs, so the wrapper can be used as a general Bazel entry point. + The detector is conservative around unknown startup options with separate + values; ambiguous commands pass through unchanged. +- Read the action timeout from BAZEL_ADAPTIVE_BUILD_TIMEOUT as a bare positive + number of seconds; default to 150 seconds. This applies to builds and tests + and is independent of Bazel's --test_timeout. +- Read the low-memory threshold from BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB as + MiB; default to 1024 MiB. Memory is read from /proc/meminfo, using + MemAvailable and MemTotal. +- Start the real Bazel command with a positive nice increment so the wrapper + stays at normal scheduler priority and can keep sampling memory even when + Bazel's compile/test process tree is CPU-heavy or memory pressure causes + scheduler churn. Read the increment from BAZEL_ADAPTIVE_BAZEL_NICE; default + to 5, clamp to 0..19, and use 0 to disable this behavior. 
Also periodically + renice detected Bazel build child processes under the Bazel output base after + memory first enters the pause-watch band, because actions may be launched by + the Bazel server rather than by the client process that inherited the initial + nice value. This covers sandbox wrappers, compilers, language tools, and tests + without interpreting the action language. This does not make the wrapper + realtime or immune to kernel-level stalls, but it gives the monitor a better + chance to run when it matters. + +Terminal behavior: +- For interactive runs, attach Bazel stdout/stderr to a PTY so Bazel keeps its + live progress UI and Ctrl-C works naturally through process groups and + foreground terminal handoff. Size the PTY from the real terminal so progress + lines are not truncated to 80 columns. For non-interactive runs, use normal + pipes unless BAZEL_ADAPTIVE_FORCE_PTY=1 is set; PTY columns/rows can be + overridden with BAZEL_ADAPTIVE_PTY_COLUMNS and BAZEL_ADAPTIVE_PTY_ROWS. + +Monitoring model: +- Sample memory once per second by default, configurable with + BAZEL_ADAPTIVE_MEMORY_POLL_INTERVAL and clamped to at least 0.05 seconds to + avoid busy looping. Keep a rolling 30-second memory window for reports and + retry decisions. +- Every 50ms, decide whether to pause one detected Bazel action process group + with SIGSTOP, choosing the youngest action group first and always leaving at + least one action group running. Pausing is staggered across the range from + twice to once the current effective low-memory threshold: with N detected + action groups, the first pause is allowed when MemAvailable drops below 2x + the effective threshold, and later pauses require progressively lower + MemAvailable until the current running-action floor is reached. 
Under plain + memory pressure that floor is half of the maximum action groups observed in + the attempt; while uninterruptible I/O evidence remains sustained, that floor + is halved, rounded up, once per BAZEL_ADAPTIVE_IO_STALL_FLOOR_SECONDS window. + This tight pause watch is intentionally faster than Bazel's progress output + so a newly spawned action can be paused before it grows into a large memory + consumer, but the staggered thresholds avoid pausing every action group at + once when memory first crosses one cliff. Every 5 seconds, if MemAvailable is + above the effective low-memory threshold, high enough that the group would not + immediately qualify for pausing again, and high enough to cover the selected + group's estimated swap-in cost, resume at most one paused action group with + SIGCONT, choosing the oldest paused group first. After a normal resume, + suppress further normal resumes for one resume interval so that group's memory + use can settle before reassessing. While memory remains tight, resume is also + blocked while any running action group is in uninterruptible I/O wait, and for + one resume interval after that stall clears, to avoid immediately + reintroducing swap pressure. The + swap-in estimate is the sum of VmSwap from /proc/<pid>/status for the action + group's processes; the resume gate requires MemAvailable to cover both that + estimate and the low-memory reserve. Paused processes stay in their + original cgroup. The wrapper deliberately does not create or migrate cgroups; + it lets normal kernel reclaim and any configured swap policy decide whether + stopped processes should be swapped out while running actions continue. +- Plain memory pressure alone does not pause down to a single active action + group. The normal floor is half of the maximum action groups observed + in the attempt, rounded up, so a --jobs=12 build keeps at least 6 action + groups running. 
If running action groups stay in uninterruptible I/O wait for + BAZEL_ADAPTIVE_IO_STALL_FLOOR_SECONDS, default 3 seconds, the floor is + halved, rounded up. If that stall evidence remains present, the floor is + halved again after each additional interval, for example 6 -> 3 -> 2 -> 1. + Sustained I/O blocking is based on repeated observations in the current stall + window, not a single D-state sighting. The evidence can come from running + action groups in D state or heavy swap-in reported by /proc/vmstat. Pausing + below the normal floor requires both tight memory and currently sustained I/O + blocking. +- After every fresh process-group scan, enforce the "at least one action group + running" invariant operationally: if all currently detected action groups are + wrapper-paused, immediately resume the oldest paused group even when memory is + still below the normal resume threshold. This covers races where Bazel's + action set changes after a pause decision and prevents an all-stopped + deadlock. +- The low-memory reserve is adaptive within the range from the configured base + threshold to twice that base. It starts at the base threshold, which should let + the kernel use some swap. If any still-running action group has a process in + uninterruptible I/O wait ("D" state), or if /proc/vmstat shows heavy swap-in, + the wrapper treats that as evidence that running jobs are stalling on memory + pressure and raises the effective threshold by 256 MiB, up to the 2x cap. Once + no running action is stalled, no wrapper-paused actions remain, and memory is + healthy, the threshold decays back toward the base in the same small steps. + This avoids guessing one static memory threshold while still preventing + pause/resume flapping. +- While any wrapper-paused action group is stopped, the progress parser records + a pause interval. 
Action-age decisions and forwarded Bazel action-duration + displays subtract pause overlap from Bazel's displayed action time and from + wall-clock aging, so an action that Bazel reports as 130s old after being + paused for 30s is treated and shown as roughly 100s of active runtime for the + display. Downscale timeout checks use the active runtime of real, non-paused + Bazel action process groups when that proves all active groups are old, but + Bazel's visible action-duration sample can also trigger the timeout path when + Bazel caps the displayed action list. Where possible, duration rewriting is + matched to the source files associated with the paused action groups so + running action durations continue to advance. Rewritten durations keep Bazel's + plain seconds format, for example "100s" rather than "1m40s". +- If pausing is not enough and the normal timeout/low memory condition is + reached, the existing restart/downscale fallback still applies. +- When the wrapper starts stopping Bazel for upscale/downscale, first resume + every action group paused by that attempt with SIGCONT and stop doing normal + throttle pause/resume checks for that attempt. Before any Bazel attempt ends, + including retry, Ctrl-C, exception, or final exit paths, run the same resume + sweep again. Remember the exact stopped PIDs so resume can still run if a + later process scan cannot reconstruct the sandbox group. Cleanup and process + termination run only after this resume sweep, so the wrapper does not leave + stopped build jobs behind. +- As soon as Bazel reports a killed or terminated action, resume all paused + action groups and stop normal pause/resume checks for that attempt. Bazel is + already winding down after an action death, and keeping stopped actions around + can make the server cancellation path look stuck. +- As soon as the wrapper observes Ctrl-C/SIGTERM, resume all paused action + groups and stop normal pause/resume checks while Bazel cancels. 
Bazel also + receives the user signal, but stopped action children must be resumed so the + server can finish cancelling the pending invocation. Bazel's own interrupt + text is treated as user cancellation only when the wrapper did not already + ask Bazel to stop for an adaptive up/downscale restart. +- Parse Bazel progress frames from both newline and carriage-return updates. + Track completed action counts, the currently reported number of running + actions, and the visible action durations. Action lines are detected by their + Bazel-style duration suffix, such as "Compiling x.cc; 27s + processwrapper-sandbox", "GoCompilePkg //pkg:go_default_library; 27s remote", + "Rustc //crate:lib; 27s linux-sandbox", or "ProtoCompile //api:v1; 27s + worker"; the action name, language, file extension, and execution backend are + not otherwise interpreted. Treat forms like "7 actions, 6 running", "6 + actions running", "1 action", and "no actions running" as running-action + counts. Track the "[done / total]" progress count when Bazel prints it. +- In non-PTY mode Bazel can print compact one-line progress updates like + "[9,890 / 10,553] Compiling x.cc; 2s processwrapper-sandbox ... (13 actions, + 12 running)" instead of a header followed by one line per running action. + Test builds may add a test-progress prefix, for example + "[8,190 / 8,369] 1 / 15 tests; Compiling x.cc; 15s processwrapper-sandbox + ... (13 actions, 12 running)". Treat the duration on such a progress-header + line as Bazel's summarized current-action age for timeout and cheap-upscale + decisions, while still using the running-action count and completed/total + counts from the same line. +- If memory pressure leaves Bazel output stuck mid-frame or mid-line, keep + forwarding the partial bytes but also account for them in monitoring instead + of waiting for CR/LF. A partial action line that already contains a duration + is counted immediately. 
Bazel can report N running actions while displaying + only a capped subset of their duration lines, so visible action durations are + treated as the available current-action sample rather than requiring all N + lines. Under the normal low-memory downscale gate, visible action durations + that are already over BAZEL_ADAPTIVE_BUILD_TIMEOUT can stand in for hidden + action lines. If there are no visible durations yet and Bazel output is stuck + mid-frame or mid-line, wall-clock time since the progress frame is used until + the frame has been stuck past the timeout. This same wall-clock aging applies + when Bazel printed action lines and then stopped updating output entirely; the + wrapper estimates the current action ages from the last displayed durations + instead of waiting for Bazel's next progress update. +- Swap should be configured to allow paused jobs to be swapped out on demand + and make memory available for the running builds. + +Downscale behavior: +- Downscale checks stay active for the whole run and take priority over any + pending upscale. Low memory alone does not interrupt a progressing build. +- If the latest progress frame reports at least one running action, action-age + evidence is over the action timeout, and the current MemAvailable is below + the effective low-memory threshold, gracefully interrupt Bazel and retry with + half as many jobs, rounded up. If the completed action count advanced within + the timeout window and the wrapper currently observes no running-action I/O + distress, defer this downscale because long action age alone is not failure + evidence while the build is still progressing. If action groups are already + wrapper-paused, use the higher pause-watch threshold for this gate, because + pausing has already proven that the current attempt is under memory pressure. + For example, 12 -> 6, 6 -> 3, 5 -> 3, 3 -> 2, and 2 -> 1. Before retrying, + wait until memory has recovered to at least half of total memory. 
+- If Bazel reports a killed or terminated action, exits with an abrupt server + failure, or exits while Bazel build processes are still dangling under the + output base, retry. If the recent rolling average memory is more than half of + total memory, retry with the same job count; otherwise downscale by half, + rounded up. Repeated killed/terminated action failures at the same job count + are retried after the standard restart-settle gate instead of downscaling, + because they can be stale Bazel/server fallout after an earlier interruption. + Cap those same-job retries with BAZEL_ADAPTIVE_SAME_JOB_RETRY_LIMIT, default + 10 per job count, so a deterministic failure cannot loop forever. When the + cap is exceeded, return the failing Bazel exit code. Keep the first same-job + retry cheap. If that retry then reports another killed/terminated action + before making meaningful Bazel action progress, treat that repeated restart + failure as evidence that Bazel/server state has not wound down yet: run bazel + shutdown, wait for known build children to disappear, add the normal settle + delay, and then retry again with the same jobs. +- If Bazel crashes with both "FATAL: bazel crashed due to an internal error" + and "Caused by: java.lang.InterruptedException", retry with the same job + count. This is deliberately narrow and is disabled when the wrapper itself is + handling Ctrl-C/SIGTERM or when the wrapper intentionally interrupted Bazel + for an adaptive restart. Retry this crash signature at most once per wrapper + run; if it repeats, return the failing Bazel exit code. +- If Bazel reports a build failure while the wrapper is stopping Bazel for an + upscale, cancel the upscale and return that failure after cleanup. +- Do not retry a downscale when jobs is already 1. + +Upscale behavior: +- The first upscale analysis interval starts only after running Bazel actions + have actually been observed, not at Bazel process start. 
The interval is 30 + seconds by default, with BAZEL_ADAPTIVE_UPSCALE_CHECK_INTERVAL available for + tests. +- After that warm-up interval, continuously watch on each monitoring loop for a + cheap and memory-safe upscale point. Restart with 1.5x jobs, rounded up and + capped at the initial maximum, at the first moment all of these are true: + * Bazel has reported meaningful work in this attempt, meaning the completed + action count has advanced at least once; + * the rolling 30-second average MemAvailable is more than half of MemTotal; + * the rolling 30-second minimum MemAvailable did not dip below the configured + low-memory threshold; + * no action groups are currently wrapper-paused; + * Bazel still has current running actions; + * Bazel's known remaining action count is more than twice the current --jobs + value, because a restart this close to the end usually costs more than it + saves; + * Bazel's known remaining action count is greater than the reported running + action count, because if all remaining actions are already running Bazel is + winding down; + * when Bazel reports completed/total counts, a decreased running-action count + is only treated as winding down near the end of the build. Normal mid-build + fluctuations such as 12 -> 11 -> 12 are not enough to block upscale when + many actions remain; + * at least one visible current running action duration is available, and + every visible current running action is less than 15 seconds old. Bazel may + report more running actions than it prints duration lines for, especially + when the progress UI is truncated; hidden durations do not by themselves + block upscale. +- Bazel has no drain-current-actions-then-exit mode for changing --jobs, so the + wrapper does not try to finish old actions before restarting. 
Instead, if the + memory condition is good but current actions are already 15+ seconds old, no + meaningful work has completed yet, no actions are running, no more than two + current job waves are known to remain, all known remaining actions are already + running, or the running count has decreased while Bazel is already near the + end of the build, keep a pending upscale and re-evaluate every monitoring + loop. If memory pressure increases while waiting, keep + postponing. When memory is still safe, work has advanced, and the current + action set becomes cheap to abandon, gracefully interrupt Bazel and restart at + the higher job count. Skipped-upscale diagnostics report both scheduled + upscale attempts and the continuous pending-watch re-evaluations, plus + separate skip counts for memory gates and job-runtime/action-state gates. + +Cleanup behavior: +- Before every retry and before final exit, refresh Bazel server/output-base + knowledge for this wrapper's process cgroup and sweep for dangling build + processes under those Bazel output bases. Terminate only processes in that + same cgroup domain; print individual process details only for jobs that could + not be killed. Escalate stalled Bazel shutdowns with bazel shutdown and + cgroup-scoped process-tree killing as needed. This keeps two simultaneous + Docker build instances on the same host from pausing or killing each other's + Bazel actions. +- Before every Bazel restart, including same-job retries, downscales, and + upscales, wait up to BAZEL_ADAPTIVE_RESTART_SETTLE_DELAY seconds for known + Bazel build child processes under the output base to disappear, then pause for + that same small delay. The default is 3 seconds. This gives the Bazel server + and processwrapper-sandbox children time to wind down and avoids immediate + restart churn. +- Discover Bazel output bases from same-cgroup Bazel server process command + lines and from abrupt-server log-file diagnostics. 
Treat same-cgroup + non-server processes as Bazel build children when their cwd is under a known + output base or their command line mentions that output base. This is + intentionally independent of action language and catches processwrapper, + clang, Go, Rust, proto, test runner, and other sandbox children the same way. +""" + +import errno +import fcntl +import os +import pty +import re +import selectors +import shutil +import signal +import struct +import subprocess +import sys +import termios +import time +from dataclasses import dataclass, field + + +DEFAULT_ACTION_TIMEOUT_SECONDS = 150 +BUILD_TIMEOUT_ENV = "BAZEL_ADAPTIVE_BUILD_TIMEOUT" + +JOBS_COMMANDS = frozenset({"build", "test", "run", "coverage", "fetch", "cquery", "aquery"}) + +DEFAULT_LOW_MEMORY_THRESHOLD_MB = 1024 +LOW_MEMORY_THRESHOLD_ENV = "BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB" + +ADAPTIVE_THRESHOLD_STEP_MB = 256 +ADAPTIVE_THRESHOLD_RAISE_COOLDOWN_SECONDS = 10.0 +ADAPTIVE_THRESHOLD_LOWER_COOLDOWN_SECONDS = 30.0 + +DEFAULT_IO_STALL_FLOOR_SECONDS = 3 +IO_STALL_FLOOR_SECONDS_ENV = "BAZEL_ADAPTIVE_IO_STALL_FLOOR_SECONDS" + +DEFAULT_IO_STALL_SWAP_RATE_MB_PER_SECOND = 32 +IO_STALL_SWAP_RATE_ENV = "BAZEL_ADAPTIVE_IO_STALL_SWAP_RATE_MB_PER_SECOND" + +DISPLAY_PAUSE_LABEL_GRACE_SECONDS = 60.0 +DISPLAY_PAUSE_LABEL_HISTORY_LIMIT = 512 + +DEFAULT_MEMORY_POLL_INTERVAL_SECONDS = 1.0 +MEMORY_POLL_INTERVAL_ENV = "BAZEL_ADAPTIVE_MEMORY_POLL_INTERVAL" +MIN_MEMORY_POLL_INTERVAL_SECONDS = 0.05 + +DEFAULT_UPSCALE_CHECK_INTERVAL_SECONDS = 30.0 +UPSCALE_CHECK_INTERVAL_ENV = "BAZEL_ADAPTIVE_UPSCALE_CHECK_INTERVAL" + +DEFAULT_SAME_JOB_RETRY_LIMIT = 10 +SAME_JOB_RETRY_LIMIT_ENV = "BAZEL_ADAPTIVE_SAME_JOB_RETRY_LIMIT" + +DEFAULT_RESTART_SETTLE_DELAY_SECONDS = 3.0 +RESTART_SETTLE_DELAY_ENV = "BAZEL_ADAPTIVE_RESTART_SETTLE_DELAY" + +DEFAULT_BAZEL_NICE = 5 +BAZEL_NICE_ENV = "BAZEL_ADAPTIVE_BAZEL_NICE" +MIN_BAZEL_NICE = 0 +MAX_BAZEL_NICE = 19 + +DEFAULT_MEMINFO_PATH = "/proc/meminfo" +MEMINFO_ENV = "BAZEL_ADAPTIVE_MEMINFO" 
DEFAULT_VMSTAT_PATH = "/proc/vmstat"
VMSTAT_ENV = "BAZEL_ADAPTIVE_VMSTAT"

# Monotonic reference point captured when the module is loaded.
WRAPPER_START_TIME = time.monotonic()
# Query the kernel clock tick rate; fall back to 100 when sysconf or the
# SC_CLK_TCK name is unavailable on this platform.
try:
    CLOCK_TICKS_PER_SECOND = os.sysconf(os.sysconf_names["SC_CLK_TCK"])
except (AttributeError, KeyError, OSError, ValueError):
    CLOCK_TICKS_PER_SECOND = 100
# Page size in KiB, never below 1; fall back to 4 KiB when sysconf cannot
# report SC_PAGE_SIZE.
try:
    PAGE_SIZE_KB = max(1, os.sysconf("SC_PAGE_SIZE") // 1024)
except (AttributeError, OSError, ValueError):
    PAGE_SIZE_KB = 4
DEFAULT_ENV_FLAG_VALUE = ""

FORCE_PTY_ENV = "BAZEL_ADAPTIVE_FORCE_PTY"
DISABLE_PTY_ENV = "BAZEL_ADAPTIVE_DISABLE_PTY"
DEFAULT_PTY_COLUMNS = ""
PTY_COLUMNS_ENV = "BAZEL_ADAPTIVE_PTY_COLUMNS"
DEFAULT_PTY_ROWS = ""
PTY_ROWS_ENV = "BAZEL_ADAPTIVE_PTY_ROWS"
DEFAULT_TERMINAL_COLUMNS = 240
DEFAULT_TERMINAL_ROWS = 24

BAZEL_ENV = "BAZEL"
DEFAULT_PATH = ""
PATH_ENV = "PATH"

UPSCALE_READY = "ready"
UPSCALE_PENDING = "pending"
UPSCALE_BLOCKED = "blocked"
SKIP_MEMORY = "memory"
SKIP_JOB_RUNTIME = "job-runtime"

RECENT_MEMORY_PRESSURE_SECONDS = 10.0
MEMORY_REPORT_SECONDS = 30.0
UPSCALE_MAX_ACTION_SECONDS = 15
UPSCALE_REMAINING_ACTION_FINISH_JOBS_MULTIPLIER = 2
RECENT_STALL_SECONDS = 30.0
TIMEOUT_DOWNSCALE_DEFER_REPORT_SECONDS = 30.0
DANGLING_PROCESS_TERM_WAIT_SECONDS = 3.0
DANGLING_PROCESS_KILL_WAIT_SECONDS = 1.0
BAZEL_SHUTDOWN_MIN_TIMEOUT_SECONDS = 1
BAZEL_SHUTDOWN_MAX_TIMEOUT_SECONDS = 30
RENICE_BUILD_CHILDREN_SECONDS = 2.0
THROTTLE_PAUSE_CHECK_SECONDS = 0.05
THROTTLE_IDLE_PAUSE_CHECK_SECONDS = 0.5
THROTTLE_RESUME_CHECK_SECONDS = 5.0
RESUME_IO_STALL_CLEAR_SECONDS = THROTTLE_RESUME_CHECK_SECONDS
RESUME_MEMORY_SETTLE_SECONDS = THROTTLE_RESUME_CHECK_SECONDS
IO_STALL_RECENT_OBSERVATION_SECONDS = 1.0
IO_STALL_MIN_OBSERVATIONS = 2

# Use: split Bazel output into progress frames. Bazel can update progress with
# either newline or carriage-return records; splitting on both lets the parser
# see each progress update independently.
# Example: "[1 / 4] 2 actions running\r[2 / 4] 1 action running\n"
# Extracted fields: none; this only separates records at "\r" and "\n".
# Breakage risk: low; it relies on terminal control characters, not Bazel text.
LINE_SEPARATOR_RE = re.compile(r"[\r\n]")

# Use: strip ANSI/VT100 escape sequences before matching Bazel text. This keeps
# color and cursor-control output from interfering with progress/failure parsing.
# Example: "\x1b[32mINFO: Build completed successfully\x1b[0m"
# Extracted fields: none; the whole escape sequence is removed.
# Breakage risk: low; this is the conventional CSI escape shape.
ANSI_RE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")

# Use: identify a Bazel progress-frame header before resetting current action
# durations and parsing progress/running-action state.
# Example: "[10,586 / 10,588] 13 / 15 tests; 2 actions running"
# Extracted fields: none here; PROGRESS_COUNT_RE extracts done/total separately.
# Breakage risk: medium; the "[done / total]" prefix is common but not a formal
# API, so a future Bazel progress UI redesign could require an update.
PROGRESS_HEADER_RE = re.compile(r"^\[[0-9,]+(?:\s*/\s*[0-9,]+)?\]")

# Use: extract Bazel's completed/total action counts from a progress header.
# The wrapper uses this to detect meaningful progress and to avoid upscaling
# when every known remaining action is already running.
# Example: "[10,586 / 10,588] 13 / 15 tests; 2 actions running"
# Extracted fields: done="10,586"; total="10,588". Commas are stripped before
# converting to integers.
# Breakage risk: medium; small whitespace changes are tolerated, but a different
# delimiter or ordering for Bazel progress counts would break it.
PROGRESS_COUNT_RE = re.compile(r"^\[(?P<done>[0-9,]+)(?:\s*/\s*(?P<total>[0-9,]+))?\]")

# Use: extract the number of currently running Bazel actions from a progress
# frame. This lets the wrapper track whether Bazel currently has active work,
# detect running-count decreases, and combine Bazel's count with the wrapper's
# paused-action count when rewriting display summaries. Bazel may cap the
# number of action-duration lines it prints, so duration samples are treated as
# visible evidence rather than a complete list of all running actions.
# Example: "[10,501 / 10,575] 1 / 15 tests; 7 actions, 6 running"
# Extracted fields: listed_actions="7" and listed_running="6" for
# "7 actions, 6 running"; actions_running="2" for "2 actions running";
# running="6" for "6 running"; action_only="1" for "1 action"; no named field
# is present for "no actions running", which maps to zero. The action_only form
# covers Bazel's near-finish output, for example
# "[10,420 / 10,421] 13 / 14 tests; 1 action; last test: //tests:foo".
# Breakage risk: medium-high; it tolerates singular/plural "action(s)" and
# Bazel omitting "running" for a lone action, but different wording such as
# "jobs running" or substantially reordered phrases would need a new pattern.
RUNNING_COUNT_RE = re.compile(
    r"\b(?:"
    r"(?P<listed_actions>[0-9]+)\s+actions?,\s*(?P<listed_running>[0-9]+)\s+running"
    r"|(?P<actions_running>[0-9]+)\s+actions?\s+running"
    r"|(?P<running>[0-9]+)\s+running"
    r"|(?P<action_only>[0-9]+)\s+actions?"
    r"|no\s+actions?\s+running"
    r")\b"
)

# Use: extract Bazel's displayed age for an action from any action line,
# independent of language, mnemonic, filename, or execution backend. These ages
# drive stall detection and the "only upscale when current jobs are cheap to
# abandon" rule.
# Example: "GoCompilePkg //proxylib:go_default_library; 2m13s remote"
# Extracted fields: duration="2m13s"; hours=None; minutes="2"; seconds="13".
# For "1h02m03s", hours="1", minutes="02", seconds="03".
# Breakage risk: medium; the action text is flexible, but the parser depends on
# Bazel continuing to print durations as "; <N>s" with h/m/s units.
ACTION_DURATION_RE = re.compile(
    r";\s*(?P<duration>(?:(?P<hours>[0-9]+)h)?(?:(?P<minutes>[0-9]+)m)?(?P<seconds>[0-9]+)s)\b"
)

# Use: resolve Bazel-style --jobs host expressions accepted by this wrapper.
# This is wrapper input syntax, not Bazel output. The extracted keyword selects
# CPU count or RAM in MiB, and the optional multiplier scales it.
# Example: "HOST_RAM*.0002"
# Extracted fields: keyword="HOST_RAM"; multiplier=".0002". For "HOST_CPUS",
# keyword="HOST_CPUS"; multiplier=None.
# Breakage risk: low; this is our own documented interface.
JOBS_KEYWORD_RE = re.compile(
    r"^(?P<keyword>HOST_CPUS|HOST_RAM)(?:\*(?P<multiplier>(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)))?$"
)

# Use: detect output indicating an action probably died due to memory pressure
# or termination. This makes a failed Bazel invocation eligible for a retry with
# fewer or same jobs depending on recent memory history.
# Example: "ERROR: ... failed: (Killed): clang failed"
# Extracted fields: none; any match sets saw_memory_kill=True.
# Breakage risk: medium-high; compiler, worker, or kernel messages vary across
# tools and platforms, so new OOM/termination wording may need to be added.
MEMORY_KILL_RE = re.compile(
    r"\b(?:Killed|Cannot allocate memory|OutOfMemoryError)\b|\(\s*Terminated\s*\)",
    re.IGNORECASE,
)

# Use: detect that Bazel reported an actual action failure while the wrapper was
# stopping Bazel for an upscale. In that case, the upscale is cancelled and the
# action failure is returned instead of retrying as if the interruption
# succeeded. Generic summaries such as "Target //x failed to build" are
# intentionally excluded because Bazel also prints them after the wrapper's own
# interrupt.
# Example: "ERROR: /tmp/example: Compiling example.cc failed: error executing CppCompile"
# Extracted fields: none; any match sets saw_build_failure=True.
# Breakage risk: medium-high; Bazel and rule-specific failure text is not a
# stable API, and localized or substantially reformatted errors could escape it.
BUILD_FAILURE_RE = re.compile(
    r"\bfailed:\s+(?:\(|error executing)",
    re.IGNORECASE,
)

# Use: detect abrupt Bazel server/client disconnects. These failures are treated
# as retry candidates because they often happen during severe memory pressure or
# server death.
# Example: "Server terminated abruptly (error code: 14, error message: 'Socket closed')"
# Extracted fields: none; any match sets saw_server_abrupt=True.
# Breakage risk: medium; these phrases have been observed in Bazel output, but
# future versions may change error wording or transport diagnostics.
SERVER_ABRUPT_RE = re.compile(
    r"\b(?:Server terminated abruptly|Socket closed|Connection reset by peer)\b", re.IGNORECASE
)

# Use: detect the first half of a narrow retryable Bazel crash signature. This
# is only retried when paired with JAVA_INTERRUPTED_RE and not caused by a user
# or wrapper interrupt.
# Example: "FATAL: bazel crashed due to an internal error. Printing stack trace:"
# Extracted fields: none; any match sets saw_internal_crash=True.
# Breakage risk: medium; this is Bazel's human crash text, so wording changes
# would require an update, but the phrase is intentionally specific.
INTERNAL_CRASH_RE = re.compile(
    r"\bFATAL:\s+bazel crashed due to an internal error\b",
    re.IGNORECASE,
)

# Use: detect the second half of the narrow retryable Bazel crash signature.
# InterruptedException alone is not enough to retry, because user interrupts can
# also involve interruption; it must be paired with INTERNAL_CRASH_RE.
# Example: "Caused by: java.lang.InterruptedException"
# Extracted fields: none; any match sets saw_java_interrupted=True.
# Breakage risk: low-medium; Java exception names are stable, but Bazel could
# stop printing causes in this exact text form.
JAVA_INTERRUPTED_RE = re.compile(
    r"\bCaused by:\s+java\.lang\.InterruptedException\b",
    re.IGNORECASE,
)

# Use: detect Bazel output proving that a user interrupt reached Bazel even if
# the wrapper process did not receive SIGINT because Bazel owned the foreground
# terminal process group. This makes the wrapper stop retrying and return 130.
# Example: "Bazel caught interrupt signal; cancelling pending invocation."
# Example: "ERROR: build interrupted"
# Extracted fields: none; any match sets saw_user_interrupt=True.
# Breakage risk: medium; this depends on Bazel's human interrupt text, but the
# alternatives are treating a user cancel as retryable killed actions.
BAZEL_USER_INTERRUPT_RE = re.compile(
    r"\b(?:Bazel caught interrupt signal|build interrupted)\b",
    re.IGNORECASE,
)

# Use: rewrite Bazel progress summaries in the forwarded output when the wrapper
# has paused action groups. This is display-only; parser state still sees the
# original Bazel text. It handles both one-line non-PTY summaries and regular
# progress headers.
# Example: "(12 actions, 11 running)" or "12 actions, 11 running"
# Extracted fields: actions="12", running="11" for the parenthesized form, and
# bare_actions="12", bare_running="11" for the bare form. Rewritten as
# "(12 actions, 10 paused, 1 running)" or "12 actions, 10 paused, 1 running"
# when paused_count is 10.
# Example: "12 actions running" or "(8 actions running)"
# Extracted fields: only_running="12" or parenthesized_only_running="8"; rewritten
# as "12 actions, 10 paused, 2 running" or
# "(8 actions, 6 paused, 2 running)" when paused_count is 10 or 6.
# Breakage risk: medium; it depends on Bazel's human progress summary wording,
# but failures only affect display prettiness, not adaptive decisions.
ACTION_SUMMARY_DISPLAY_RE = re.compile(
    r"\((?P<actions>[0-9]+) actions?, (?P<running>[0-9]+) running\)"
    r"|(?P<bare_actions>[0-9]+) actions?, (?P<bare_running>[0-9]+) running"
    r"|\((?P<parenthesized_only_running>[0-9]+) actions running\)"
    r"|(?P<only_running>[0-9]+) actions running"
)

# Use: extract Bazel's output base from abrupt-server log-file diagnostics. The
# cleanup code uses it to find and terminate dangling sandbox/build processes.
# Example: "log file: '/home/user/.cache/bazel/_bazel_user/hash/server/jvm.out'"
# Extracted fields: output_base="/home/user/.cache/bazel/_bazel_user/hash".
# Breakage risk: medium; it depends on single quotes and the server/jvm.out path
# suffix. Different quoting or log-file naming would require adjustment.
OUTPUT_BASE_LOG_RE = re.compile(r"log file: '(?P<output_base>[^']+)/server/jvm\.out'")

# Use: extract the Bazel sandbox action directory from a process cwd or command
# line. Processes in the same sandbox directory are treated as one action group
# for SIGSTOP/SIGCONT throttling, renicing, and cleanup reasoning.
# Example: "/home/user/.cache/bazel/_bazel_user/hash/sandbox/processwrapper-sandbox/17/execroot/ws"
# Extracted fields: sandbox_key="processwrapper-sandbox/17".
# Breakage risk: medium; Bazel has used this sandbox path shape for a long time,
# but a future sandbox layout change would require updating this extraction.
SANDBOX_ACTION_RE = re.compile(r"/sandbox/(?P<sandbox_key>[^/\s]+/[0-9]+)(?:/|\s|$)")

# Use: identify source-like path tokens from compiler command lines so paused
# sandbox process groups can be associated with the action labels Bazel prints.
# This makes displayed action durations pause only for the files whose process
# groups are actually SIGSTOPed.
# Example: ".../external/envoy/test/integration/http_integration.cc"
# Extracted fields: none; matching tokens are normalized into candidate labels.
# Breakage risk: low-medium; this is an extension allow-list, so uncommon source
# extensions may need to be added if they appear in Bazel action output.
SOURCE_FILE_RE = re.compile(
    r"\.(?:c|cc|cpp|cxx|c\+\+|C|m|mm|h|hh|hpp|hxx|inc|S|s|rs|go|proto)$"
)

# Module-level state, both starting out "unset".
# NOTE(review): presumably updated by process/signal handling outside this
# part of the file -- confirm against the code that assigns them.
ACTIVE_PROCESS = None
USER_TERMINATING = False


# Memory snapshot in KiB, as filled in by read_meminfo().
@dataclass
class MemInfo:
    total_kb: int
    available_kb: int


# Swap-in page counter, as filled in by read_swap_io() from "pswpin".
@dataclass
class SwapIo:
    pages_in: int


# Result of parse_bazel_args(): the untouched argument list plus everything
# needed to rewrite --jobs for each attempt.
@dataclass
class ParsedArgs:
    original_args: list[str]
    initial_jobs: int
    action_timeout: int
    # (kind, index) pairs; kind is "equals" for --jobs=N and "separate" for
    # "--jobs N", index is the position of the --jobs token in original_args.
    job_locations: list[tuple[str, int]]
    supports_jobs: bool


# Outcome of one Bazel invocation. Only exit_code is required; every other
# field defaults to "nothing notable happened".
@dataclass
class RunResult:
    exit_code: int
    restart: str | None = None
    upscale_skip_reason: str | None = None
    upscale_skip_count: int = 0
    upscale_reevaluation_count: int = 0
    upscale_memory_skip_count: int = 0
    upscale_job_runtime_skip_count: int = 0
    upscale_description: str | None = None
    failure_retry_same: bool = False
    failure_average_description: str | None = None
    retry_after_dangling_processes: bool = True
    internal_interrupted_crash: bool = False
    retryable_action_failure: bool = False
    meaningful_work_done: bool = False
    user_interrupted: bool = False


# Result of one upscale evaluation: a status plus an optional reason/category.
@dataclass
class UpscaleEvaluation:
    status: str
    reason: str | None
    skip_category: str | None


# Remember one displayed Bazel action duration and its estimated start time.
@dataclass
class ObservedActionDuration:
    displayed_seconds: int
    started_at: float | None
    observed_at: float | None


# A Bazel server process and, when known, its output base directory.
@dataclass
class BazelServer:
    pid: int
    output_base: str | None


# Snapshot of one process; the optional fields are None when unavailable.
@dataclass
class ProcessInfo:
    pid: int
    cmdline: str
    cwd: str | None
    nice: int | None
    started_at_ticks: int | None
    ppid: int | None = None
    state: str | None = None
    cgroups: tuple[tuple[str, str], ...] = field(default_factory=tuple)


# Count reported by a cleanup pass.
@dataclass
class CleanupResult:
    count: int


# Processes grouped under one key, with the labels and states seen for them.
# The set fields use default_factory so instances never share a set.
@dataclass
class ActionProcessGroup:
    key: str
    pids: list[int]
    started_at_ticks: int | None
    action_labels: set[str] = field(default_factory=set)
    states: set[str] = field(default_factory=set)


# Format the diagnostic prefix with elapsed seconds since wrapper start.
def diag_prefix() -> str:
    """Return "[bazel-adaptive/<N>s]", N being whole seconds since WRAPPER_START_TIME."""
    # Clamp at zero so the tag never shows a negative elapsed time.
    elapsed_seconds = max(0, int(time.monotonic() - WRAPPER_START_TIME))
    return f"[bazel-adaptive/{elapsed_seconds}s]"


# Emit wrapper diagnostics without corrupting Bazel's active TTY progress line.
def diag(message: str) -> None:
    """Write one prefixed diagnostic line to stderr and flush it."""
    try:
        stderr_is_tty = os.isatty(sys.stderr.fileno())
    except (AttributeError, OSError):
        # stderr is missing or has no usable file descriptor: treat as non-TTY.
        stderr_is_tty = False

    line = f"{diag_prefix()} {message}\n"
    if stderr_is_tty:
        # Carriage return + "erase line" clears whatever is on the current
        # terminal row before the diagnostic is printed.
        line = "\r\x1b[K" + line
    sys.stderr.write(line)
    sys.stderr.flush()


# Parse a positive integer option value, returning None for invalid input.
def positive_int(value: str) -> int | None:
    """Return *value* as a base-10 integer if it is > 0, otherwise None."""
    try:
        parsed = int(value, 10)
    except ValueError:
        return None
    return parsed if parsed > 0 else None


# Parse a Bazel --jobs value, including HOST_CPUS/HOST_RAM expressions.
def jobs_value(value: str) -> int | None:
    """Resolve a --jobs value to a concrete job count.

    Plain positive integers are returned as-is. HOST_CPUS / HOST_RAM
    expressions (optionally "*MULTIPLIER") are resolved against the CPU count
    or MemTotal in MiB and floored, but never below 1. Returns None when the
    value is not understood, the base cannot be determined, or the multiplier
    is not positive.
    """
    parsed = positive_int(value)
    if parsed is not None:
        return parsed

    match = JOBS_KEYWORD_RE.match(value)
    if not match:
        return None

    if match.group("keyword") == "HOST_CPUS":
        base = os.cpu_count() or 1
    else:
        try:
            # Resolve HOST_RAM job expressions against MemTotal in MiB.
            base = read_meminfo().total_kb // 1024
        except OSError:
            base = 0
    if base <= 0:
        return None

    multiplier = float(match.group("multiplier") or "1")
    if multiplier <= 0:
        return None
    return max(1, int(base * multiplier))


# Read the adaptive action timeout from the environment.
def build_timeout_from_env(env: dict[str, str] | None = None) -> int:
    """Return the action timeout in seconds.

    Args:
        env: Mapping to read instead of os.environ. An explicitly passed
            mapping is always honoured, even when it is empty.

    Raises:
        ValueError: the variable is set but is not a positive integer.
    """
    # "env or os.environ" would silently fall back to the real environment for
    # an empty mapping; only None means "use os.environ".
    source = os.environ if env is None else env
    value = source.get(BUILD_TIMEOUT_ENV)
    if value is None:
        return DEFAULT_ACTION_TIMEOUT_SECONDS

    timeout = positive_int(value)
    if timeout is None:
        raise ValueError(f"{BUILD_TIMEOUT_ENV} must be a positive integer number of seconds")
    return timeout


# Read the low-memory threshold from the environment in KiB.
def low_memory_threshold_kb(env: dict[str, str] | None = None) -> int:
    """Return the low-memory threshold in KiB (the variable is given in MiB).

    Args:
        env: Mapping to read instead of os.environ. An explicitly passed
            mapping is always honoured, even when it is empty.

    Raises:
        ValueError: the variable is set but is not a positive integer.
    """
    # Only None means "use os.environ"; an empty mapping must not fall back.
    source = os.environ if env is None else env
    value = source.get(LOW_MEMORY_THRESHOLD_ENV)
    if value is None:
        return DEFAULT_LOW_MEMORY_THRESHOLD_MB * 1024

    threshold_mb = positive_int(value)
    if threshold_mb is None:
        raise ValueError(f"{LOW_MEMORY_THRESHOLD_ENV} must be a positive integer number of MiB")
    return threshold_mb * 1024


# Read how long I/O stalls must persist before lowering the running-action floor.
def io_stall_floor_seconds(env: dict[str, str] | None = None) -> int:
    """Return the I/O-stall persistence window in seconds.

    Args:
        env: Mapping to read instead of os.environ (empty mappings honoured).

    Raises:
        ValueError: the variable is set but is not a positive integer.
    """
    source = os.environ if env is None else env
    value = source.get(IO_STALL_FLOOR_SECONDS_ENV)
    if value is None:
        return DEFAULT_IO_STALL_FLOOR_SECONDS

    seconds = positive_int(value)
    if seconds is None:
        raise ValueError(f"{IO_STALL_FLOOR_SECONDS_ENV} must be a positive integer")
    return seconds


def io_stall_swap_rate_kb_per_second(env: dict[str, str] | None = None) -> int:
    """Return the swap-rate setting converted from MiB/s to KiB/s.

    Args:
        env: Mapping to read instead of os.environ (empty mappings honoured).

    Raises:
        ValueError: the variable is set but is not a positive integer.
    """
    source = os.environ if env is None else env
    value = source.get(IO_STALL_SWAP_RATE_ENV)
    if value is None:
        return DEFAULT_IO_STALL_SWAP_RATE_MB_PER_SECOND * 1024

    rate_mb = positive_int(value)
    if rate_mb is None:
        raise ValueError(f"{IO_STALL_SWAP_RATE_ENV} must be a positive integer")
    return rate_mb * 1024


# Return the index of Bazel's "--" delimiter, or the end of args if absent.
def bazel_option_end(args: list[str]) -> int:
    """Return the index of the first "--" in *args*, or len(args) if none."""
    try:
        return args.index("--")
    except ValueError:
        return len(args)


def bazel_command_supports_jobs(args: list[str]) -> bool:
    """Return True when the first command word in *args* is in JOBS_COMMANDS.

    Only arguments before "--" are examined. An option written without "="
    might take its value as the next argument, so that next argument is
    skipped rather than being mistaken for the command.
    """
    end = bazel_option_end(args)
    skip_possible_option_value = False
    for arg in args[:end]:
        if skip_possible_option_value:
            skip_possible_option_value = False
            continue
        if arg.startswith("-"):
            # "--opt=value" is self-contained; "--opt" may consume the next arg.
            skip_possible_option_value = "=" not in arg
            continue
        # The first argument that is neither an option nor a possible option
        # value decides the answer.
        return arg in JOBS_COMMANDS
    return False


# Parse Bazel arguments enough to find the initial jobs cap.
def parse_bazel_args(args: list[str], action_timeout: int | None = None) -> ParsedArgs:
    """Scan *args* for --jobs and bundle the result into a ParsedArgs.

    Only arguments before "--" are scanned, and only when the command supports
    --jobs. Every "--jobs=V" / "--jobs V" occurrence is recorded, and the last
    one whose value resolves wins. Without a usable value the CPU count is
    used; without *action_timeout* the environment setting is used.
    """
    supports_jobs = bazel_command_supports_jobs(args)
    option_end = bazel_option_end(args)

    job_locations: list[tuple[str, int]] = []
    initial_jobs = None
    index = 0
    while supports_jobs and index < option_end:
        token = args[index]
        raw_value = None
        if token.startswith("--jobs="):
            job_locations.append(("equals", index))
            raw_value = token.split("=", 1)[1]
        elif token == "--jobs" and index + 1 < option_end:
            job_locations.append(("separate", index))
            raw_value = args[index + 1]
            # The value token is consumed; skip it on the next iteration.
            index += 1
        if raw_value is not None:
            resolved = jobs_value(raw_value)
            if resolved is not None:
                initial_jobs = resolved
        index += 1

    if initial_jobs is None:
        initial_jobs = os.cpu_count() or 1
    if action_timeout is None:
        action_timeout = build_timeout_from_env()

    return ParsedArgs(
        original_args=list(args),
        initial_jobs=initial_jobs,
        action_timeout=action_timeout,
        job_locations=job_locations,
        supports_jobs=supports_jobs,
    )


# Return Bazel args with this attempt's concrete --jobs value applied.
def bazel_args_with_jobs(parsed: ParsedArgs, jobs: int) -> list[str]:
    """Return a copy of the original args carrying ``--jobs`` = *jobs*.

    Commands that do not support --jobs are returned unchanged. Existing
    --jobs occurrences are rewritten in place; otherwise "--jobs=<jobs>" is
    inserted just before "--" (or appended when there is no "--").
    """
    rewritten = list(parsed.original_args)
    if not parsed.supports_jobs:
        return rewritten

    if not parsed.job_locations:
        cut = bazel_option_end(rewritten)
        return rewritten[:cut] + [f"--jobs={jobs}"] + rewritten[cut:]

    for kind, index in parsed.job_locations:
        if kind == "equals":
            rewritten[index] = f"--jobs={jobs}"
        elif kind == "separate" and index + 1 < len(rewritten):
            # For "--jobs N" the value lives in the following token.
            rewritten[index + 1] = str(jobs)
    return rewritten


# Extract Bazel's displayed action duration from a progress line.
def parse_duration_seconds(line: str) -> int | None:
    """Return the duration shown in *line* as whole seconds, or None if absent.

    Reads ACTION_DURATION_RE's "hours", "minutes" and "seconds" groups; a
    component that did not match counts as zero.
    """
    found = ACTION_DURATION_RE.search(line)
    if not found:
        return None
    hours, minutes, seconds = (
        int(found.group(unit) or 0) for unit in ("hours", "minutes", "seconds")
    )
    return hours * 3600 + minutes * 60 + seconds


# Track Bazel progress frames and failure hints from forwarded output.
# NOTE(review): this class was reconstructed from whitespace-collapsed source;
# the nesting at the places marked NOTE(review) below should be confirmed.
class ProgressFrameParser:
    def __init__(self) -> None:
        # Text received after the last line separator (an incomplete line).
        self._buffer = ""
        self._buffer_updated_at: float | None = None
        # Counters taken from the most recent progress header.
        self.running_count: int | None = None
        self.running_count_decreased = False
        self.completed_count: int | None = None
        self.completed_count_advanced_at: float | None = None
        self.total_count: int | None = None
        self.meaningful_work_done = False
        # Durations collected for the current frame; reset on every header.
        self.action_durations: list[ObservedActionDuration] = []
        self.current_frame_started_at: float | None = None
        self.current_frame_has_summary_duration = False
        # Global pause bookkeeping: the open pause (if any) and closed ones.
        self._pause_started_at: float | None = None
        self._pause_intervals: list[tuple[float, float]] = []
        # The same bookkeeping per action label.
        self._label_pause_started_at: dict[str, float] = {}
        self._label_pause_intervals: dict[str, list[tuple[float, float]]] = {}
        self._label_last_seen_at: dict[str, float] = {}
        self._live_action_labels: set[str] = set()
        # Sticky failure hints; once set they are never cleared here.
        self.saw_memory_kill = False
        self.saw_build_failure = False
        self.saw_server_abrupt = False
        self.saw_internal_crash = False
        self.saw_java_interrupted = False
        self.saw_user_interrupt = False
        self.output_bases: set[str] = set()

    def feed(self, text: str, now: float | None = None) -> None:
        """Append *text* and process every complete line it finishes.

        Whatever follows the last separator stays buffered for the next call.
        """
        if text:
            self._buffer_updated_at = now
        self._buffer += text
        *complete_lines, self._buffer = LINE_SEPARATOR_RE.split(self._buffer)
        for complete_line in complete_lines:
            self._process_line(complete_line, now)

    def _process_line(self, line: str, now: float | None = None) -> None:
        """Update failure hints and progress state from one complete line."""
        clean = ANSI_RE.sub("", line).rstrip()
        if not clean:
            return

        for pattern, flag_name in (
            (MEMORY_KILL_RE, "saw_memory_kill"),
            (BUILD_FAILURE_RE, "saw_build_failure"),
            (SERVER_ABRUPT_RE, "saw_server_abrupt"),
            (INTERNAL_CRASH_RE, "saw_internal_crash"),
            (JAVA_INTERRUPTED_RE, "saw_java_interrupted"),
            (BAZEL_USER_INTERRUPT_RE, "saw_user_interrupt"),
        ):
            if pattern.search(clean):
                setattr(self, flag_name, True)
        for log_match in OUTPUT_BASE_LOG_RE.finditer(clean):
            self.output_bases.add(os.path.realpath(log_match.group("output_base")))

        if not PROGRESS_HEADER_RE.match(clean):
            # Not a header: the line can only contribute an action duration.
            duration = parse_duration_seconds(clean)
            if duration is not None:
                self._remember_duration(duration, now)
            return

        # A header starts a new frame.
        self.current_frame_started_at = now
        self._update_progress_counts(clean, now)
        # NOTE(review): running-count tracking is assumed to run for every
        # header line, not only when PROGRESS_COUNT_RE matched -- confirm.
        self._update_running_count(clean)

        self.action_durations = []
        self.current_frame_has_summary_duration = False
        duration = parse_duration_seconds(clean)
        if duration is not None:
            self._remember_duration(duration, now)
            self.current_frame_has_summary_duration = True

    # Refresh completed/total counts from a header line, if it carries them.
    def _update_progress_counts(self, clean: str, now: float | None) -> None:
        progress_match = PROGRESS_COUNT_RE.match(clean)
        if not progress_match:
            return
        completed_count = int(progress_match.group("done").replace(",", ""))
        if self.completed_count is not None and completed_count > self.completed_count:
            self.meaningful_work_done = True
            self.completed_count_advanced_at = now
        self.completed_count = completed_count

        total = progress_match.group("total")
        self.total_count = None if total is None else int(total.replace(",", ""))

    # Refresh running_count from a header line and note which way it moved.
    def _update_running_count(self, clean: str) -> None:
        previous_running_count = self.running_count
        running_match = RUNNING_COUNT_RE.search(clean)
        if running_match:
            running_value = None
            # The first group that matched wins, in this order.
            for group_name in ("listed_running", "actions_running", "running", "action_only"):
                running_value = running_match.group(group_name)
                if running_value is not None:
                    break
            self.running_count = 0 if running_value is None else int(running_value)
        else:
            self.running_count = None

        if previous_running_count is None or self.running_count is None:
            return
        if self.running_count < previous_running_count:
            self.running_count_decreased = True
        elif self.running_count > previous_running_count:
            self.running_count_decreased = False

    # Store a displayed action duration with the wall-clock time it was seen.
    def _remember_duration(self, duration: int, now: float | None) -> None:
        # The start is estimated by subtracting the displayed duration.
        action_started_at = None if now is None else now - duration
        self.action_durations.append(
            ObservedActionDuration(
                displayed_seconds=duration,
                started_at=action_started_at,
                observed_at=now,
            )
        )

    # Start a pause interval while at least one action group is SIGSTOPped.
    def note_actions_paused(self, now: float) -> None:
        if self._pause_started_at is None:
            self._pause_started_at = now

    # End the current pause interval when all paused action groups are resumed.
    def note_actions_resumed(self, now: float) -> None:
        started = self._pause_started_at
        if started is None:
            return
        # max() keeps the interval well-formed if *now* precedes the start.
        self._pause_intervals.append((started, max(started, now)))
        self._pause_started_at = None

    # Track pause intervals for the concrete source labels attached to stopped actions.
    def note_paused_labels(self, labels: set[str], now: float) -> None:
        labels = {label for label in labels if label}
        open_labels = set(self._label_pause_started_at)

        # Labels no longer paused: close their open interval.
        for label in sorted(open_labels - labels):
            started = self._label_pause_started_at.pop(label)
            self._label_pause_intervals.setdefault(label, []).append(
                (started, max(started, now))
            )
            self._label_last_seen_at[label] = now

        # Newly paused labels: open an interval.
        for label in sorted(labels - open_labels):
            self._label_pause_started_at[label] = now
            self._label_last_seen_at[label] = now

        for label in labels:
            self._label_last_seen_at[label] = now
        self.prune_label_pause_history(now)

    def note_live_action_labels(self, labels: set[str], now: float) -> None:
        """Record the live labels and refresh last-seen for tracked ones."""
        self._live_action_labels = {label for label in labels if label}
        for label in self._live_action_labels & self.paused_duration_labels():
            self._label_last_seen_at[label] = now
        self.prune_label_pause_history(now)

    def note_display_label_seen(self, label: str, now: float) -> None:
        """Refresh last-seen for a non-empty *label* and prune old history."""
        if label:
            self._label_last_seen_at[label] = now
            # NOTE(review): the prune call is assumed to sit inside this "if";
            # the collapsed source does not show its nesting -- confirm.
            self.prune_label_pause_history(now)

    def paused_duration_labels(self) -> set[str]:
        """Return every label that has a closed or open pause interval."""
        return set(self._label_pause_intervals) | set(self._label_pause_started_at)

    def prune_label_pause_history(self, now: float) -> None:
        """Forget pause history for stale labels and cap the tracked count.

        Labels that are currently paused or live are never forgotten.
        """
        tracked = self.paused_duration_labels()
        if not tracked:
            return

        protected = set(self._label_pause_started_at) | self._live_action_labels
        forget_before = now - DISPLAY_PAUSE_LABEL_GRACE_SECONDS

        def last_seen(label: str) -> float:
            return self._label_last_seen_at.get(label, 0.0)

        forget_labels = [
            label
            for label in tracked
            if label not in protected and last_seen(label) < forget_before
        ]

        overflow = len(tracked) - len(forget_labels) - DISPLAY_PAUSE_LABEL_HISTORY_LIMIT
        if overflow > 0:
            # Still over the cap: drop the least recently seen unprotected labels.
            candidates = [
                label
                for label in tracked
                if label not in protected and label not in forget_labels
            ]
            candidates.sort(key=last_seen)
            forget_labels.extend(candidates[:overflow])

        for label in forget_labels:
            self._label_pause_intervals.pop(label, None)
            self._label_last_seen_at.pop(label, None)

    # Return how much of one wall-clock span was spent with actions paused.
    def _paused_overlap_seconds(self, start: float, end: float) -> float:
        if end <= start:
            return 0.0

        paused_seconds = 0.0
        for pause_start, pause_end in self._pause_intervals:
            overlap = min(end, pause_end) - max(start, pause_start)
            if overlap > 0:
                paused_seconds += overlap

        # A pause that is still open extends to the end of the span.
        if self._pause_started_at is not None:
            open_overlap = end - max(start, self._pause_started_at)
            if open_overlap > 0:
                paused_seconds += open_overlap

        return paused_seconds

    def _label_paused_overlap_seconds(self, label: str, start: float, end: float) -> float:
        """Per-label variant of _paused_overlap_seconds."""
        if end <= start:
            return 0.0

        paused_seconds = 0.0
        for pause_start, pause_end in self._label_pause_intervals.get(label, []):
            overlap = min(end, pause_end) - max(start, pause_start)
            if overlap > 0:
                paused_seconds += overlap

        open_start = self._label_pause_started_at.get(label)
        if open_start is not None:
            open_overlap = end - max(start, open_start)
            if open_overlap > 0:
                paused_seconds += open_overlap

        return paused_seconds

    # Return wall-clock elapsed time with wrapper-induced paused time removed.
    def _active_elapsed_seconds(self, start: float, end: float) -> float:
        return max(0.0, (end - start) - self._paused_overlap_seconds(start, end))

    def _active_elapsed_seconds_for_label(self, label: str, start: float, end: float) -> float:
        """Per-label variant of _active_elapsed_seconds."""
        paused = self._label_paused_overlap_seconds(label, start, end)
        return max(0.0, (end - start) - paused)

    # Return action durations aged by active time, optionally including a partial line.
    def _effective_durations(
        self,
        now: float | None = None,
        include_partial: bool = False,
    ) -> list[float]:
        durations = []
        for action in self.action_durations:
            end = action.observed_at if now is None else now
            if end is None or action.started_at is None:
                # No timestamps to age with: fall back to the displayed value.
                durations.append(float(action.displayed_seconds))
            else:
                durations.append(self._active_elapsed_seconds(action.started_at, end))

        if not (include_partial and self._buffer):
            return durations

        # The unterminated buffered line may already show a duration.
        partial_duration = parse_duration_seconds(ANSI_RE.sub("", self._buffer))
        if partial_duration is None:
            return durations
        if now is None or self._buffer_updated_at is None:
            durations.append(float(partial_duration))
        else:
            partial_action_started_at = self._buffer_updated_at - partial_duration
            durations.append(self._active_elapsed_seconds(partial_action_started_at, now))
        return durations

    def all_reported_actions_over(self, limit_seconds: int, now: float | None = None) -> bool:
        """Return True when every known running action exceeds *limit_seconds*.

        With no durations at all, a frame that reports running actions, has
        buffered text and has been active longer than the limit also counts.
        """
        durations = self._effective_durations(now, include_partial=True)

        if not durations:
            frame_has_stalled = (
                now is not None
                and self.current_frame_started_at is not None
                and self._active_elapsed_seconds(self.current_frame_started_at, now)
                > limit_seconds
            )
            return (
                self.running_count is not None
                and self.running_count > 0
                and bool(self._buffer)
                and frame_has_stalled
            )

        if self.running_count == 0:
            return False

        if self.running_count is None or len(durations) < self.running_count:
            # Fewer durations than running actions: judge what is visible.
            return all(duration > limit_seconds for duration in durations)

        return all(duration > limit_seconds for duration in durations[: self.running_count])

    def all_displayed_actions_over(self, limit_seconds: int, now: float | None = None) -> bool:
        """Return True when there are durations and all considered exceed the limit."""
        durations = self._effective_durations(now, include_partial=True)
        if not durations or self.running_count == 0:
            return False
        if self.running_count is not None:
            durations = durations[: self.running_count]
        return all(duration > limit_seconds for duration in durations)

    def completed_progress_recent(self, now: float, window_seconds: float) -> bool:
        """Return True when the completed count advanced within the window."""
        return (
            self.completed_count_advanced_at is not None
            and now - self.completed_count_advanced_at <= window_seconds
        )

    def has_running_actions(self) -> bool:
        """Use running_count when known, otherwise the presence of durations."""
        if self.running_count is None:
            return bool(self.action_durations)
        return self.running_count > 0

    def current_action_durations(self, now: float | None = None) -> list[float]:
        """Return aged durations for complete lines, capped at running_count."""
        durations = self._effective_durations(now, include_partial=False)
        if self.running_count is None:
            return durations
        return durations[: self.running_count]

    def upscale_action_skip_reason(
        self,
        max_action_seconds: int,
        remaining_action_finish_threshold: int,
        now: float | None = None,
    ) -> str | None:
        """Return why an upscale should be skipped, or None when none applies."""
        if not self.meaningful_work_done:
            return "completed action count has not advanced in this Bazel attempt"
        if not self.has_running_actions():
            return "no actions are currently running; letting Bazel finish"

        remaining_count = None
        if self.completed_count is not None and self.total_count is not None:
            remaining_count = max(0, self.total_count - self.completed_count)
            if remaining_count <= remaining_action_finish_threshold:
                return (
                    f"only {remaining_count} action(s) remain; "
                    f"need more than {remaining_action_finish_threshold} before upscale"
                )
            if self.running_count is not None and remaining_count == self.running_count:
                return (
                    f"only {remaining_count} action(s) remain and "
                    f"{self.running_count} action(s) are running; letting Bazel finish"
                )

        # A falling running count is ignored while plenty of work remains.
        plenty_of_work_remains = (
            remaining_count is not None
            and self.running_count is not None
            and remaining_count > max(self.running_count * 2, self.running_count + 8)
        )
        if self.running_count_decreased and not plenty_of_work_remains:
            return "running action count is decreasing; letting Bazel finish"

        durations = self.current_action_durations(now)
        if not durations:
            return "current running action durations are unavailable"
        oldest_action_seconds = max(durations)
        if oldest_action_seconds >= max_action_seconds:
            oldest_display_seconds = int(oldest_action_seconds)
            return (
                f"oldest current running action is {oldest_display_seconds}s; "
                f"need all current actions under {max_action_seconds}s before upscale"
            )
        return None

    def current_action_age_description(self, now: float | None = None) -> str:
        """Return a short description of the oldest current action's age."""
        if not self.has_running_actions():
            return "no current running actions"
        durations = self.current_action_durations(now)
        if not durations:
            return "current action ages unavailable"
        return f"oldest current action {int(max(durations))}s"


# Read total and available memory from /proc/meminfo or a test override.
def read_meminfo(path: str | None = None) -> MemInfo:
    """Parse a /proc/meminfo-style file into a MemInfo.

    The file is *path* when given, otherwise the MEMINFO_ENV override or
    DEFAULT_MEMINFO_PATH. Only "Name: <integer> ..." lines are used; missing
    keys read as 0 and MemAvailable falls back to MemFree. Opening the file
    may raise OSError.
    """
    meminfo_path = path or os.environ.get(MEMINFO_ENV, DEFAULT_MEMINFO_PATH)
    values: dict[str, int] = {}
    with open(meminfo_path, encoding="utf-8") as meminfo:
        for line in meminfo:
            fields = line.split()
            if len(fields) < 2 or not fields[0].endswith(":"):
                continue
            try:
                values[fields[0][:-1]] = int(fields[1])
            except ValueError:
                continue
    available_kb = values.get("MemAvailable", values.get("MemFree", 0))
    return MemInfo(total_kb=values.get("MemTotal", 0), available_kb=available_kb)


def read_swap_io(path: str | None = None) -> SwapIo:
    """Read the "pswpin" counter from a /proc/vmstat-style file.

    The file is *path* when given, otherwise the VMSTAT_ENV override or
    DEFAULT_VMSTAT_PATH. Only two-field "pswpin <integer>" lines count; the
    last valid one wins and 0 is used when there is none.
    """
    vmstat_path = path or os.environ.get(VMSTAT_ENV, DEFAULT_VMSTAT_PATH)
    pages_in = 0
    with open(vmstat_path, encoding="utf-8") as vmstat:
        for line in vmstat:
            fields = line.split()
            if len(fields) != 2 or fields[0] != "pswpin":
                continue
            try:
                pages_in = int(fields[1])
            except ValueError:
                continue
    return SwapIo(pages_in=pages_in)


# Increase jobs by 1.5x, rounded up and capped at the initial maximum.
def upscale_jobs(jobs: int, max_jobs: int) -> int:
    # (jobs * 3 + 1) // 2 is ceil(1.5 * jobs); always step up by at least one.
    one_and_a_half = (jobs * 3 + 1) // 2
    stepped_up = max(jobs + 1, one_and_a_half)
    return min(max_jobs, stepped_up)


# Reduce jobs by half, rounded up so odd counts are not cut too sharply.
def downscale_jobs(jobs: int) -> int:
    halved = (jobs + 1) // 2
    return max(1, halved)


# Read the memory polling interval, clamped away from a busy loop.
def memory_poll_interval() -> float:
    """Return the poll interval in seconds, using the default when unset or unparsable."""
    value = os.environ.get(MEMORY_POLL_INTERVAL_ENV)
    if value is None:
        return DEFAULT_MEMORY_POLL_INTERVAL_SECONDS
    try:
        requested = float(value)
    except ValueError:
        return DEFAULT_MEMORY_POLL_INTERVAL_SECONDS
    return max(MIN_MEMORY_POLL_INTERVAL_SECONDS, requested)


# Limit repeated same-job retries for action failures that happen with healthy memory.
def same_job_retry_limit() -> int:
    """Return the same-job retry limit; unset or invalid values give the default."""
    value = os.environ.get(SAME_JOB_RETRY_LIMIT_ENV)
    parsed = None if value is None else positive_int(value)
    return DEFAULT_SAME_JOB_RETRY_LIMIT if parsed is None else parsed


# Read the short settling delay used before every Bazel restart.
def restart_settle_delay() -> float:
    """Return the delay in seconds, never negative; default when unset or unparsable."""
    value = os.environ.get(RESTART_SETTLE_DELAY_ENV)
    if value is None:
        return DEFAULT_RESTART_SETTLE_DELAY_SECONDS
    try:
        requested = float(value)
    except ValueError:
        return DEFAULT_RESTART_SETTLE_DELAY_SECONDS
    return max(0.0, requested)


# Read the nice increment inherited by Bazel's long-running build process tree.
def bazel_nice_increment() -> int:
    """Return the increment clamped to MIN_BAZEL_NICE..MAX_BAZEL_NICE.

    Unset or non-integer values give DEFAULT_BAZEL_NICE unclamped.
    """
    value = os.environ.get(BAZEL_NICE_ENV)
    if value is None:
        return DEFAULT_BAZEL_NICE
    try:
        requested = int(value, 10)
    except ValueError:
        return DEFAULT_BAZEL_NICE
    at_least_min = max(MIN_BAZEL_NICE, requested)
    return min(MAX_BAZEL_NICE, at_least_min)


# Put Bazel in its own process group and lower only Bazel's scheduling priority.
def prepare_bazel_child() -> None:
    """Call os.setpgrp(), then os.nice() with the configured increment.

    An increment of 0 skips os.nice(); an OSError from os.nice() is ignored
    on purpose, so renicing is best-effort.
    """
    os.setpgrp()
    nice_increment = bazel_nice_increment()
    if nice_increment != 0:
        try:
            os.nice(nice_increment)
        except OSError:
            pass


# Maintain the rolling memory window used by retry and upscale decisions.
class MemoryPressureMonitor:
    def __init__(self, poll_interval: float | None = None) -> None:
        if poll_interval is None:
            poll_interval = memory_poll_interval()
        self.poll_interval = poll_interval
        self.next_poll = 0.0
        # Most recent successful reading.
        self.last: MemInfo | None = None
        # Most recent reading below the low-memory threshold, and when.
        self.last_low_at: float | None = None
        self.last_low: MemInfo | None = None
        # (sampled_at, reading) pairs kept for MEMORY_REPORT_SECONDS.
        self.samples: list[tuple[float, MemInfo]] = []

    def sample(self, now: float, force: bool = False) -> MemInfo | None:
        """Take a reading if one is due (or *force*), and return the latest.

        Between polls, and when the read fails, the previous reading is
        returned, which may be None.
        """
        if now < self.next_poll and not force:
            return self.last
        self.next_poll = now + self.poll_interval
        try:
            meminfo = read_meminfo()
        except OSError as error:
            diag(f"could not read memory information: {error}")
            return self.last

        self.last = meminfo
        self.samples.append((now, meminfo))
        # Drop readings that have aged out of the report window.
        self.samples = [
            entry for entry in self.samples if now - entry[0] <= MEMORY_REPORT_SECONDS
        ]
        if meminfo.available_kb < low_memory_threshold_kb():
            self.last_low_at = now
            self.last_low = meminfo
        return meminfo

    def recent_samples(self, now: float) -> list[MemInfo]:
        """Return readings taken within MEMORY_REPORT_SECONDS of *now*."""
        return [
            reading
            for sampled_at, reading in self.samples
            if now - sampled_at <= MEMORY_REPORT_SECONDS
        ]

    def recent_average_available_kb(self, now: float) -> int | None:
        """Return the floor-averaged available KiB, or None without readings."""
        window = self.recent_samples(now)
        if not window:
            return None
        return sum(reading.available_kb for reading in window) // len(window)

    def recent_min_available_kb(self, now: float) -> int | None:
        """Return the lowest available KiB in the window, or None without readings."""
        window = self.recent_samples(now)
        if not window:
            return None
        return min(reading.available_kb for reading in window)

    def recent_total_kb(self, now: float) -> int:
        """Return total KiB from the newest windowed reading, else the last one, else 0."""
        window = self.recent_samples(now)
        if window:
            return window[-1].total_kb
        return 0 if self.last is None else self.last.total_kb

    def upscale_skip_reason(
        self,
        now: float,
        running_actions_seconds: float | None,
        required_running_actions_seconds: float,
    ) -> str | None:
        """Return why memory/runtime data forbids an upscale, or None if it allows one.

        Checks run in this order: data available, no dip below the low-memory
        threshold, actions observed long enough, average available memory
        above half of total.
        """
        average_available_kb = self.recent_average_available_kb(now)
        total_kb = self.recent_total_kb(now)
        if average_available_kb is None or total_kb <= 0:
            return "memory data is unavailable"

        recent_min_available_kb = self.recent_min_available_kb(now)
        threshold_kb = low_memory_threshold_kb()
        threshold_mb = threshold_kb // 1024
        if recent_min_available_kb is not None and recent_min_available_kb < threshold_kb:
            return (
                f"memory dipped below low-memory threshold in last "
                f"{int(MEMORY_REPORT_SECONDS)}s: min "
                f"{recent_min_available_kb // 1024} MiB < {threshold_mb} MiB"
            )

        if running_actions_seconds is None:
            return (
                "running Bazel actions have not been observed yet; need "
                f"{int(required_running_actions_seconds)}s before upscale"
            )
        if running_actions_seconds < required_running_actions_seconds:
            return (
                f"running Bazel actions observed for {int(running_actions_seconds)}s; "
                f"need {int(required_running_actions_seconds)}s before upscale"
            )

        # total_kb is known to be positive here, so only the ratio matters.
        if average_available_kb * 2 <= total_kb:
            return (
                "average available memory over last "
                f"{int(MEMORY_REPORT_SECONDS)}s is {average_available_kb // 1024} MiB "
                f"of {total_kb // 1024} MiB; need more than {total_kb // 2048} MiB"
            )
        return None

    def upscale_ready_description(self, now: float, running_actions_seconds: float | None) -> str:
        """Describe memory and runtime figures; unknown values are shown as 0."""
        average_available_kb = self.recent_average_available_kb(now) or 0
        recent_min_available_kb = self.recent_min_available_kb(now) or 0
        total_kb = self.recent_total_kb(now)
        latest_available_kb = 0 if self.last is None else self.last.available_kb
        running_seconds = int(running_actions_seconds or 0)
        return (
            f"memory latest {latest_available_kb // 1024}/{total_kb // 1024} MiB; "
            f"{int(MEMORY_REPORT_SECONDS)}s average {average_available_kb // 1024} MiB; "
            f"min {recent_min_available_kb // 1024} MiB; "
            f"running actions observed for {running_seconds}s"
        )

    def retry_same_jobs_after_failure(self, now: float) -> bool:
        """Return True when average available memory exceeds half of total."""
        average_available_kb = self.recent_average_available_kb(now)
        total_kb = self.recent_total_kb(now)
        if average_available_kb is None:
            return False
        return total_kb > 0 and average_available_kb * 2 > total_kb

    def failure_average_description(self, now: float) -> str:
        """Describe the windowed average against total memory, in MiB."""
        average_available_kb = self.recent_average_available_kb(now) or 0
        total_kb = self.recent_total_kb(now)
        return (
            f"{int(MEMORY_REPORT_SECONDS)}s average memory "
            f"{average_available_kb // 1024}/{total_kb // 1024} MiB"
        )

    def recent_low_memory(self, now: float) -> bool:
        """Return True when a low reading fell within RECENT_MEMORY_PRESSURE_SECONDS."""
        if self.last_low_at is None:
            return False
        return now - self.last_low_at <= RECENT_MEMORY_PRESSURE_SECONDS

    def recent_low_memory_description(self) -> str:
        """Describe the last low reading, or use a generic phrase when there is none."""
        if self.last_low is None:
            return "recent low memory"
        return f"{self.last_low.available_kb // 1024} MiB available"

    def failure_report(self, now: float) -> str:
        """Return a one-line memory summary for failure diagnostics."""
        if self.last is None:
            return "memory pressure: unavailable"

        window = self.recent_samples(now)
        # With an empty window, both figures fall back to the latest reading.
        recent_min = min(window, key=lambda reading: reading.available_kb, default=self.last)
        recent_average = self.recent_average_available_kb(now) or self.last.available_kb
        recent_low = "yes" if self.recent_low_memory(now) else "no"
        threshold_mb = low_memory_threshold_kb() // 1024
        return (
            "memory pressure: "
            f"latest {self.last.available_kb // 1024}/{self.last.total_kb // 1024} MiB; "
            f"{int(MEMORY_REPORT_SECONDS)}s average {recent_average // 1024} MiB; "
            f"min {recent_min.available_kb // 1024} MiB; "
            f"less than low-memory threshold {threshold_mb} MiB: {recent_low}"
        )


# Temporarily hand the foreground terminal to Bazel for natural Ctrl-C handling.
class TerminalForeground:
    """Move the terminal's foreground process group to Bazel and back.

    The helper is a no-op unless stdin is a TTY. `give_to` hands the
    foreground to another process group and `restore` returns it to the
    process group the wrapper had when this object was created.
    """

    def __init__(self) -> None:
        self.enabled = False
        self.fd = -1
        self.wrapper_pgid = os.getpgrp()
        self._given = False

        try:
            fd = sys.stdin.fileno()
        except (AttributeError, OSError, ValueError):
            # stdin may be None (AttributeError), lack a descriptor (OSError),
            # or already be closed (ValueError); all mean "no terminal".
            return
        if os.isatty(fd):
            self.enabled = True
            self.fd = fd

    def give_to(self, pgid: int) -> None:
        """Make `pgid` the foreground process group of the terminal."""
        if not self.enabled:
            return
        self._set_foreground(pgid)
        self._given = True

    def restore(self) -> None:
        """Return the foreground to the wrapper if it was handed away."""
        if not self.enabled or not self._given:
            return
        self._set_foreground(self.wrapper_pgid)
        self._given = False

    def _set_foreground(self, pgid: int) -> None:
        # SIGTTOU is ignored for the duration of the tcsetpgrp() call.
        old_handler = signal.signal(signal.SIGTTOU, signal.SIG_IGN)
        try:
            os.tcsetpgrp(self.fd, pgid)
        except OSError:
            # Best effort: the terminal may be gone or the group may have exited.
            pass
        finally:
            # signal.signal() returns None when the previous handler was not
            # installed from Python; passing None back would raise TypeError.
            if old_handler is not None:
                signal.signal(signal.SIGTTOU, old_handler)


# Parse boolean environment flags used for PTY behavior.
def env_flag(name: str) -> bool:
    """Return True when the variable is set to 1/true/yes/on.

    Matching ignores case and surrounding whitespace. An unset variable
    falls back to DEFAULT_ENV_FLAG_VALUE.
    """
    value = os.environ.get(name)
    if value is None:
        value = DEFAULT_ENV_FLAG_VALUE
    return value.strip().lower() in {"1", "true", "yes", "on"}


# Convert signal-style negative subprocess return codes to shell exit codes.
def normalize_returncode(returncode: int) -> int:
    """Map a negative return code -N to 128 + N; pass others through."""
    if returncode < 0:
        return 128 + abs(returncode)
    return returncode


# Rewrite displayed Bazel action counts to include wrapper-paused action groups.
def rewrite_action_duration_display(
    text: str,
    parser: ProgressFrameParser,
    now: float,
    paused_count: int,
    paused_labels: set[str] | None = None,
) -> str:
    """Replace displayed action durations with wrapper-adjusted durations.

    Every ACTION_DURATION_RE match is a candidate. A match whose display line
    mentions a known paused label is rewritten using that label; otherwise at
    most `paused_count` matches are rewritten, and only when no labels are
    known. `text` is returned unchanged when nothing is paused.
    """
    labels = parser.paused_duration_labels() | (paused_labels or set())
    if paused_count <= 0 and not labels:
        return text
    rewritten_count = 0

    def line_for_match(match: re.Match) -> str:
        # The display line is bounded by the nearest CR or LF on either side.
        head, tail = match.start(), match.end()
        begin = max(text.rfind("\n", 0, head), text.rfind("\r", 0, head)) + 1
        breaks = [
            position
            for position in (text.find("\n", tail), text.find("\r", tail))
            if position != -1
        ]
        finish = min(breaks) if breaks else len(text)
        return text[begin:finish]

    def matching_label(match: re.Match) -> str | None:
        if not labels:
            return None
        line = ANSI_RE.sub("", line_for_match(match)).replace("\\", "/")
        # Longest labels are tried first.
        longest_first = sorted(labels, key=len, reverse=True)
        return next((label for label in longest_first if label in line), None)

    def replace(match: re.Match) -> str:
        nonlocal rewritten_count
        original = match.group(0)
        duration = parse_duration_seconds(original)
        if duration is None:
            return original
        started_at = now - duration

        label = matching_label(match)
        if label is None:
            # Unlabelled rewrites are limited to paused_count and disabled
            # entirely once any label is known.
            if paused_count <= 0 or labels or rewritten_count >= paused_count:
                return original
            elapsed = parser._active_elapsed_seconds(started_at, now)
        else:
            parser.note_display_label_seen(label, now)
            if label in parser.paused_duration_labels():
                elapsed = parser._active_elapsed_seconds_for_label(label, started_at, now)
            else:
                elapsed = parser._active_elapsed_seconds(started_at, now)

        rewritten_count += 1
        return f"; {max(0, int(elapsed))}s"

    return ACTION_DURATION_RE.sub(replace, text)


def rewrite_action_summary_display(text: str, paused_count: int) -> str:
    """Split Bazel's "N running" summaries into paused and running counts.

    `text` is returned unchanged when `paused_count` is not positive.
    """
    if paused_count <= 0:
        return text

    def split_actions(actions: int, running: int, parenthesized: bool) -> str:
        # Never report more paused actions than Bazel reports as running.
        paused = min(paused_count, running)
        noun = "action" if actions == 1 else "actions"
        summary = f"{actions} {noun}, {paused} paused, {running - paused} running"
        return f"({summary})" if parenthesized else summary

    def replace(match: re.Match) -> str:
        running = match.group("running")
        if running is not None:
            return split_actions(int(match.group("actions")), int(running), True)

        bare = match.group("bare_running")
        if bare is not None:
            return split_actions(int(match.group("bare_actions")), int(bare), False)

        # Only a running count was shown, so it doubles as the action total.
        in_parens = match.group("parenthesized_only_running")
        only = int(in_parens or match.group("only_running"))
        return split_actions(only, only, in_parens is not None)

    return ACTION_SUMMARY_DISPLAY_RE.sub(replace, text)


# Remember where a selected stream should be forwarded and how to close it.
class StreamTarget:
    """Pair an output object with an optional callback to run at EOF."""

    def __init__(self, output, close_on_eof=None) -> None:
        self.output = output
        self.close_on_eof = close_on_eof

    def close(self) -> None:
        """Run the EOF callback, if any, ignoring OSError from it."""
        callback = self.close_on_eof
        if callback is not None:
            try:
                callback()
            except OSError:
                pass


# Choose PTY or pipe output wiring and register it with the selector.
class BazelOutput:
    """Own the PTY or pipe plumbing used to capture Bazel's output.

    A PTY is used when stdout and stderr are both terminals or when
    FORCE_PTY_ENV is set, unless DISABLE_PTY_ENV is set. Otherwise Bazel is
    started with plain pipes.
    """

    def __init__(self) -> None:
        self.master_fd: int | None = None
        self.slave_fd: int | None = None
        self.use_pty = False

        if env_flag(DISABLE_PTY_ENV):
            return

        stdout_fd = None
        stderr_fd = None
        try:
            stdout_fd = sys.stdout.fileno()
            stderr_fd = sys.stderr.fileno()
        except (AttributeError, OSError, ValueError):
            # Missing, descriptor-less, or closed streams mean "no terminal".
            pass

        has_terminal = (
            stdout_fd is not None
            and stderr_fd is not None
            and os.isatty(stdout_fd)
            and os.isatty(stderr_fd)
        )
        if has_terminal or env_flag(FORCE_PTY_ENV):
            self.master_fd, self.slave_fd = pty.openpty()
            rows = positive_int(os.environ.get(PTY_ROWS_ENV, DEFAULT_PTY_ROWS))
            columns = positive_int(os.environ.get(PTY_COLUMNS_ENV, DEFAULT_PTY_COLUMNS))
            if rows is None or columns is None:
                fd_rows = fd_columns = 0
                if stdout_fd is not None:
                    try:
                        # Prefer the real terminal size so Bazel progress frames are not truncated.
                        data = fcntl.ioctl(
                            stdout_fd,
                            termios.TIOCGWINSZ,
                            struct.pack("HHHH", 0, 0, 0, 0),
                        )
                        fd_rows, fd_columns, _, _ = struct.unpack("HHHH", data)
                    except OSError:
                        pass
                if fd_rows <= 0 or fd_columns <= 0:
                    fallback = shutil.get_terminal_size(
                        fallback=(DEFAULT_TERMINAL_COLUMNS, DEFAULT_TERMINAL_ROWS)
                    )
                    fd_rows, fd_columns = fallback.lines, fallback.columns
                rows = rows or fd_rows
                columns = columns or fd_columns
            try:
                # Apply the chosen PTY size before Bazel starts writing progress frames.
                window_size = struct.pack("HHHH", rows, columns, 0, 0)
                fcntl.ioctl(self.slave_fd, termios.TIOCSWINSZ, window_size)
            except (OSError, struct.error):
                # struct.error: a size above 65535 does not fit the "H" fields.
                # Sizing is best effort; the PTY keeps its default size.
                pass
            self.use_pty = True

    def popen_kwargs(self) -> dict:
        """Return the stdout/stderr keyword arguments for subprocess.Popen."""
        if self.use_pty:
            return {"stdout": self.slave_fd, "stderr": self.slave_fd}
        return {"stdout": subprocess.PIPE, "stderr": subprocess.PIPE}

    def parent_after_spawn(self) -> None:
        """Close the parent's copy of the PTY slave once the child has it."""
        if self.use_pty and self.slave_fd is not None:
            os.close(self.slave_fd)
            self.slave_fd = None

    def register(self, selector: selectors.DefaultSelector, process: subprocess.Popen) -> None:
        """Register the PTY master, or the process pipes, for reading."""
        if self.use_pty and self.master_fd is not None:
            selector.register(
                self.master_fd,
                selectors.EVENT_READ,
                StreamTarget(sys.stdout.buffer, self.close_master),
            )
            return
        if process.stdout is not None:
            selector.register(process.stdout, selectors.EVENT_READ, StreamTarget(sys.stdout.buffer))
        if process.stderr is not None:
            selector.register(process.stderr, selectors.EVENT_READ, StreamTarget(sys.stderr.buffer))

    def close(self) -> None:
        """Close both PTY ends that are still open; safe to call repeatedly."""
        if self.slave_fd is not None:
            try:
                os.close(self.slave_fd)
            except OSError:
                pass
            self.slave_fd = None
        self.close_master()

    def close_master(self) -> None:
        """Close the PTY master if it is still open."""
        if self.master_fd is None:
            return
        try:
            os.close(self.master_fd)
        except OSError:
            pass
        self.master_fd = None


def decode_stream_chunk(target, data: bytes, final: bool = False) -> str:
    """Decode one chunk of a stream as UTF-8, keeping state between chunks.

    An incremental decoder is stored on `target` so that a multi-byte
    character split across two reads is decoded once its remaining bytes
    arrive instead of being dropped. Invalid bytes are ignored.
    """
    import codecs

    decoder = getattr(target, "utf8_decoder", None)
    if decoder is None:
        decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
        try:
            target.utf8_decoder = decoder
        except AttributeError:
            # The target cannot hold state; this chunk is decoded on its own.
            pass
    return decoder.decode(data, final)


# Drain ready Bazel output streams while forwarding bytes and parsing a copy.
def drain_ready_streams(
    selector: selectors.DefaultSelector,
    parser: ProgressFrameParser,
    timeout: float,
    paused_count=None,
    paused_labels=None,
) -> int:
    """Forward whatever the ready streams have to offer.

    `paused_count` and `paused_labels` are optional zero-argument callables
    supplying the current pause state for display rewriting. Streams that
    reach EOF are unregistered and closed.

    Returns the number of bytes read, so callers can tell whether progress
    was made.
    """
    forwarded = 0
    for key, _ in selector.select(timeout):
        fd = key.fileobj if isinstance(key.fileobj, int) else key.fileobj.fileno()
        try:
            # Read what is available without blocking; PTY EIO means EOF.
            data = os.read(fd, 65536)
        except BlockingIOError:
            # Nothing to read right now is not EOF: keep the stream registered.
            continue
        except OSError as error:
            if error.errno != errno.EIO:
                raise
            data = b""
        if data:
            forwarded += len(data)
            target = key.data
            # Forward bytes immediately so partial lines never wait for CR/LF.
            text = decode_stream_chunk(target, data)
            if not text:
                # Only part of a multi-byte character has arrived so far.
                continue
            now = time.monotonic()
            current_paused_count = paused_count() if paused_count is not None else 0
            displayed = rewrite_action_duration_display(
                text,
                parser,
                now,
                current_paused_count,
                paused_labels() if paused_labels is not None else None,
            )
            displayed = rewrite_action_summary_display(displayed, current_paused_count)
            target.output.write(displayed.encode("utf-8"))
            target.output.flush()
            parser.feed(text, now)
            continue
        # EOF: stop watching the stream and release it.
        try:
            selector.unregister(key.fileobj)
        except KeyError:
            pass
        if not isinstance(key.fileobj, int):
            key.fileobj.close()
        key.data.close()
    return forwarded


# Drain any buffered Bazel output after the process exits.
def drain_remaining_streams(
    selector: selectors.DefaultSelector,
    parser: ProgressFrameParser,
    paused_count=None,
    paused_labels=None,
    max_seconds: float = 2.0,
) -> None:
    """Keep draining until no stream yields data or reaches EOF.

    The loop continues while a pass either read bytes or closed a stream, so
    output larger than one read is not cut short. `max_seconds` bounds the
    drain in case another process keeps writing to an inherited stream.
    """
    deadline = time.monotonic() + max_seconds
    while selector.get_map():
        before = len(selector.get_map())
        forwarded = drain_ready_streams(selector, parser, 0, paused_count, paused_labels)
        closed_any = len(selector.get_map()) != before
        if not forwarded and not closed_any:
            break
        if time.monotonic() >= deadline:
            break


# Read process command line and cwd from /proc, tolerating races with exit.
def process_info(pid: int) -> ProcessInfo | None:
    """Collect command line, stat fields, cwd and cgroups for `pid`.

    Returns None when the command line cannot be read. Every other field
    that cannot be read or parsed is left as None.
    """
    try:
        with open(f"/proc/{pid}/cmdline", "rb") as cmdline:
            command = cmdline.read().replace(b"\0", b" ").decode("utf-8", errors="ignore")
    except OSError:
        return None
    nice: int | None = None
    started_at_ticks: int | None = None
    ppid: int | None = None
    state: str | None = None
    try:
        with open(f"/proc/{pid}/stat", encoding="utf-8") as stat:
            # Split after the last ")" so a command name containing spaces
            # or parentheses cannot shift the field positions.
            fields = stat.read().rsplit(")", 1)[1].strip().split()
        state = fields[0]
        ppid = int(fields[1])
        nice = int(fields[16])
        started_at_ticks = int(fields[19])
    except (OSError, IndexError, ValueError):
        # Keep the stat-derived fields all-or-nothing.
        ppid = None
        state = None
        nice = None
        started_at_ticks = None
    try:
        cwd = os.path.realpath(os.readlink(f"/proc/{pid}/cwd"))
    except OSError:
        cwd = None
    cgroups = process_cgroups(pid)
    return ProcessInfo(
        pid=pid,
        cmdline=command,
        cwd=cwd,
        nice=nice,
        started_at_ticks=started_at_ticks,
        ppid=ppid,
        state=state,
        cgroups=cgroups,
    )


def process_cgroups(pid: int) -> tuple[tuple[str, str], ...]:
    """Return sorted (controllers, path) pairs from /proc/<pid>/cgroup.

    Returns an empty tuple when the file cannot be read or decoded.
    """
    try:
        with open(f"/proc/{pid}/cgroup", encoding="utf-8") as cgroup_file:
            entries = []
            for line in cgroup_file:
                fields = line.rstrip("\n").split(":", 2)
                if len(fields) == 3:
                    entries.append((fields[1], fields[2]))
            return tuple(sorted(entries))
    except (OSError, ValueError):
        # ValueError covers UnicodeDecodeError from undecodable bytes.
        return ()


def useful_cgroup_paths(cgroups: tuple[tuple[str, str], ...]) -> tuple[tuple[str, str], ...]:
    """Drop entries whose path is empty or the root "/"."""
    return tuple((controllers, path) for controllers, path in cgroups if path not in {"", "/"})


def same_process_domain(
    owner_cgroups: tuple[tuple[str, str], ...],
    candidate_cgroups: tuple[tuple[str, str], ...],
) -> bool:
    """Return True when the candidate sits in or below an owner cgroup.

    An owner without any useful (non-root) cgroup path matches everything.
    A candidate without cgroup data matches nothing in that case.
    """
    owner_paths = useful_cgroup_paths(owner_cgroups)
    if not owner_paths:
        return True
    if not candidate_cgroups:
        return False

    candidate_by_controller = dict(candidate_cgroups)
    for controllers, owner_path in owner_paths:
        candidate_path = candidate_by_controller.get(controllers)
        if candidate_path is None:
            continue
        # The trailing "/" keeps "/a/build2" from matching owner "/a/build".
        if candidate_path == owner_path or candidate_path.startswith(owner_path.rstrip("/") + "/"):
            return True
    return False


def process_swap_kb(pid: int) -> int:
    """Return the VmSwap value from /proc/<pid>/status, or 0 if unavailable."""
    try:
        with open(f"/proc/{pid}/status", encoding="utf-8") as status:
            for line in status:
                if line.startswith("VmSwap:"):
                    fields = line.split()
                    if len(fields) >= 2:
                        return int(fields[1])
    except (OSError, ValueError):
        return 0
    return 0


# List current Linux process ids from /proc.
def proc_pids() -> list[int]:
    """Return every all-digit entry name in /proc as an int."""
    return [int(name) for name in os.listdir("/proc") if name.isdigit()]


# Check whether a candidate process is still alive.
def pid_exists(pid: int) -> bool:
    """Probe `pid` with signal 0; a permission error still means it exists."""
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        return True
    return True


# Track workspace, process domain, and Bazel output bases for cleanup.
class BuildContext:
    """Workspace path, owning cgroups and known Bazel output bases."""

    def __init__(
        self,
        workspace: str,
        cgroups: tuple[tuple[str, str], ...] | None = None,
    ) -> None:
        self.workspace = os.path.realpath(workspace)
        # Default to the cgroups of the current process.
        self.cgroups = cgroups if cgroups is not None else process_cgroups(os.getpid())
        self.output_bases: set[str] = set()

    def add_output_base(self, output_base: str | None) -> None:
        """Record a resolved output base; None and "" are ignored."""
        if output_base:
            self.output_bases.add(os.path.realpath(output_base))

    def add_output_bases(self, output_bases: set[str]) -> None:
        """Record every output base in the given set."""
        for output_base in output_bases:
            self.add_output_base(output_base)

    def owns_process(self, info: ProcessInfo) -> bool:
        """Return True when the process is in this context's cgroup domain."""
        return same_process_domain(self.cgroups, info.cgroups)

    def refresh_from_bazel_servers(self) -> None:
        """Add the output bases of Bazel servers found for this workspace."""
        for server in bazel_servers_for_workspace(self.workspace, self.cgroups):
            self.add_output_base(server.output_base)


def cmdline_has_argument(cmdline: str, argument: str) -> bool:
    """Return True when `argument` occurs in `cmdline` as a whole argument.

    `cmdline` is the space-joined form produced by process_info(). A plain
    substring test would let "--flag=/w/proj" match "--flag=/w/proj2", so the
    occurrence must be delimited by a space or a string boundary on both
    sides.
    """
    start = cmdline.find(argument)
    while start != -1:
        end = start + len(argument)
        starts_clean = start == 0 or cmdline[start - 1] == " "
        ends_clean = end == len(cmdline) or cmdline[end] == " "
        if starts_clean and ends_clean:
            return True
        start = cmdline.find(argument, start + 1)
    return False


# Find Bazel server processes that belong to this workspace.
def bazel_servers_for_workspace(
    workspace: str,
    owner_cgroups: tuple[tuple[str, str], ...] = (),
) -> list[BazelServer]:
    """Return Bazel servers whose --workspace_directory is this workspace.

    Only processes in the owner's cgroup domain are considered. The
    workspace is matched both as given and as a resolved path.
    """
    wanted_arguments = [
        f"--workspace_directory={candidate}"
        for candidate in {workspace, os.path.realpath(workspace)}
    ]
    servers: list[BazelServer] = []
    for pid in proc_pids():
        info = process_info(pid)
        if info is None:
            continue
        if not same_process_domain(owner_cgroups, info.cgroups):
            continue
        if "A-server.jar" not in info.cmdline and "bazel(" not in info.cmdline:
            continue
        # Whole-argument match, so a sibling workspace sharing this path as a
        # prefix is not mistaken for ours.
        if not any(cmdline_has_argument(info.cmdline, wanted) for wanted in wanted_arguments):
            continue
        output_base = None
        for field in info.cmdline.split():
            if field.startswith("--output_base="):
                # Capture Bazel's output base so cleanup can find sandbox children.
                output_base = os.path.realpath(field.split("=", 1)[1])
                break
        servers.append(BazelServer(pid=pid, output_base=output_base))
    return servers


# Find leftover Bazel sandbox/build processes tied to this workspace.
def dangling_build_processes(context: BuildContext) -> list[ProcessInfo]:
    """Return build processes tied to the context's output bases.

    A process matches when its cwd lies under an output base or its command
    line mentions one. Descendants of matching processes are included too.
    Bazel servers and clients, and the wrapper itself, are excluded.
    """
    context.refresh_from_bazel_servers()
    if not context.output_bases:
        return []

    own_pid = os.getpid()
    candidates: dict[int, ProcessInfo] = {}
    for pid in proc_pids():
        if pid == own_pid:
            continue
        info = process_info(pid)
        if info is None or not info.cmdline:
            continue
        if not context.owns_process(info):
            continue
        if (
            "A-server.jar" in info.cmdline
            or "bazel(" in info.cmdline
            or os.path.basename(info.cmdline.split(" ", 1)[0] or "") == "bazel"
        ):
            continue
        candidates[pid] = info

    # Resolve each output base once rather than once per process.
    resolved_bases = [
        (output_base, os.path.realpath(output_base))
        for output_base in context.output_bases
    ]
    matched_pids: set[int] = set()
    for info in candidates.values():
        real_cwd = os.path.realpath(info.cwd) if info.cwd is not None else None
        for output_base, real_base in resolved_bases:
            try:
                # Match processes whose cwd or command ties them to this Bazel output base.
                cwd_under_output_base = (
                    real_cwd is not None
                    and os.path.commonpath([real_cwd, real_base]) == real_base
                )
            except ValueError:
                cwd_under_output_base = False
            if cwd_under_output_base or output_base in info.cmdline:
                matched_pids.add(info.pid)
                break

    children_by_parent: dict[int, list[int]] = {}
    for info in candidates.values():
        if info.ppid is not None:
            children_by_parent.setdefault(info.ppid, []).append(info.pid)

    # Walk from the matched processes down through their descendants.
    processes: list[ProcessInfo] = []
    seen: set[int] = set()
    stack = sorted(matched_pids)
    while stack:
        pid = stack.pop()
        if pid in seen:
            continue
        seen.add(pid)
        info = candidates.get(pid)
        if info is None:
            continue
        processes.append(info)
        stack.extend(children_by_parent.get(pid, []))
    return processes


def source_labels_from_process(info: ProcessInfo, context: BuildContext) -> set[str]:
    """Derive path-like labels from source-file tokens on a command line.

    Each token matching SOURCE_FILE_RE contributes its normalized form, its
    workspace-relative form when it is an absolute path inside the
    workspace, and its execroot-relative forms when it contains
    "/execroot/". Only labels containing "/" are returned.
    """
    execroot_marker = "/execroot/"
    labels: set[str] = set()
    for raw_token in info.cmdline.split():
        token = raw_token.strip("'\"")
        if not SOURCE_FILE_RE.search(token):
            continue
        normalized = token.replace("\\", "/")
        labels.add(normalized)
        try:
            real_token = os.path.realpath(token)
            if os.path.isabs(token) and os.path.commonpath(
                [real_token, context.workspace]
            ) == context.workspace:
                labels.add(os.path.relpath(real_token, context.workspace).replace("\\", "/"))
        except (OSError, ValueError):
            pass

        if execroot_marker in normalized:
            after_execroot = normalized.split(execroot_marker, 1)[1]
            # Drop the first component after "/execroot/".
            parts = after_execroot.split("/", 1)
            if len(parts) == 2:
                execroot_relative = parts[1]
                labels.add(execroot_relative)
                # "external/<name>/<path>" also yields the bare "<path>".
                external_parts = execroot_relative.split("/", 2)
                if (
                    len(external_parts) == 3
                    and external_parts[0] == "external"
                ):
                    labels.add(external_parts[2])

    return {label for label in labels if "/" in label or label.startswith("//")}


# Group build processes by Bazel sandbox action directory when possible.
def build_process_groups(context: BuildContext) -> list[ActionProcessGroup]:
    """Group the context's build processes into action process groups.

    A process belongs to the sandbox key found in its own cwd or command
    line, or else to the key of its nearest ancestor that has one. Processes
    without any such key form a single-process group keyed "pid:<pid>".
    """
    members = dangling_build_processes(context)
    by_pid = {member.pid: member for member in members}

    def sandbox_key(info: ProcessInfo) -> str | None:
        # The cwd is searched before the command line.
        for haystack in (info.cwd or "", info.cmdline):
            found = SANDBOX_ACTION_RE.search(haystack)
            if found:
                return found.group("sandbox_key")
        return None

    direct_keys: dict[int, str] = {}
    for member in members:
        own_key = sandbox_key(member)
        if own_key is not None:
            direct_keys[member.pid] = own_key

    def inherited_key(info: ProcessInfo) -> str:
        # Climb the parent chain; `visited` guards against ppid cycles.
        visited: set[int] = set()
        cursor: ProcessInfo | None = info
        while cursor is not None and cursor.pid not in visited:
            visited.add(cursor.pid)
            own_key = direct_keys.get(cursor.pid)
            if own_key is not None:
                return own_key
            if cursor.ppid is None:
                break
            cursor = by_pid.get(cursor.ppid)
        return f"pid:{info.pid}"

    grouped: dict[str, list[ProcessInfo]] = {}
    for member in members:
        grouped.setdefault(inherited_key(member), []).append(member)

    def as_group(key: str, group_members: list[ProcessInfo]) -> ActionProcessGroup:
        start_ticks = [
            member.started_at_ticks
            for member in group_members
            if member.started_at_ticks is not None
        ]
        labels: set[str] = set()
        for member in group_members:
            labels.update(source_labels_from_process(member, context))
        known_states = {member.state for member in group_members if member.state is not None}
        return ActionProcessGroup(
            key=key,
            pids=sorted(member.pid for member in group_members),
            # The group's start is that of its earliest known member.
            started_at_ticks=min(start_ticks) if start_ticks else None,
            action_labels=labels,
            states=known_states,
        )

    return [as_group(key, group_members) for key, group_members in grouped.items()]


# Pause and resume Bazel action process groups to reduce memory pressure.
class ActionThrottler:
    # Set up empty pause bookkeeping and the adaptive threshold state.
    def __init__(self, context: BuildContext) -> None:
        self.context = context
        # Keys of groups paused by this throttler, plus per-key bookkeeping.
        self.paused_keys: set[str] = set()
        self.paused_pids: dict[str, set[int]] = {}
        self.paused_action_labels: dict[str, set[str]] = {}
        # Start time of each ongoing pause, and total time of finished pauses.
        self.paused_started_at: dict[str, float] = {}
        self.paused_total_seconds: dict[str, float] = {}
        self.current_action_labels: set[str] = set()
        # The effective threshold starts at the base and is capped at twice it.
        self.base_threshold_kb = low_memory_threshold_kb()
        self.effective_threshold_kb = self.base_threshold_kb
        self.max_threshold_kb = self.base_threshold_kb * 2
        self.threshold_step_kb = ADAPTIVE_THRESHOLD_STEP_MB * 1024
        self.next_threshold_raise_at = 0.0
        self.next_threshold_lower_at = 0.0
        self.max_observed_action_groups = 0
        self.io_stall_floor_seconds = io_stall_floor_seconds()
        self.io_stall_swap_rate_kb_per_second = io_stall_swap_rate_kb_per_second()
        self.io_stall_started_at: float | None = None
        # (time, stalled) observations kept for a sliding window.
        self.io_stall_observations: list[tuple[float, bool]] = []
        self.current_io_stall_observed = False
        self.io_stall_floor_groups: int | None = None
        self.next_io_stall_floor_drop_at: float | None = None
        self.last_running_io_stall_at: float | None = None
        self.last_swap_io_sample: tuple[float, SwapIo] | None = None
        self.last_swap_io_rate_kb_per_second = 0.0
        self.next_normal_resume_at = 0.0

    # Number of groups currently recorded as paused.
    def paused_count(self) -> int:
        return len(self.paused_keys)

    # Union of the labels recorded for every paused group.
    def paused_labels(self) -> set[str]:
        labels: set[str] = set()
        for key in self.paused_keys:
            labels.update(self.paused_action_labels.get(key, set()))
        return labels

    # True when the group reports states and all of them are "T".
    def group_is_physically_stopped(self, group: ActionProcessGroup) -> bool:
        return bool(group.states) and group.states <= {"T"}

    # A group counts as paused if this throttler paused it or it is stopped.
    def group_is_paused(self, group: ActionProcessGroup) -> bool:
        return group.key in self.paused_keys or self.group_is_physically_stopped(group)

    # Groups keyed "pid:..." are not treated as action groups.
    def group_is_action(self, group: ActionProcessGroup) -> bool:
        return not group.key.startswith("pid:")

    # Action groups that are not paused.
    def running_action_groups(
        self,
        groups: list[ActionProcessGroup],
    ) -> list[ActionProcessGroup]:
        return [
            group
            for group in groups
            if self.group_is_action(group) and not self.group_is_paused(group)
        ]

    # Running action groups with at least one process in state "D".
    def stalled_running_groups(
        self,
        groups: list[ActionProcessGroup],
    ) -> list[ActionProcessGroup]:
        return [group for group in self.running_action_groups(groups) if "D" in group.states]

    # Sample swap I/O and compare the swap-in rate since the previous sample
    # with the configured rate. Returns False when the sample cannot be read,
    # on the first sample, or when no time has elapsed.
    def swap_io_is_heavy(self, now: float) -> bool:
        try:
            current = read_swap_io()
        except OSError:
            return False

        previous = self.last_swap_io_sample
        # The new sample is stored before any early return below.
        self.last_swap_io_sample = (now, current)
        if previous is None:
            return False

        previous_at, previous_sample = previous
        elapsed = now - previous_at
        if elapsed <= 0:
            return False

        # Clamp at zero in case the counter went backwards.
        pages_in = max(0, current.pages_in - previous_sample.pages_in)
        self.last_swap_io_rate_kb_per_second = pages_in * PAGE_SIZE_KB / elapsed
        return self.last_swap_io_rate_kb_per_second >= self.io_stall_swap_rate_kb_per_second

    # Describe the stall: stalled groups if any, otherwise the swap-in rate.
    def io_stall_reason(self, stalled_running: list[ActionProcessGroup]) -> str:
        if stalled_running:
            return (
                f"{len(stalled_running)} running action group(s) "
                "in uninterruptible I/O"
            )
        return (
            "swap-in at "
            f"{int(self.last_swap_io_rate_kb_per_second // 1024)} MiB/s"
        )

    # Record one stall observation and maintain the start of the stall period.
    def record_io_stall_observation(self, now: float, stalled: bool) -> None:
        self.current_io_stall_observed = stalled
        self.io_stall_observations.append((now, stalled))
        # Keep only observations from the last two stall-floor periods.
        cutoff = now - self.io_stall_floor_seconds * 2
        self.io_stall_observations = [
            observation
            for observation in self.io_stall_observations
            if observation[0] >= cutoff
        ]
        if stalled:
            self.last_running_io_stall_at = now
            if self.io_stall_started_at is None:
                self.io_stall_started_at = now
        elif not self.recent_io_stall_observed(now):
            # The stall period ends only once no stall was seen recently.
            self.io_stall_started_at = None

    # True when the last stall is no older than
    # IO_STALL_RECENT_OBSERVATION_SECONDS.
    def recent_io_stall_observed(self, now: float) -> bool:
        return (
            self.last_running_io_stall_at is not None
            and now - self.last_running_io_stall_at <= IO_STALL_RECENT_OBSERVATION_SECONDS
        )

    # True when the stall period has lasted io_stall_floor_seconds, the latest
    # observation was stalled, and the window holds at least
    # IO_STALL_MIN_OBSERVATIONS stalled observations.
    def sustained_io_stall_observed(self, now: float) -> bool:
        if self.io_stall_started_at is None:
            return False
        if now - self.io_stall_started_at < self.io_stall_floor_seconds:
            return False
        if not self.current_io_stall_observed:
            return False
        stalled_observations = sum(
            1 for _observed_at, stalled in self.io_stall_observations if stalled
        )
        return stalled_observations >= IO_STALL_MIN_OBSERVATIONS

    # Seconds since the group's start, minus time spent paused by this
    # throttler. Returns None when the start time is unknown.
    # NOTE(review): assumes `now` is on the same clock as
    # started_at_ticks / CLOCK_TICKS_PER_SECOND; confirm against callers.
    def group_active_elapsed_seconds(
        self,
        group: ActionProcessGroup,
        now: float,
    ) -> float | None:
        if group.started_at_ticks is None:
            return None
        started_at = group.started_at_ticks / CLOCK_TICKS_PER_SECOND
        paused_seconds = self.paused_total_seconds.get(group.key, 0.0)
        pause_started_at = self.paused_started_at.get(group.key)
        # Include the pause that is still in progress.
        if pause_started_at is not None and now > pause_started_at:
            paused_seconds += now - pause_started_at
        return max(0.0, now - started_at - paused_seconds)

    # True when every running action group has been active longer than the
    # limit. Returns None when nothing is running or a duration is unknown.
    def all_running_action_groups_over(self, limit_seconds: int, now: float) -> bool | None:
        groups = self.refresh_paused_groups(build_process_groups(self.context))
        running_groups = self.running_action_groups(groups)
        if not running_groups:
            return None
        durations: list[float] = []
        for group in running_groups:
            duration = self.group_active_elapsed_seconds(group, now)
            if duration is None:
                return None
            durations.append(duration)
        return all(duration > limit_seconds for duration in durations)

    # React to a memory sample: try to resume when groups are paused and
    # memory is above the threshold, otherwise consider pausing.
    def update(self, meminfo: MemInfo | None) -> None:
        if meminfo is None:
            return
        if self.paused_count() > 0 and meminfo.available_kb > self.low_memory_threshold_kb():
            # Fall back to the pause check when nothing was resumed.
            if not self.resume_if_needed(meminfo):
                self.pause_if_needed(meminfo)
        else:
            self.pause_if_needed(meminfo)

    # The current (adaptive) low-memory threshold.
    def low_memory_threshold_kb(self) -> int:
        return self.effective_threshold_kb

    # Twice the low-memory threshold.
    def pause_watch_threshold_kb(self) -> int:
        return self.low_memory_threshold_kb() * 2

    # The pause-watch threshold while groups are paused, otherwise the
    # low-memory threshold.
    def downscale_memory_threshold_kb(self) -> int:
        if self.paused_count() > 0:
            return self.pause_watch_threshold_kb()
        return self.low_memory_threshold_kb()

    # Raise the threshold while I/O stalls are observed; lower it again, one
    # step at a time, once stalls are gone and memory has recovered.
    def maybe_adapt_threshold(self, groups: list[ActionProcessGroup], meminfo: MemInfo) -> None:
        now = time.monotonic()
        stalled_running = self.stalled_running_groups(groups)
        swap_io_stalled = self.swap_io_is_heavy(now)
        io_stalled = bool(stalled_running) or swap_io_stalled
        self.record_io_stall_observation(now, io_stalled)
        if io_stalled:
            self.maybe_lower_io_stall_floor(groups, now)
            if (
                self.effective_threshold_kb < self.max_threshold_kb
                and now >= self.next_threshold_raise_at
            ):
                old_mb = self.effective_threshold_kb // 1024
                self.effective_threshold_kb = min(
                    self.max_threshold_kb,
                    self.effective_threshold_kb + self.threshold_step_kb,
                )
                # A raise also pushes back the next possible lowering.
                self.next_threshold_raise_at = (
                    now + ADAPTIVE_THRESHOLD_RAISE_COOLDOWN_SECONDS
                )
                self.next_threshold_lower_at = (
                    now + ADAPTIVE_THRESHOLD_LOWER_COOLDOWN_SECONDS
                )
                diag(
                    "raising low-memory threshold from "
                    f"{old_mb} to {self.effective_threshold_kb // 1024} MiB "
                    "after observing "
                    f"{self.io_stall_reason(stalled_running)}"
                )
            return

        # No stall now: drop the reduced pause floor once the stall is no
        # longer sustained.
        if (
            not self.sustained_io_stall_observed(now)
            and self.io_stall_floor_groups is not None
        ):
            self.io_stall_floor_groups = None
            self.next_io_stall_floor_drop_at = None
            diag(
                "uninterruptible I/O cleared; restoring normal pause floor of "
                f"{self.minimum_running_groups(len(groups))} running action group(s)"
            )

        # Lower only when raised, nothing is paused, memory is above the
        # pause-watch threshold, and the cooldown has passed.
        if self.effective_threshold_kb <= self.base_threshold_kb:
            return
        if self.paused_count() > 0:
            return
        if meminfo.available_kb <= self.pause_watch_threshold_kb():
            return
        if now < self.next_threshold_lower_at:
            return

        old_mb = self.effective_threshold_kb // 1024
        self.effective_threshold_kb = max(
            self.base_threshold_kb,
            self.effective_threshold_kb - self.threshold_step_kb,
        )
        self.next_threshold_lower_at = now + ADAPTIVE_THRESHOLD_LOWER_COOLDOWN_SECONDS
        diag(
            "lowering low-memory threshold from "
            f"{old_mb} to {self.effective_threshold_kb // 1024} MiB "
            "after running actions avoided I/O stalls"
        )

    # Normal pause floor: half (rounded up) of the largest group count seen,
    # never more than the current count and never below one when groups exist.
    def normal_minimum_running_groups(self, current_group_count: int) -> int:
        if current_group_count <= 0:
            return 0
        observed = max(self.max_observed_action_groups, current_group_count)
        floor = max(1, (observed + 1) // 2)
        return min(current_group_count, floor)

    # Halve the pause floor (rounded up, minimum one) during a sustained
    # stall, at most once per io_stall_floor_seconds.
    def maybe_lower_io_stall_floor(
        self,
        groups: list[ActionProcessGroup],
        now: float,
    ) -> None:
        if not self.sustained_io_stall_observed(now):
            return
        if (
            self.next_io_stall_floor_drop_at is not None
            and now < self.next_io_stall_floor_drop_at
        ):
            return

        old_floor = self.minimum_running_groups(len(groups))
        new_floor = max(1, (old_floor + 1) // 2)
        self.io_stall_floor_groups = new_floor
        self.next_io_stall_floor_drop_at = now + self.io_stall_floor_seconds
        if new_floor < old_floor:
            diag(
                "sustained uninterruptible I/O observed for "
                f"{self.io_stall_floor_seconds}s; lowering pause floor from "
                f"{old_floor} to {new_floor} running action group(s)"
            )

    # Drop bookkeeping for groups that no longer exist, refresh the current
    # labels, and return only the action groups.
    def refresh_paused_groups(
        self,
        groups: list[ActionProcessGroup],
    ) -> list[ActionProcessGroup]:
        action_groups = [group for group in groups if self.group_is_action(group)]
        self.max_observed_action_groups = max(
            self.max_observed_action_groups, len(action_groups)
        )
        groups_by_key = {group.key: group for group in action_groups}
        self.paused_keys.intersection_update(groups_by_key.keys())
        self.paused_pids = {
            key: pids for key, pids in self.paused_pids.items() if key in self.paused_keys
        }
        self.paused_action_labels = {
            key: labels
            for key, labels in self.paused_action_labels.items()
            if key in self.paused_keys
        }
        self.paused_started_at = {
            key: started_at
            for key, started_at in self.paused_started_at.items()
            if key in self.paused_keys
        }
        # Accumulated pause time is kept for every live group, not only for
        # the ones that are paused right now.
        self.paused_total_seconds = {
            key: paused_seconds
            for key, paused_seconds in self.paused_total_seconds.items()
            if key in groups_by_key
        }
        self.current_action_labels = set()
        for group in action_groups:
            self.current_action_labels.update(group.action_labels)
        return action_groups

    # The normal floor, reduced to the I/O-stall floor when one is set.
    def minimum_running_groups(self, current_group_count: int) -> int:
        floor = self.normal_minimum_running_groups(current_group_count)
        if self.io_stall_floor_groups is not None:
            floor = min(floor, self.io_stall_floor_groups)
        return min(current_group_count, floor)

    # True when a stall was seen less than RESUME_IO_STALL_CLEAR_SECONDS ago.
    def running_io_stall_recently_cleared(self, now: float) -> bool:
        return (
            self.last_running_io_stall_at is not None
            and now - self.last_running_io_stall_at < RESUME_IO_STALL_CLEAR_SECONDS
        )

    # True when available memory is at or below the pause-watch threshold.
    def memory_is_tight_for_resume(self, meminfo: MemInfo) -> bool:
        return meminfo.available_kb <= self.pause_watch_threshold_kb()

    # True while the settle period after the previous normal resume runs.
    def resume_memory_is_settling(self, now: float) -> bool:
        return now < self.next_normal_resume_at

    # Record a group as paused; an already recorded pause start is kept.
    def remember_paused_group(self, group: ActionProcessGroup, now: float) -> None:
        self.paused_keys.add(group.key)
        self.paused_pids[group.key] = set(group.pids)
        self.paused_action_labels[group.key] = set(group.action_labels)
        self.paused_started_at.setdefault(group.key, now)

    # Remove a group's pause record and add the finished pause to its total.
    def forget_paused_group(self, key: str, now: float) -> None:
        pause_started_at = self.paused_started_at.pop(key, None)
        if pause_started_at is not None and now > pause_started_at:
            self.paused_total_seconds[key] = (
                self.paused_total_seconds.get(key, 0.0) + now - pause_started_at
            )
        self.paused_keys.discard(key)
        self.paused_pids.pop(key, None)
        self.paused_action_labels.pop(key, None)

    # If every action group is paused, resume the one that sorts first.
    # Returns True when a group was resumed.
    def ensure_one_action_group_running(
        self,
        groups: list[ActionProcessGroup],
    ) -> bool:
        if not groups:
            return False

        action_groups = [group for group in groups if self.group_is_action(group)]
        running = [group for group in action_groups if not self.group_is_paused(group)]
        if running:
            return False

        paused = [group for group in action_groups if self.group_is_paused(group)]
        if not paused:
            return False

        selected = min(paused, key=self.group_sort_key)
        self.signal_group(selected, signal.SIGCONT)
        self.forget_paused_group(selected.key, time.monotonic())
        diag(
            "resumed oldest paused Bazel action group "
            f"{selected.key} to keep at least one action group running"
        )
        return True

    # Return the staggered memory threshold for the next pause number.
    # The first pause uses twice the low-memory threshold; the last pausable
    # group uses the low-memory threshold itself.
    def pause_threshold_kb(self, total_groups: int, pause_number: int) -> int:
        threshold_kb = self.low_memory_threshold_kb()
        # One group is never counted as pausable.
        pausable_count = max(1, total_groups - 1)
        capped_pause_number = min(pausable_count, pause_number)
        if pausable_count == 1:
            return threshold_kb * 2
        return (
            threshold_kb * 2
            - threshold_kb * (capped_pause_number - 1) // (pausable_count - 1)
        )

    # Pause one more group (the one that sorts last) when memory is low
    # enough and more groups are running than the floor allows.
    def pause_if_needed(self, meminfo: MemInfo | None) -> None:
        if meminfo is None:
            return
        threshold_kb = self.low_memory_threshold_kb()
        # Above twice the threshold nothing is scanned at all.
        if meminfo.available_kb > threshold_kb * 2:
            return

        groups = self.refresh_paused_groups(build_process_groups(self.context))
        if self.ensure_one_action_group_running(groups):
            return
        self.maybe_adapt_threshold(groups, meminfo)

        running = [group for group in groups if not self.group_is_paused(group)]
        if len(running) <= 1:
            return
        if len(running) <= self.minimum_running_groups(len(groups)):
            return

        next_pause_number = len(self.paused_keys) + 1
        now = time.monotonic()
        # An active stall floor bypasses the staggered memory check below.
        stall_floor_active = (
            self.io_stall_floor_groups is not None
            and self.sustained_io_stall_observed(now)
        )
        if (
            not stall_floor_active
            and meminfo.available_kb > self.pause_threshold_kb(len(groups), next_pause_number)
        ):
            return

        selected = max(running, key=self.group_sort_key)
        self.signal_group(selected, signal.SIGSTOP)
        self.remember_paused_group(selected, now)

    # Resume one paused group (the one that sorts first) when memory allows.
    # Returns True when a group was resumed.
    def resume_if_needed(self, meminfo: MemInfo | None) -> bool:
        if meminfo is None:
            return False

        groups = self.refresh_paused_groups(build_process_groups(self.context))
        if self.ensure_one_action_group_running(groups):
            return True
        if meminfo.available_kb <= self.low_memory_threshold_kb():
            return False
        self.maybe_adapt_threshold(groups, meminfo)
        now = time.monotonic()
        memory_is_tight = self.memory_is_tight_for_resume(meminfo)
        # Hold off while the previous resume is settling, or while memory is
        # tight and I/O stalls are present or only recently gone.
        if (
            self.resume_memory_is_settling(now)
            or (
                memory_is_tight
                and (
                    self.stalled_running_groups(groups)
                    or self.running_io_stall_recently_cleared(now)
                )
            )
        ):
            return False

        paused = [group for group in groups if self.group_is_paused(group)]
        if not paused:
            return False

        selected = min(paused, key=self.group_sort_key)
        # Require room for the group's swapped-out memory above the threshold.
        resume_threshold_kb = max(
            self.pause_threshold_kb(len(groups), len(self.paused_keys)),
            self.low_memory_threshold_kb() + self.resume_memory_kb(selected),
        )
        if meminfo.available_kb <= resume_threshold_kb:
            return False

        self.signal_group(selected, signal.SIGCONT)
        self.forget_paused_group(selected.key, now)
        self.next_normal_resume_at = now + RESUME_MEMORY_SETTLE_SECONDS
        return True

    # Sum of the swap usage reported for the group's processes.
    def resume_memory_kb(self, group: ActionProcessGroup) -> int:
        return sum(process_swap_kb(pid) for pid in group.pids)

    # Resume every recorded paused group and return how many were resumed.
    # `reason`, when given, is appended to the diagnostic message.
    def resume_all(self, reason: str | None = None) -> int:
        resumed_groups = 0
        now = time.monotonic()
        groups_by_key = {group.key: group for group in build_process_groups(self.context)}
        for key in list(self.paused_keys):
            group = groups_by_key.get(key)
            if group is not None:
                self.signal_group(group, signal.SIGCONT)
                resumed_groups += 1
            else:
                # The group is no longer listed: signal the remembered pids.
                remembered_pids = self.paused_pids.get(key, set())
                for pid in sorted(remembered_pids):
                    try:
                        os.kill(pid, signal.SIGCONT)
                    except ProcessLookupError:
                        pass
                    except PermissionError:
                        pass
                if remembered_pids:
                    resumed_groups += 1
            self.forget_paused_group(key, now)
        if resumed_groups > 0 and reason is not None:
            diag(f"resumed {resumed_groups} paused Bazel action group(s) {reason}")
        return resumed_groups

    def group_sort_key(self, group: ActionProcessGroup) -> tuple[int, str]:
        if group.started_at_ticks is None:
            return (-1, group.key)
+ return (group.started_at_ticks, group.key) + + def signal_group(self, group: ActionProcessGroup, sig: int) -> None: + for pid in group.pids: + try: + os.kill(pid, sig) + except ProcessLookupError: + pass + except PermissionError: + pass + + +def action_timeout_evidence( + parser: ProgressFrameParser, + action_throttler: ActionThrottler, + limit_seconds: int, + now: float, +) -> tuple[bool, str]: + running_groups_over_timeout_fn = getattr( + action_throttler, + "all_running_action_groups_over", + None, + ) + running_groups_over_timeout = ( + running_groups_over_timeout_fn(limit_seconds, now) + if running_groups_over_timeout_fn is not None + else None + ) + if running_groups_over_timeout: + return True, "all active Bazel action groups" + if parser.all_reported_actions_over(limit_seconds, now): + return True, "all reported running actions" + return False, "action-age evidence" + + +def timeout_downscale_defer_reason( + parser: ProgressFrameParser, + action_throttler: ActionThrottler, + limit_seconds: int, + now: float, +) -> str | None: + if not parser.completed_progress_recent(now, limit_seconds): + return None + if action_throttler.recent_io_stall_observed(now): + return None + if action_throttler.current_io_stall_observed: + return None + return ( + "completed action count advanced recently and no running action " + "I/O stall is currently observed" + ) + + +# Lower scheduler priority for Bazel action children that the server launches. +def renice_build_processes(context: BuildContext) -> None: + target_nice = bazel_nice_increment() + if target_nice <= 0: + return + + for info in dangling_build_processes(context): + if info.nice is not None and info.nice >= target_nice: + continue + try: + os.setpriority(os.PRIO_PROCESS, info.pid, target_nice) + except (OSError, PermissionError): + pass + + +# Wait briefly for leftover Bazel build processes to exit before retrying. 
def wait_for_no_dangling_build_processes(context: BuildContext, wait_seconds: float) -> bool:
    # Poll every 100 ms until the deadline; True as soon as nothing is left.
    give_up_at = time.monotonic() + wait_seconds
    while True:
        if time.monotonic() >= give_up_at:
            break
        leftovers = dangling_build_processes(context)
        if not leftovers:
            return True
        time.sleep(0.1)
    # One final look after the deadline so a late exit still counts.
    return not dangling_build_processes(context)


# Run "bazel shutdown" in its own process group with a clamped timeout and
# log the outcome. Nothing is returned; failures are reported via diag().
def run_bazel_shutdown(bazel_path: str, timeout_seconds: int) -> None:
    # Keep the caller's timeout inside the configured minimum/maximum.
    bounded_timeout = min(
        max(timeout_seconds, BAZEL_SHUTDOWN_MIN_TIMEOUT_SECONDS),
        BAZEL_SHUTDOWN_MAX_TIMEOUT_SECONDS,
    )
    shutdown_command = [bazel_path, "shutdown"]
    try:
        # Ask the server to stop before escalating to process-group killing.
        finished = subprocess.run(
            shutdown_command,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=bounded_timeout,
            check=False,
            preexec_fn=os.setpgrp,
        )
    except (OSError, subprocess.TimeoutExpired) as error:
        diag(f"bazel shutdown did not complete cleanly: {error}")
    else:
        diag(f"bazel shutdown completed with exit code {finished.returncode}")


# Shared restart gate: optionally stop the Bazel server, then wait and pause.
def settle_before_restart(
    context: BuildContext,
    restart_description: str,
    bazel_path: str | None = None,
    shutdown_timeout: int | None = None,
) -> None:
    # When a Bazel path is supplied the server is asked to shut down first,
    # using `shutdown_timeout` (1 second when it is missing or zero).
    if bazel_path is not None:
        diag(
            "asking Bazel server to shut down before "
            f"{restart_description} because the previous retry made no "
            "meaningful progress"
        )
        run_bazel_shutdown(bazel_path, shutdown_timeout or 1)

    delay = restart_settle_delay()
    if delay <= 0:
        return
    diag(
        f"waiting up to {delay:g}s for Bazel build processes to exit "
        f"before {restart_description}"
    )
    if wait_for_no_dangling_build_processes(context, delay):
        diag("Bazel build processes are gone")
    else:
        diag(f"Bazel build processes are still present after {delay:g}s")
    # The settle sleep always runs, whether or not the processes went away.
    diag(f"settling for {delay:g}s before {restart_description}")
    time.sleep(delay)


# Map every readable /proc/<pid> entry to its parent pid.
def _proc_parent_pids() -> dict[int, int]:
    parents: dict[int, int] = {}
    try:
        names = os.listdir("/proc")
    except OSError:
        # Without a process listing no descendants can be discovered; the
        # caller still signals the root pid.
        return parents
    for name in names:
        if not name.isdigit():
            continue
        try:
            # Read bytes: the command name inside the parentheses is
            # process-controlled and need not be valid UTF-8.
            with open(f"/proc/{name}/stat", "rb") as stat:
                content = stat.read()
        except OSError:
            # The process exited between listing and reading.
            continue
        try:
            # Split after the LAST ")" because the command name may itself
            # contain parentheses; the fields that follow are state, ppid, ...
            after_name = content.rsplit(b")", 1)[1].split()
            parents[int(name)] = int(after_name[1])
        except (IndexError, ValueError):
            continue
    return parents


# Signal a process tree by walking /proc parent relationships.
# Descendants are signalled before `pid`, deepest-discovered first. When
# `context` is given, only descendants it owns are signalled; `pid` itself is
# always signalled. Missing or unsignalable processes are ignored.
def kill_process_tree(pid: int, sig: int, context: BuildContext | None = None) -> None:
    children_by_parent: dict[int, list[int]] = {}
    for child, parent in _proc_parent_pids().items():
        children_by_parent.setdefault(parent, []).append(child)

    # Depth-first walk collecting every descendant of `pid`.
    descendants: list[int] = []
    stack = list(children_by_parent.get(pid, []))
    while stack:
        child = stack.pop()
        descendants.append(child)
        stack.extend(children_by_parent.get(child, []))

    # Reverse discovery order so children are signalled before their parent.
    for child in reversed(descendants):
        if context is not None:
            info = process_info(child)
            if info is None or not context.owns_process(info):
                continue
        try:
            os.kill(child, sig)
        except (ProcessLookupError, PermissionError):
            pass
    try:
        os.kill(pid, sig)
    except (ProcessLookupError, PermissionError):
        pass


# Terminate leftover Bazel sandbox/build processes tied to this workspace.
# Sends SIGTERM, waits, then SIGKILLs survivors and waits again. The returned
# count is the number of dangling processes found, not the number killed.
def cleanup_dangling_build_processes(context: BuildContext) -> CleanupResult:
    # Poll process liveness with a small sleep so cleanup waits without spinning.
    def wait_for_exit(processes: list[ProcessInfo], wait_seconds: float) -> bool:
        deadline = time.monotonic() + wait_seconds
        while time.monotonic() < deadline:
            if not any(pid_exists(info.pid) for info in processes):
                return True
            time.sleep(0.1)
        return not any(pid_exists(info.pid) for info in processes)

    processes = dangling_build_processes(context)
    if not processes:
        return CleanupResult(count=0)

    diag(f"found {len(processes)} dangling Bazel build process(es); terminating")

    for info in processes:
        kill_process_tree(info.pid, signal.SIGTERM, context)
    if wait_for_exit(processes, DANGLING_PROCESS_TERM_WAIT_SECONDS):
        return CleanupResult(count=len(processes))

    survivors = [info for info in processes if pid_exists(info.pid)]
    if not survivors:
        return CleanupResult(count=len(processes))
    diag(f"{len(survivors)} dangling Bazel build process(es) survived SIGTERM; killing")
    for info in survivors:
        kill_process_tree(info.pid, signal.SIGKILL, context)
    if wait_for_exit(survivors, DANGLING_PROCESS_KILL_WAIT_SECONDS):
        return CleanupResult(count=len(processes))

    remaining = [info for info in survivors if pid_exists(info.pid)]
    if remaining:
        diag(f"{len(remaining)} dangling Bazel build process(es) still running after SIGKILL")
        for info in remaining:
            command = info.cmdline.strip() or ""
            # Keep each diagnostic line short.
            if len(command) > 140:
                command = command[:137] + "..."
            cwd = f" cwd={info.cwd}" if info.cwd else ""
            # Only verbose-print process details when cleanup could not kill them.
            diag(f"dangling build process: pid={info.pid}{cwd} cmd={command}")
    return CleanupResult(count=len(processes))


# Interrupt Bazel's process group and explain why.
# `process.pid` is used as the process-group id; a group that is already gone
# is ignored.
def graceful_stop(
    process: subprocess.Popen,
    reason: str,
    action: str = "interrupting Bazel",
) -> None:
    diag(f"{reason}; {action}")
    try:
        os.killpg(process.pid, signal.SIGINT)
    except ProcessLookupError:
        pass


# Run one Bazel attempt and classify whether the wrapper should retry.
def run_once(
    bazel_path: str,
    parsed: ParsedArgs,
    jobs: int,
    max_jobs: int,
    context: BuildContext,
) -> RunResult:
    # Launch Bazel once with an explicit --jobs value and supervise it until
    # it exits. While it runs, the loop forwards output, samples memory,
    # pauses/resumes action groups, and may interrupt Bazel to restart with
    # fewer ("down") or more ("up") jobs. The returned RunResult carries the
    # exit code plus the restart hint ("same", "down", "up" or None).
    global ACTIVE_PROCESS

    # Rewrite or insert --jobs directly before launching this Bazel attempt.
    bazel_args = bazel_args_with_jobs(parsed, jobs)
    command = [bazel_path, *bazel_args]
    parser = ProgressFrameParser()
    selector = selectors.DefaultSelector()
    terminal = TerminalForeground()
    output = BazelOutput()
    memory_monitor = MemoryPressureMonitor()
    action_throttler = ActionThrottler(context)
    # `now` is rebound every loop iteration; the nested helpers below read
    # the current value through the closure.
    now = time.monotonic()
    memory_monitor.sample(now, force=True)
    # Tests can shorten the upscale warm-up interval with an environment override.
    try:
        configured_upscale_interval = float(
            os.environ.get(
                UPSCALE_CHECK_INTERVAL_ENV,
                str(DEFAULT_UPSCALE_CHECK_INTERVAL_SECONDS),
            )
        )
    except ValueError:
        configured_upscale_interval = DEFAULT_UPSCALE_CHECK_INTERVAL_SECONDS
    if configured_upscale_interval <= 0:
        configured_upscale_interval = DEFAULT_UPSCALE_CHECK_INTERVAL_SECONDS
    # Never evaluate upscale more often than memory is polled.
    next_upscale_check_interval = max(memory_poll_interval(), configured_upscale_interval)
    next_upscale_check: float | None = None
    required_running_actions_seconds = next_upscale_check_interval
    running_actions_since: float | None = None
    last_stall_at: float | None = None
    # None while Bazel runs normally; "up" or "down" once the wrapper has
    # interrupted it, with stop_deadline bounding the graceful stop.
    stop_reason: str | None = None
    stop_deadline = 0.0
    last_upscale_skip_reason: str | None = None
    upscale_skip_count = 0
    upscale_reevaluation_count = 0
    upscale_memory_skip_count = 0
    upscale_job_runtime_skip_count = 0
    upscale_description: str | None = None
    # Target --jobs of an upscale that is waiting for conditions to allow it.
    pending_upscale_next_jobs: int | None = None
    next_renice_check = now
    next_pause_check = now
    next_resume_check = now
    next_downscale_defer_report = now
    memory_tightness_observed = False
    # One-shot latches so paused groups are resumed once per condition.
    memory_kill_resume_done = False
    user_termination_resume_done = False

    # Package this attempt's result with the latest monitoring context.
    def result(
        exit_code: int,
        restart: str | None = None,
        retry_after_dangling_processes: bool = True,
    ) -> RunResult:
        return RunResult(
            exit_code=exit_code,
            restart=restart,
            upscale_skip_reason=last_upscale_skip_reason,
            upscale_skip_count=upscale_skip_count,
            upscale_reevaluation_count=upscale_reevaluation_count,
            upscale_memory_skip_count=upscale_memory_skip_count,
            upscale_job_runtime_skip_count=upscale_job_runtime_skip_count,
            upscale_description=upscale_description,
            failure_retry_same=memory_monitor.retry_same_jobs_after_failure(now),
            failure_average_description=memory_monitor.failure_average_description(now),
            retry_after_dangling_processes=retry_after_dangling_processes,
            # Flags below only count when the wrapper did not stop Bazel itself.
            internal_interrupted_crash=(
                parser.saw_internal_crash
                and parser.saw_java_interrupted
                and stop_reason is None
                and not parser.saw_user_interrupt
            ),
            retryable_action_failure=parser.saw_memory_kill and stop_reason is None,
            meaningful_work_done=parser.meaningful_work_done,
            user_interrupted=parser.saw_user_interrupt and stop_reason is None,
        )

    diag(f"starting Bazel with --jobs={jobs}")
    # stdin is inherited; output routing comes from BazelOutput.popen_kwargs().
    process = subprocess.Popen(
        command,
        stdin=None,
        bufsize=0,
        preexec_fn=prepare_bazel_child,
        **output.popen_kwargs(),
    )
    output.parent_after_spawn()
    # Published so the signal handler can reach the running process.
    ACTIVE_PROCESS = process
    output.register(selector, process)
    context.refresh_from_bazel_servers()
    terminal.give_to(process.pid)

    # Attribute each skipped upscale evaluation to memory or job-runtime state.
    def upscale_skip_category(reason: str, from_memory_gate: bool = False) -> str:
        if not from_memory_gate:
            return SKIP_JOB_RUNTIME
        # The memory gate also reports running-time reasons; those count as
        # job-runtime skips.
        if reason.startswith("running Bazel actions"):
            return SKIP_JOB_RUNTIME
        return SKIP_MEMORY

    # Classify whether upscale should run now, stay pending, or be blocked.
    def evaluate_upscale(
        running_actions_seconds: float | None,
        keep_pending_on_memory_skip: bool = False,
    ) -> UpscaleEvaluation:
        memory_monitor.sample(now, force=True)
        memory_skip = memory_monitor.upscale_skip_reason(
            now,
            running_actions_seconds,
            required_running_actions_seconds,
        )
        if memory_skip is not None:
            if keep_pending_on_memory_skip:
                skip_category = upscale_skip_category(memory_skip, from_memory_gate=True)
                return UpscaleEvaluation(UPSCALE_PENDING, memory_skip, skip_category)

            skip_category = upscale_skip_category(memory_skip, from_memory_gate=True)
            return UpscaleEvaluation(UPSCALE_BLOCKED, memory_skip, skip_category)
        # Paused groups mean memory was tight recently; keep waiting.
        paused_count = action_throttler.paused_count()
        if paused_count > 0:
            return UpscaleEvaluation(
                UPSCALE_PENDING,
                f"{paused_count} action group(s) are paused",
                SKIP_JOB_RUNTIME,
            )
        remaining_action_finish_threshold = jobs * UPSCALE_REMAINING_ACTION_FINISH_JOBS_MULTIPLIER
        action_skip = parser.upscale_action_skip_reason(
            UPSCALE_MAX_ACTION_SECONDS,
            remaining_action_finish_threshold,
            now,
        )
        if action_skip is not None:
            return UpscaleEvaluation(UPSCALE_PENDING, action_skip, SKIP_JOB_RUNTIME)
        return UpscaleEvaluation(UPSCALE_READY, None, None)

    # Count why an upscale evaluation could not proceed.
    def record_upscale_skip(category: str | None) -> None:
        nonlocal upscale_memory_skip_count, upscale_job_runtime_skip_count
        if category == SKIP_MEMORY:
            upscale_memory_skip_count += 1
        elif category == SKIP_JOB_RUNTIME:
            upscale_job_runtime_skip_count += 1

    # Keep action-age accounting aligned with wrapper-induced SIGSTOP intervals.
    # `previous_paused_count` is the paused count taken before the throttler
    # call whose effect is being recorded.
    def update_pause_accounting(previous_paused_count: int) -> None:
        paused_count = action_throttler.paused_count()
        if previous_paused_count == 0 and paused_count > 0:
            parser.note_actions_paused(now)
        elif previous_paused_count > 0 and paused_count == 0:
            parser.note_actions_resumed(now)
        parser.note_live_action_labels(
            getattr(action_throttler, "current_action_labels", set()),
            now,
        )
        parser.note_paused_labels(action_throttler.paused_labels(), now)

    # Stop Bazel at a cheap point so the next attempt can use more jobs.
    def begin_upscale(next_jobs: int, running_actions_seconds: float | None) -> None:
        nonlocal stop_reason, stop_deadline, upscale_description
        upscale_description = memory_monitor.upscale_ready_description(
            now,
            running_actions_seconds,
        )
        upscale_description += f"; {parser.current_action_age_description(now)}"
        terminal.restore()
        # Paused groups are resumed before the interrupt is sent.
        paused_count = action_throttler.paused_count()
        action_throttler.resume_all("before stopping Bazel for upscale")
        update_pause_accounting(paused_count)
        reason = (
            f"upscale: {upscale_description}; "
            f"stopping Bazel at --jobs={jobs} so the wrapper can "
            f"restart at --jobs={next_jobs}"
        )
        graceful_stop(
            process,
            reason,
            "interrupting Bazel at a cheap upscale point",
        )
        stop_reason = "up"
        stop_deadline = now + parsed.action_timeout

    try:
        while True:
            # Block on Bazel output for at most 0.2 s, and no longer than the
            # time until the next pause or resume check while throttling is
            # active.
            drain_timeout = 0.2
            if stop_reason is None and not parser.saw_memory_kill:
                drain_timeout = min(
                    drain_timeout,
                    max(0.0, next_pause_check - time.monotonic()),
                    max(0.0, next_resume_check - time.monotonic()),
                )
            drain_ready_streams(
                selector,
                parser,
                drain_timeout,
                action_throttler.paused_count,
                action_throttler.paused_labels,
            )
            now = time.monotonic()
            meminfo = memory_monitor.sample(now)
            # Resume every paused group once when the user is terminating.
            if (
                (USER_TERMINATING or (parser.saw_user_interrupt and stop_reason is None))
                and not user_termination_resume_done
            ):
                paused_count = action_throttler.paused_count()
                if USER_TERMINATING:
                    reason = "because the wrapper received a user signal"
                else:
                    reason = "because Bazel reported a user interrupt"
                action_throttler.resume_all(reason)
                update_pause_accounting(paused_count)
                user_termination_resume_done = True
            # Likewise resume once after Bazel reports a killed action.
            if parser.saw_memory_kill and not memory_kill_resume_done:
                paused_count = action_throttler.paused_count()
                action_throttler.resume_all(
                    "because Bazel reported a killed or terminated action"
                )
                update_pause_accounting(paused_count)
                memory_kill_resume_done = True
            # Throttling (pause / resume / renice) runs only while Bazel is
            # running normally.
            if (
                stop_reason is None
                and not parser.saw_memory_kill
                and not parser.saw_user_interrupt
            ):
                if now >= next_pause_check:
                    pause_meminfo = memory_monitor.sample(now, force=True)
                    # Fall back to twice the module-level threshold when the
                    # throttler does not provide its own watch threshold.
                    pause_watch_threshold_kb = getattr(
                        action_throttler,
                        "pause_watch_threshold_kb",
                        lambda: low_memory_threshold_kb() * 2,
                    )()
                    if (
                        pause_meminfo is not None
                        and pause_meminfo.available_kb
                        <= pause_watch_threshold_kb
                    ):
                        if not memory_tightness_observed:
                            # First tight sample: renice on this iteration.
                            memory_tightness_observed = True
                            next_renice_check = now
                        paused_count = action_throttler.paused_count()
                        action_throttler.pause_if_needed(pause_meminfo)
                        update_pause_accounting(paused_count)
                        # Use the active interval when the paused count
                        # changed, the idle interval otherwise.
                        pause_check_delay = (
                            THROTTLE_PAUSE_CHECK_SECONDS
                            if action_throttler.paused_count() != paused_count
                            else THROTTLE_IDLE_PAUSE_CHECK_SECONDS
                        )
                    else:
                        pause_check_delay = THROTTLE_IDLE_PAUSE_CHECK_SECONDS
                    next_pause_check = now + pause_check_delay
                if now >= next_resume_check:
                    resume_meminfo = memory_monitor.sample(now, force=True)
                    resume_threshold_kb = getattr(
                        action_throttler,
                        "low_memory_threshold_kb",
                        low_memory_threshold_kb,
                    )()
                    if (
                        resume_meminfo is not None
                        and resume_meminfo.available_kb
                        > resume_threshold_kb
                    ):
                        paused_count = action_throttler.paused_count()
                        action_throttler.resume_if_needed(resume_meminfo)
                        update_pause_accounting(paused_count)
                    next_resume_check = now + THROTTLE_RESUME_CHECK_SECONDS
                if memory_tightness_observed and now >= next_renice_check:
                    renice_build_processes(context)
                    next_renice_check = now + RENICE_BUILD_CHILDREN_SECONDS
            # The upscale timer starts when running actions are first seen.
            if parser.has_running_actions():
                if running_actions_since is None:
                    running_actions_since = now
                    next_upscale_check = now + next_upscale_check_interval
            if running_actions_since is None:
                running_actions_seconds = None
            else:
                running_actions_seconds = now - running_actions_since
            if parser.all_displayed_actions_over(parsed.action_timeout, now):
                last_stall_at = now
            returncode = process.poll()
            if returncode is not None:
                # Bazel exited: read the remaining output, then classify.
                drain_remaining_streams(
                    selector,
                    parser,
                    action_throttler.paused_count,
                    action_throttler.paused_labels,
                )
                if parser.all_displayed_actions_over(parsed.action_timeout, now):
                    last_stall_at = now
                context.add_output_bases(parser.output_bases)
                context.refresh_from_bazel_servers()
                normalized_returncode = normalize_returncode(returncode)
                if normalized_returncode != 0 and stop_reason is None:
                    diag(memory_monitor.failure_report(now))
                # A user interrupt reported by Bazel is returned as 130.
                if (
                    normalized_returncode != 0
                    and parser.saw_user_interrupt
                    and stop_reason is None
                ):
                    return result(130)
                # Killed action: retry at the same jobs value when the memory
                # monitor says so, otherwise with fewer jobs.
                if (
                    normalized_returncode != 0
                    and parser.saw_memory_kill
                    and stop_reason is None
                    and jobs > 1
                ):
                    if memory_monitor.retry_same_jobs_after_failure(now):
                        diag(
                            "Bazel reported a killed or terminated action, but "
                            f"{memory_monitor.failure_average_description(now)}; "
                            f"retrying with same --jobs={jobs}"
                        )
                        return result(normalized_returncode, "same")
                    diag("Bazel reported a killed or terminated action; retrying with fewer jobs")
                    return result(normalized_returncode, "down")
                recent_stall = (
                    last_stall_at is not None and now - last_stall_at <= RECENT_STALL_SECONDS
                )
                # Abrupt server termination: gather evidence for the message,
                # then apply the same same-or-down decision.
                if (
                    normalized_returncode != 0
                    and parser.saw_server_abrupt
                    and jobs > 1
                ):
                    evidence = []
                    if memory_monitor.recent_low_memory(now):
                        evidence.append(
                            "recent memory pressure "
                            f"({memory_monitor.recent_low_memory_description()})"
                        )
                    if recent_stall:
                        evidence.append(
                            f"recent visible action stall over {parsed.action_timeout}s"
                        )
                    if not evidence:
                        evidence.append("server failure")
                    evidence_description = " and ".join(evidence)
                    if memory_monitor.retry_same_jobs_after_failure(now):
                        diag(
                            "Bazel server terminated abruptly after "
                            f"{evidence_description}, but "
                            f"{memory_monitor.failure_average_description(now)}; "
                            f"retrying with same --jobs={jobs}"
                        )
                        return result(normalized_returncode, "same")
                    diag(
                        "Bazel server terminated abruptly after "
                        f"{evidence_description}; "
                        "retrying with fewer jobs"
                    )
                    return result(normalized_returncode, "down")
                # A build failure seen while stopping for upscale cancels the
                # upscale; the failure is returned without a restart hint.
                if (
                    normalized_returncode != 0
                    and stop_reason == "up"
                    and parser.saw_build_failure
                ):
                    diag(
                        "upscale cancelled because Bazel reported a build failure "
                        "while stopping for upscale"
                    )
                    return result(
                        normalized_returncode,
                        retry_after_dangling_processes=False,
                    )
                # Otherwise the pending stop reason becomes the restart hint.
                return result(normalized_returncode, stop_reason)

            # Once the user asked to terminate, only wait for Bazel to exit.
            if USER_TERMINATING:
                continue

            if stop_reason is not None:
                # A stop is in progress; escalate once the deadline passes.
                if now >= stop_deadline:
                    terminal.restore()
                    context.add_output_bases(parser.output_bases)
                    context.refresh_from_bazel_servers()
                    diag("graceful stop timed out; asking Bazel server to shut down")
                    run_bazel_shutdown(bazel_path, parsed.action_timeout)

                    if process.poll() is None:
                        diag("Bazel client is still running; killing its process group")
                        try:
                            os.killpg(process.pid, signal.SIGKILL)
                        except ProcessLookupError:
                            pass

                    for server in bazel_servers_for_workspace(os.getcwd(), context.cgroups):
                        server_pid = server.pid
                        # Never kill the wrapper or the client handled above.
                        if server_pid in {os.getpid(), process.pid}:
                            continue
                        diag(f"Bazel server pid {server_pid} is still running; killing it")
                        kill_process_tree(server_pid, signal.SIGKILL, context)
                    try:
                        process.wait(timeout=1)
                    except subprocess.TimeoutExpired:
                        pass
                    returncode = process.poll()
                    if returncode is not None:
                        drain_remaining_streams(
                            selector,
                            parser,
                            action_throttler.paused_count,
                            action_throttler.paused_labels,
                        )
                        return result(normalize_returncode(returncode), stop_reason)
                continue

            # Downscale check: memory below the downscale threshold AND
            # action-age evidence over the action timeout.
            downscale_memory_threshold_fn = getattr(
                action_throttler,
                "downscale_memory_threshold_kb",
                None,
            )
            downscale_memory_threshold_kb = (
                downscale_memory_threshold_fn()
                if downscale_memory_threshold_fn is not None
                else low_memory_threshold_kb()
            )
            if (
                jobs > 1
                and meminfo is not None
                and meminfo.available_kb < downscale_memory_threshold_kb
            ):
                running_actions_over_timeout, timeout_subject = action_timeout_evidence(
                    parser,
                    action_throttler,
                    parsed.action_timeout,
                    now,
                )
                if running_actions_over_timeout:
                    defer_reason = timeout_downscale_defer_reason(
                        parser,
                        action_throttler,
                        parsed.action_timeout,
                        now,
                    )
                    if defer_reason is not None:
                        # Deferred; the report is rate-limited.
                        if now >= next_downscale_defer_report:
                            diag(
                                "downscale deferred despite old action-age evidence: "
                                f"{defer_reason}"
                            )
                            next_downscale_defer_report = (
                                now + TIMEOUT_DOWNSCALE_DEFER_REPORT_SECONDS
                            )
                    else:
                        terminal.restore()
                        paused_count = action_throttler.paused_count()
                        action_throttler.resume_all("before stopping Bazel for downscale")
                        update_pause_accounting(paused_count)
                        reason = (
                            f"{timeout_subject} are over {parsed.action_timeout}s "
                            "and memory is low "
                            f"({meminfo.available_kb // 1024} MiB available; "
                            f"threshold {downscale_memory_threshold_kb // 1024} MiB)"
                        )
                        graceful_stop(process, reason)
                        stop_reason = "down"
                        stop_deadline = now + parsed.action_timeout
                        diag(
                            "downscale decision used "
                            f"{timeout_subject}; {paused_count} action group(s) were paused"
                        )
                        diag(
                            "action timeout and low memory detected; "
                            "retrying with fewer jobs"
                        )
                        continue

            # A pending upscale is re-evaluated on every iteration.
            if pending_upscale_next_jobs is not None:
                upscale_reevaluation_count += 1
                evaluation = evaluate_upscale(
                    running_actions_seconds,
                    keep_pending_on_memory_skip=True,
                )
                record_upscale_skip(evaluation.skip_category)
                if evaluation.status == UPSCALE_READY:
                    begin_upscale(pending_upscale_next_jobs, running_actions_seconds)
                    pending_upscale_next_jobs = None
                    continue
                if evaluation.status == UPSCALE_BLOCKED:
                    pending_upscale_next_jobs = None
                    upscale_skip_count += 1
                    last_upscale_skip_reason = evaluation.reason
                    next_upscale_check = now + next_upscale_check_interval
                elif evaluation.reason is not None:
                    last_upscale_skip_reason = evaluation.reason

            # Scheduled upscale evaluation, only when nothing is pending.
            if (
                pending_upscale_next_jobs is None
                and next_upscale_check is not None
                and now >= next_upscale_check
            ):
                next_upscale_check = now + next_upscale_check_interval
                if jobs < max_jobs:
                    next_jobs = upscale_jobs(jobs, max_jobs)
                    evaluation = evaluate_upscale(
                        running_actions_seconds,
                        keep_pending_on_memory_skip=True,
                    )
                    if evaluation.status == UPSCALE_READY:
                        last_upscale_skip_reason = None
                        begin_upscale(next_jobs, running_actions_seconds)
                    elif evaluation.status == UPSCALE_PENDING:
                        pending_upscale_next_jobs = next_jobs
                        upscale_skip_count += 1
                        record_upscale_skip(evaluation.skip_category)
                        last_upscale_skip_reason = evaluation.reason
                        diag(
                            f"upscale watch active: {evaluation.reason}; "
                            f"will restart with --jobs={next_jobs} when memory "
                            "and current action ages allow"
                        )
                    else:
                        upscale_skip_count += 1
                        record_upscale_skip(evaluation.skip_category)
                        last_upscale_skip_reason = evaluation.reason
    finally:
        # Runs on every exit path: resume anything still paused, restore the
        # terminal, and release the selector and output resources.
        now = time.monotonic()
        paused_count = action_throttler.paused_count()
        action_throttler.resume_all()
        update_pause_accounting(paused_count)
        terminal.restore()
        ACTIVE_PROCESS = None
        selector.close()
        output.close()


# Retry Bazel attempts while adapting the current jobs value.
def run_adaptive(bazel_path: str, parsed: ParsedArgs) -> int:
    # Drive run_once() in a loop, changing --jobs between attempts according
    # to each attempt's restart hint, and return the final exit code.
    # Dangling build processes are cleaned up after every attempt and once
    # more on the way out. A user termination signal seen between attempts
    # ends the loop instead of starting another Bazel invocation.
    if not parsed.supports_jobs:
        os.execvpe(bazel_path, [bazel_path, *parsed.original_args], os.environ)

    jobs = parsed.initial_jobs
    max_jobs = parsed.initial_jobs
    context = BuildContext(os.getcwd())
    internal_interrupted_crash_retries = 0
    # Same-jobs retries after killed actions, counted per jobs value.
    same_job_action_failure_retries: dict[int, int] = {}
    clean_server_before_same_retry = False
    # Exit code of the most recent attempt; None before the first attempt.
    last_exit_code: int | None = None

    try:
        while True:
            # The signal handler only sets USER_TERMINATING when no Bazel
            # process is active, so a signal received while waiting or
            # settling between attempts must be honoured here. Otherwise a
            # whole new build would start after the user asked to stop.
            if USER_TERMINATING and last_exit_code is not None:
                return last_exit_code
            result = run_once(bazel_path, parsed, jobs, max_jobs, context)
            last_exit_code = result.exit_code
            cleanup = cleanup_dangling_build_processes(context)
            if USER_TERMINATING or result.user_interrupted:
                return result.exit_code
            # Retry once after an internal crash caused by an interrupt.
            if (
                result.exit_code != 0
                and result.restart is None
                and result.internal_interrupted_crash
            ):
                if internal_interrupted_crash_retries >= 1:
                    diag(
                        "Bazel crashed internally after java.lang.InterruptedException "
                        "again; not retrying"
                    )
                    return result.exit_code
                internal_interrupted_crash_retries += 1
                diag(
                    "Bazel crashed internally after java.lang.InterruptedException; "
                    f"retrying with same --jobs={jobs}"
                )
                result.restart = "same"
            # Summarise upscale evaluations that never led to an upscale.
            if (
                result.restart != "up"
                and result.upscale_skip_count > 0
                and result.upscale_skip_reason is not None
            ):
                attempt_word = "attempt" if result.upscale_skip_count == 1 else "attempts"
                reevaluation_word = (
                    "reevaluation"
                    if result.upscale_reevaluation_count == 1
                    else "reevaluations"
                )
                diag(
                    "upscale watch skipped after "
                    f"{result.upscale_skip_count} scheduled {attempt_word} and "
                    f"{result.upscale_reevaluation_count} {reevaluation_word} "
                    f"(memory skips: {result.upscale_memory_skip_count}; "
                    f"job-runtime skips: {result.upscale_job_runtime_skip_count}): "
                    f"{result.upscale_skip_reason}"
                )
            # A failure that left build processes behind is retried.
            if (
                result.exit_code != 0
                and result.restart is None
                and cleanup.count > 0
                and jobs > 1
                and result.retry_after_dangling_processes
            ):
                if result.failure_retry_same:
                    diag(
                        "Bazel exited while build processes were still running, but "
                        f"{result.failure_average_description}; retrying with same --jobs={jobs}"
                    )
                    result.restart = "same"
                else:
                    diag(
                        "Bazel exited while build processes were still running; "
                        "retrying with fewer jobs"
                    )
                    result.restart = "down"
            # Bound same-jobs retries after killed actions.
            if result.restart == "same" and result.retryable_action_failure:
                prior_same_retries = same_job_action_failure_retries.get(jobs, 0)
                # A repeat retry that made no progress gets a server shutdown
                # before the next attempt.
                clean_server_before_same_retry = (
                    prior_same_retries > 0 and not result.meaningful_work_done
                )
                retry_count = prior_same_retries + 1
                if retry_count > same_job_retry_limit():
                    diag(
                        "Bazel kept reporting killed or terminated actions at "
                        f"--jobs={jobs} after {prior_same_retries} same-job "
                        "retry attempt(s); not retrying"
                    )
                    return result.exit_code
                same_job_action_failure_retries[jobs] = retry_count
            if result.restart == "down":
                if jobs <= 1:
                    diag("already at --jobs=1; not retrying")
                    return result.exit_code
                # Stop waiting as soon as the user asks to terminate.
                while not USER_TERMINATING:
                    try:
                        # Wait for memory to recover before starting the smaller retry.
                        meminfo = read_meminfo()
                    except OSError as error:
                        diag(
                            "could not read memory information while waiting "
                            f"for recovery: {error}"
                        )
                        break
                    # Recovered means at least half of total memory is
                    # available. An unknown total can never satisfy that, so
                    # do not wait forever on it.
                    if meminfo.total_kb <= 0 or meminfo.available_kb * 2 >= meminfo.total_kb:
                        break
                    diag(
                        "waiting for memory recovery "
                        f"({meminfo.available_kb // 1024} MiB available "
                        f"of {meminfo.total_kb // 1024} MiB)"
                    )
                    time.sleep(memory_poll_interval())
                if USER_TERMINATING:
                    return result.exit_code
                jobs = downscale_jobs(jobs)
                settle_before_restart(context, f"restarting with --jobs={jobs}")
                continue
            if result.restart == "same":
                diag(f"retrying Bazel with same --jobs={jobs}")
                if clean_server_before_same_retry:
                    settle_before_restart(
                        context,
                        f"retrying with same --jobs={jobs}",
                        bazel_path,
                        parsed.action_timeout,
                    )
                    clean_server_before_same_retry = False
                else:
                    settle_before_restart(context, f"retrying with same --jobs={jobs}")
                continue
            if result.restart == "up":
                next_jobs = upscale_jobs(jobs, max_jobs)
                upscale_context = (
                    f"; {result.upscale_description}" if result.upscale_description else ""
                )
                diag(
                    f"upscale: Bazel stopped at --jobs={jobs}; "
                    f"restarting with --jobs={next_jobs}{upscale_context}"
                )
                jobs = next_jobs
                settle_before_restart(context, f"restarting with --jobs={jobs}")
                continue
            return result.exit_code
    finally:
        cleanup_dangling_build_processes(context)


# CLI entry point for resolving Bazel, parsing args, and starting adaptation.
def main(argv: list[str]) -> int:
    wrapper_path = os.path.realpath(__file__)
    # Prefer BAZEL unless it points back at this wrapper, then search PATH.
    bazel_path = os.environ.get(BAZEL_ENV)
    if bazel_path and os.path.realpath(bazel_path) == wrapper_path:
        bazel_path = None
    if not bazel_path:
        for directory in os.environ.get(PATH_ENV, DEFAULT_PATH).split(os.pathsep):
            candidate = os.path.join(directory or os.curdir, "bazel")
            # os.access(X_OK) is also true for searchable directories, so
            # require a regular file; a directory named "bazel" on PATH (or
            # in the current directory for an empty PATH entry) is not Bazel.
            if (
                os.path.isfile(candidate)
                and os.access(candidate, os.X_OK)
                and os.path.realpath(candidate) != wrapper_path
            ):
                bazel_path = candidate
                break
    if not bazel_path:
        candidate = shutil.which("bazel")
        if candidate and os.path.realpath(candidate) != wrapper_path:
            bazel_path = candidate
    # Covers both None and an empty BAZEL value that found no fallback.
    if not bazel_path:
        print(f"{diag_prefix()} could not find real bazel on PATH", file=sys.stderr)
        return 127

    if not argv:
        # Bare invocation: run Bazel with no arguments, then add a hint about
        # the wrapper's own environment variables.
        exit_code = subprocess.run([bazel_path], check=False).returncode
        print(
            f"{diag_prefix()} Set {BUILD_TIMEOUT_ENV}= to control adaptive "
            f"build timeout; set {LOW_MEMORY_THRESHOLD_ENV}= to control "
            f"low-memory detection; defaults are {DEFAULT_ACTION_TIMEOUT_SECONDS}s "
            f"and {DEFAULT_LOW_MEMORY_THRESHOLD_MB} MiB.",
            file=sys.stderr,
            flush=True,
        )
        return normalize_returncode(exit_code)

    # Commands that do not take --jobs replace this process with Bazel.
    if not bazel_command_supports_jobs(argv):
        os.execvpe(bazel_path, [bazel_path, *argv], os.environ)

    try:
        action_timeout = build_timeout_from_env()
        # Called for validation only: a bad threshold value fails here, before
        # Bazel is started.
        low_memory_threshold_kb()
    except ValueError as error:
        print(f"{diag_prefix()} {error}", file=sys.stderr)
        return 2

    parsed = parse_bazel_args(argv, action_timeout)

    # Forward user termination signals to the active Bazel process group.
def write_meminfo(path: Path, total_kb: int, available_kb: int) -> None:
    """Write a three-line meminfo-style file to *path*.

    ``MemFree`` and ``MemAvailable`` are both set to *available_kb*.
    """
    rows = (
        f"MemTotal: {total_kb} kB\n",
        f"MemFree: {available_kb} kB\n",
        f"MemAvailable: {available_kb} kB\n",
    )
    path.write_text("".join(rows), encoding="utf-8")


def process_exists(pid: int) -> bool:
    """Return whether signal 0 can be addressed to *pid*.

    A ``PermissionError`` still counts as existing: the process is there, it
    just belongs to someone else.
    """
    try:
        os.kill(pid, 0)
    except PermissionError:
        return True
    except ProcessLookupError:
        return False
    else:
        return True


@contextmanager
def temporary_env(name: str, value: str):
    """Set ``os.environ[name]`` to *value* for the duration of the ``with`` body.

    On exit the previous value is restored, or the variable is removed again
    if it was not set before.
    """
    previous = os.environ.get(name)
    os.environ[name] = value
    try:
        yield
    finally:
        if previous is not None:
            os.environ[name] = previous
        else:
            os.environ.pop(name, None)
class ParsingTest(unittest.TestCase):
    """Tests for argument parsing, environment settings and duration display.

    Covers ``parse_bazel_args`` / ``bazel_args_with_jobs``, the environment
    readers (build timeout, nice increment, low-memory threshold),
    ``parse_duration_seconds`` and ``rewrite_action_duration_display``.
    """

    def test_parses_integer_jobs_and_uses_environment_timeout(self) -> None:
        # --test_timeout on the command line must not leak into action_timeout.
        parsed = bazel_adaptive.parse_bazel_args(
            ["test", "--jobs=6", "--test_timeout", "999", "//tests/..."],
            action_timeout=42,
        )

        self.assertEqual(parsed.initial_jobs, 6)
        self.assertEqual(parsed.action_timeout, 42)

    def test_build_timeout_from_environment(self) -> None:
        self.assertEqual(
            bazel_adaptive.build_timeout_from_env({"BAZEL_ADAPTIVE_BUILD_TIMEOUT": "17"}),
            17,
        )
        self.assertEqual(bazel_adaptive.build_timeout_from_env({}), 150)
        # Unit suffixes are rejected; only a bare number is accepted.
        with self.assertRaises(ValueError):
            bazel_adaptive.build_timeout_from_env({"BAZEL_ADAPTIVE_BUILD_TIMEOUT": "1m"})

    def test_bazel_nice_increment_from_environment(self) -> None:
        with temporary_env("BAZEL_ADAPTIVE_BAZEL_NICE", "0"):
            self.assertEqual(bazel_adaptive.bazel_nice_increment(), 0)
        with temporary_env("BAZEL_ADAPTIVE_BAZEL_NICE", "7"):
            self.assertEqual(bazel_adaptive.bazel_nice_increment(), 7)
        # Out-of-range values are clamped to 19.
        with temporary_env("BAZEL_ADAPTIVE_BAZEL_NICE", "999"):
            self.assertEqual(bazel_adaptive.bazel_nice_increment(), 19)
        # Unparseable values fall back to the default.
        with temporary_env("BAZEL_ADAPTIVE_BAZEL_NICE", "bad"):
            self.assertEqual(
                bazel_adaptive.bazel_nice_increment(),
                bazel_adaptive.DEFAULT_BAZEL_NICE,
            )

    def test_low_memory_threshold_from_environment(self) -> None:
        # The setting is given in MiB; the function returns KiB.
        self.assertEqual(bazel_adaptive.low_memory_threshold_kb({}), 1024 * 1024)
        self.assertEqual(
            bazel_adaptive.low_memory_threshold_kb(
                {"BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB": "1024"}
            ),
            1024 * 1024,
        )
        with self.assertRaises(ValueError):
            bazel_adaptive.low_memory_threshold_kb(
                {"BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB": "2GB"}
            )

    def test_host_cpus_jobs_expression_sets_initial_cap(self) -> None:
        host_cpus = os.cpu_count() or 1
        half_cpus = max(1, int(host_cpus * 0.5))

        parsed = bazel_adaptive.parse_bazel_args(
            ["test", "--jobs=HOST_CPUS*.5"],
            action_timeout=100,
        )
        self.assertEqual(parsed.initial_jobs, half_cpus)

        parsed = bazel_adaptive.parse_bazel_args(
            ["test", "--jobs", "HOST_CPUS"],
            action_timeout=100,
        )
        self.assertEqual(parsed.initial_jobs, host_cpus)

    def test_host_ram_jobs_expression_sets_initial_cap(self) -> None:
        with tempfile.TemporaryDirectory() as tmp:
            meminfo = Path(tmp) / "meminfo"
            write_meminfo(meminfo, total_kb=10 * 1024 * 1024, available_kb=8 * 1024 * 1024)
            # BAZEL_ADAPTIVE_MEMINFO points the wrapper at the fake meminfo file.
            with temporary_env("BAZEL_ADAPTIVE_MEMINFO", str(meminfo)):
                parsed = bazel_adaptive.parse_bazel_args(
                    ["test", "--jobs=HOST_RAM*.0002"], action_timeout=100
                )
                # 10 * 1024 MiB of total RAM * .0002 truncates to 2.
                self.assertEqual(parsed.initial_jobs, 2)

                parsed = bazel_adaptive.parse_bazel_args(
                    ["test", "--jobs", "HOST_RAM"],
                    action_timeout=100,
                )
                self.assertEqual(parsed.initial_jobs, 10 * 1024)

    def test_unknown_non_integer_jobs_falls_back_to_cpu_count(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(["test", "--jobs=auto"], action_timeout=100)

        self.assertEqual(parsed.initial_jobs, os.cpu_count() or 1)

    def test_rewrites_existing_jobs_for_attempt(self) -> None:
        # Both the "--jobs=N" and the "--jobs N" spellings keep their shape.
        parsed = bazel_adaptive.parse_bazel_args(
            ["test", "--jobs=HOST_CPUS", "//tests/..."], action_timeout=100
        )
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 3),
            ["test", "--jobs=3", "//tests/..."],
        )

        parsed = bazel_adaptive.parse_bazel_args(
            ["test", "--jobs", "HOST_RAM", "//tests/..."], action_timeout=100
        )
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 4),
            ["test", "--jobs", "4", "//tests/..."],
        )

    def test_inserts_jobs_without_parsing_startup_options(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(
            ["--future_startup_option", "value", "test", "//tests/..."],
            action_timeout=100,
        )

        self.assertEqual(parsed.job_locations, [])
        # With no --jobs present, the flag is appended at the end.
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 5),
            ["--future_startup_option", "value", "test", "//tests/...", "--jobs=5"],
        )

    def test_jobs_after_bazel_delimiter_are_target_arguments(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(
            ["run", "//tool", "--", "--jobs=2"],
            action_timeout=100,
        )

        self.assertEqual(parsed.job_locations, [])
        # The inserted flag goes before "--"; the target's own --jobs is untouched.
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 7),
            ["run", "//tool", "--jobs=7", "--", "--jobs=2"],
        )

    def test_does_not_add_jobs_to_commands_without_jobs_flag(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(
            ["query", "deps(//tests:all)"],
            action_timeout=100,
        )

        self.assertFalse(parsed.supports_jobs)
        self.assertEqual(parsed.job_locations, [])
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 7),
            ["query", "deps(//tests:all)"],
        )

    def test_unknown_commands_pass_through_without_jobs(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(
            ["future-command", "--some_flag"],
            action_timeout=100,
        )

        self.assertFalse(parsed.supports_jobs)
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 7),
            ["future-command", "--some_flag"],
        )

    def test_does_not_rewrite_jobs_on_commands_without_jobs_flag(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(
            ["query", "--jobs=99", "deps(//tests:all)"],
            action_timeout=100,
        )

        self.assertFalse(parsed.supports_jobs)
        self.assertEqual(parsed.job_locations, [])
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 7),
            ["query", "--jobs=99", "deps(//tests:all)"],
        )

    def test_recognizes_jobs_commands_after_startup_options(self) -> None:
        parsed = bazel_adaptive.parse_bazel_args(
            ["--future_startup_option", "value", "aquery", "//tests:all"],
            action_timeout=100,
        )

        self.assertTrue(parsed.supports_jobs)
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 7),
            ["--future_startup_option", "value", "aquery", "//tests:all", "--jobs=7"],
        )

    def test_jobs_command_word_as_possible_startup_option_value_passes_through(self) -> None:
        # "run" could be the value of the unknown startup option, so the
        # command is ambiguous and must pass through unchanged.
        parsed = bazel_adaptive.parse_bazel_args(
            ["--future_startup_option", "run"],
            action_timeout=100,
        )

        self.assertFalse(parsed.supports_jobs)
        self.assertEqual(
            bazel_adaptive.bazel_args_with_jobs(parsed, 7),
            ["--future_startup_option", "run"],
        )

    def test_duration_parser(self) -> None:
        self.assertEqual(bazel_adaptive.parse_duration_seconds("Compiling x; 27s sandbox"), 27)
        self.assertEqual(bazel_adaptive.parse_duration_seconds("Compiling x; 27s remote"), 27)
        self.assertEqual(
            bazel_adaptive.parse_duration_seconds(
                "GoCompilePkg //proxylib:go_default_library; 27s remote"
            ),
            27,
        )
        self.assertEqual(bazel_adaptive.parse_duration_seconds("Rustc //crate:lib; 13s worker"), 13)
        self.assertEqual(
            bazel_adaptive.parse_duration_seconds("ProtoCompile //api:v1_proto; 9s linux-sandbox"),
            9,
        )
        # Minute and hour components are folded into seconds.
        self.assertEqual(bazel_adaptive.parse_duration_seconds("Compiling x; 2m13s sandbox"), 133)
        self.assertEqual(
            bazel_adaptive.parse_duration_seconds("Compiling x; 1h02m03s sandbox"),
            3723,
        )

    def test_displayed_action_durations_subtract_paused_time(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_actions_paused(100.0)

        displayed = bazel_adaptive.rewrite_action_duration_display(
            " Compiling tests/a.cc; 56s processwrapper-sandbox\n"
            " Compiling tests/b.cc; 2m13s processwrapper-sandbox\n",
            parser,
            now=130.0,
            paused_count=2,
        )

        # Paused from 100.0 to now=130.0, so each display drops by 30s
        # (56s -> 26s, 2m13s = 133s -> 103s).
        self.assertIn("tests/a.cc; 26s processwrapper-sandbox", displayed)
        self.assertIn("tests/b.cc; 103s processwrapper-sandbox", displayed)

    def test_displayed_action_duration_rewrite_is_limited_to_paused_count(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_actions_paused(100.0)

        displayed = bazel_adaptive.rewrite_action_duration_display(
            " Compiling tests/a.cc; 56s processwrapper-sandbox\n"
            " Compiling tests/b.cc; 55s processwrapper-sandbox\n"
            " Compiling tests/c.cc; 54s processwrapper-sandbox\n",
            parser,
            now=130.0,
            paused_count=2,
        )

        # Only the first paused_count=2 action lines are adjusted.
        self.assertIn("tests/a.cc; 26s processwrapper-sandbox", displayed)
        self.assertIn("tests/b.cc; 25s processwrapper-sandbox", displayed)
        self.assertIn("tests/c.cc; 54s processwrapper-sandbox", displayed)

    def test_displayed_action_duration_rewrite_uses_paused_labels(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_actions_paused(100.0)
        parser.note_paused_labels({"tests/b.cc"}, 100.0)

        displayed = bazel_adaptive.rewrite_action_duration_display(
            " Compiling tests/a.cc; 56s processwrapper-sandbox\n"
            " Compiling tests/b.cc; 55s processwrapper-sandbox\n"
            " Compiling tests/c.cc; 54s processwrapper-sandbox\n",
            parser,
            now=130.0,
            paused_count=2,
            paused_labels={"tests/b.cc"},
        )

        # When labels are given, only the labelled action is adjusted.
        self.assertIn("tests/a.cc; 56s processwrapper-sandbox", displayed)
        self.assertIn("tests/b.cc; 25s processwrapper-sandbox", displayed)
        self.assertIn("tests/c.cc; 54s processwrapper-sandbox", displayed)

    def test_displayed_action_duration_keeps_adjusting_after_resume(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_actions_paused(100.0)
        parser.note_paused_labels({"tests/a.cc"}, 100.0)
        parser.note_actions_resumed(200.0)
        parser.note_paused_labels(set(), 200.0)

        displayed = bazel_adaptive.rewrite_action_duration_display(
            " Compiling tests/a.cc; 101s processwrapper-sandbox\n"
            " Compiling tests/b.cc; 101s processwrapper-sandbox\n",
            parser,
            now=201.0,
            paused_count=0,
        )

        # a.cc was paused for 100s (100.0 -> 200.0); b.cc never was.
        self.assertIn("tests/a.cc; 1s processwrapper-sandbox", displayed)
        self.assertIn("tests/b.cc; 101s processwrapper-sandbox", displayed)

    def test_displayed_action_duration_freezes_when_paused_again(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_actions_paused(100.0)
        parser.note_paused_labels({"tests/a.cc"}, 100.0)
        parser.note_actions_resumed(200.0)
        parser.note_paused_labels(set(), 200.0)
        parser.note_actions_paused(210.0)
        parser.note_paused_labels({"tests/a.cc"}, 210.0)

        displayed = bazel_adaptive.rewrite_action_duration_display(
            " Compiling tests/a.cc; 130s processwrapper-sandbox\n",
            parser,
            now=230.0,
            paused_count=1,
            paused_labels={"tests/a.cc"},
        )

        # 130s shown minus 100s (first pause) minus 20s (second pause so far).
        self.assertIn("tests/a.cc; 10s processwrapper-sandbox", displayed)

    def test_displayed_action_duration_history_is_kept_until_label_is_gone(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_paused_labels({"tests/a.cc"}, 100.0)
        parser.note_paused_labels(set(), 101.0)
        parser.note_live_action_labels({"tests/a.cc"}, 500.0)

        self.assertIn("tests/a.cc", parser.paused_duration_labels())

    def test_displayed_action_duration_history_is_pruned_after_label_is_gone(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        parser.note_paused_labels({"tests/a.cc"}, 100.0)
        parser.note_paused_labels(set(), 101.0)
        parser.note_live_action_labels(set(), 500.0)

        self.assertNotIn("tests/a.cc", parser.paused_duration_labels())

    def test_displayed_action_durations_are_unchanged_without_paused_actions(self) -> None:
        parser = bazel_adaptive.ProgressFrameParser()
        text = " Compiling tests/a.cc; 56s processwrapper-sandbox\n"

        displayed = bazel_adaptive.rewrite_action_duration_display(
            text,
            parser,
            now=130.0,
            paused_count=0,
        )

        self.assertEqual(displayed, text)
class MatchPatternTest(unittest.TestCase):
    """Tests that pin the wrapper's module-level regular expressions.

    Each test applies one compiled pattern to a sample of Bazel output and
    checks the match or the named groups it exposes.
    """

    def test_line_separator_pattern_splits_newline_and_carriage_return(self) -> None:
        self.assertEqual(
            bazel_adaptive.LINE_SEPARATOR_RE.split("first\rsecond\nthird"),
            ["first", "second", "third"],
        )

    def test_ansi_pattern_strips_terminal_control_sequences(self) -> None:
        self.assertEqual(
            bazel_adaptive.ANSI_RE.sub("", "\x1b[32mINFO: Build completed successfully\x1b[0m"),
            "INFO: Build completed successfully",
        )

    def test_progress_header_and_count_patterns_extract_done_and_total(self) -> None:
        line = "[10,586 / 10,588] 13 / 15 tests; 2 actions running"

        self.assertIsNotNone(bazel_adaptive.PROGRESS_HEADER_RE.match(line))
        match = bazel_adaptive.PROGRESS_COUNT_RE.match(line)
        self.assertIsNotNone(match)
        # Narrow the Optional for type checkers; assertIsNotNone already failed otherwise.
        assert match is not None
        # Counts are captured as text, thousands separators included.
        self.assertEqual(match.group("done"), "10,586")
        self.assertEqual(match.group("total"), "10,588")

    def test_running_count_pattern_extracts_each_bazel_form(self) -> None:
        # "<n> actions, <m> running"
        match = bazel_adaptive.RUNNING_COUNT_RE.search("7 actions, 6 running")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("listed_actions"), "7")
        self.assertEqual(match.group("listed_running"), "6")

        # "<n> actions running"
        match = bazel_adaptive.RUNNING_COUNT_RE.search("2 actions running")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("actions_running"), "2")

        # "<n> running"
        match = bazel_adaptive.RUNNING_COUNT_RE.search("6 running")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("running"), "6")

        # "<n> action" (singular, no "running")
        match = bazel_adaptive.RUNNING_COUNT_RE.search("1 action; last test: //tests:foo")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("action_only"), "1")

        # "no actions running" matches without populating any named group.
        match = bazel_adaptive.RUNNING_COUNT_RE.search("no actions running")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertTrue(all(value is None for value in match.groupdict().values()))

    def test_action_duration_pattern_extracts_duration_fields(self) -> None:
        match = bazel_adaptive.ACTION_DURATION_RE.search(
            "GoCompilePkg //proxylib:go_default_library; 2m13s remote"
        )
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("duration"), "2m13s")
        self.assertIsNone(match.group("hours"))
        self.assertEqual(match.group("minutes"), "2")
        self.assertEqual(match.group("seconds"), "13")

        match = bazel_adaptive.ACTION_DURATION_RE.search("Compiling x; 1h02m03s sandbox")
        self.assertIsNotNone(match)
        assert match is not None
        # Leading zeros are preserved in the captured text.
        self.assertEqual(match.group("hours"), "1")
        self.assertEqual(match.group("minutes"), "02")
        self.assertEqual(match.group("seconds"), "03")

    def test_jobs_keyword_pattern_extracts_keyword_and_multiplier(self) -> None:
        match = bazel_adaptive.JOBS_KEYWORD_RE.match("HOST_RAM*.0002")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("keyword"), "HOST_RAM")
        self.assertEqual(match.group("multiplier"), ".0002")

        match = bazel_adaptive.JOBS_KEYWORD_RE.match("HOST_CPUS")
        self.assertIsNotNone(match)
        assert match is not None
        self.assertEqual(match.group("keyword"), "HOST_CPUS")
        self.assertIsNone(match.group("multiplier"))

    def test_failure_patterns_match_documented_examples(self) -> None:
        self.assertIsNotNone(
            bazel_adaptive.MEMORY_KILL_RE.search("ERROR: ... failed: (Killed): clang failed")
        )
        self.assertIsNotNone(
            bazel_adaptive.BUILD_FAILURE_RE.search(
                "ERROR: /tmp/example: Compiling example.cc failed: error executing CppCompile"
            )
        )
        # The per-target summary line must not count as a build failure.
        self.assertIsNone(
            bazel_adaptive.BUILD_FAILURE_RE.search("Target //:envoy_binary_test failed to build")
        )
        self.assertIsNotNone(
            bazel_adaptive.SERVER_ABRUPT_RE.search(
                "Server terminated abruptly (error code: 14, error message: 'Socket closed')"
            )
        )
        self.assertIsNotNone(
            bazel_adaptive.INTERNAL_CRASH_RE.search(
                "FATAL: bazel crashed due to an internal error. Printing stack trace:"
            )
        )
        self.assertIsNotNone(
            bazel_adaptive.JAVA_INTERRUPTED_RE.search(
                "Caused by: java.lang.InterruptedException"
            )
        )
        self.assertIsNotNone(
            bazel_adaptive.BAZEL_USER_INTERRUPT_RE.search(
                "Bazel caught interrupt signal; cancelling pending invocation."
            )
        )
        self.assertIsNotNone(
            bazel_adaptive.BAZEL_USER_INTERRUPT_RE.search("ERROR: build interrupted")
        )

    def test_output_base_log_pattern_extracts_output_base(self) -> None:
        match = bazel_adaptive.OUTPUT_BASE_LOG_RE.search(
            "log file: '/home/user/.cache/bazel/_bazel_user/hash/server/jvm.out'"
        )
        self.assertIsNotNone(match)
        assert match is not None
        # The captured output base is the path with "/server/jvm.out" removed.
        self.assertEqual(match.group("output_base"), "/home/user/.cache/bazel/_bazel_user/hash")
+ ) + ) + self.assertIsNotNone( + bazel_adaptive.BAZEL_USER_INTERRUPT_RE.search("ERROR: build interrupted") + ) + + def test_output_base_log_pattern_extracts_output_base(self) -> None: + match = bazel_adaptive.OUTPUT_BASE_LOG_RE.search( + "log file: '/home/user/.cache/bazel/_bazel_user/hash/server/jvm.out'" + ) + self.assertIsNotNone(match) + assert match is not None + self.assertEqual(match.group("output_base"), "/home/user/.cache/bazel/_bazel_user/hash") + + +class ProgressFrameTest(unittest.TestCase): + def test_all_running_actions_must_be_over_limit(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " GoCompilePkg //proxylib:go_default_library; 12s remote\n" + " Rustc //crate:lib; 8s worker\n" + ) + + self.assertFalse(parser.all_reported_actions_over(10)) + + parser.feed( + "[1 / 4] 2 actions, 2 running\r" + " ProtoCompile //api:v1_proto; 12s linux-sandbox\r" + " GoLink //cmd:proxy; 11s remote\r" + ) + + self.assertTrue(parser.all_reported_actions_over(10)) + + def test_visible_action_sample_can_stand_in_for_hidden_running_actions(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " Compiling a.cc; 12s processwrapper-sandbox\n" + ) + + self.assertTrue(parser.all_reported_actions_over(10)) + + def test_visible_timeout_evidence_can_override_young_process_group_sample(self) -> None: + class YoungGroupSample: + def all_running_action_groups_over(self, _limit_seconds: int, _now: float) -> bool: + return False + + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[5,617 / 5,640] 3 / 15 tests; 9 actions, 6 running\n" + " Compiling tests/cilium_network_policy_test.cc; " + "252s processwrapper-sandbox\n", + now=0.0, + ) + + has_evidence, subject = bazel_adaptive.action_timeout_evidence( + parser, + YoungGroupSample(), + 100, + 0.0, + ) + + self.assertTrue(has_evidence) + self.assertEqual(subject, "all reported running actions") + 
+ def test_partial_action_line_with_duration_counts_for_downscale(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed("[1 / 4] 2 actions, 2 running\n", now=0.0) + parser.feed(" Compiling a.cc; 101s processwrapper-sandbox\n", now=0.0) + parser.feed(" Compiling b.cc; 102s processwrapper-sandbox", now=0.0) + + self.assertTrue(parser.all_reported_actions_over(100, now=0.0)) + + def test_incomplete_progress_frame_can_infer_missing_action_durations(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[10,543 / 10,576] 3 / 15 tests; 12 actions running; " + "last test: //tests:health_check_sink_test\n", + now=0.0, + ) + parser.feed( + " Compiling tests/cilium_network_policy_test.cc; 136s processwrapper-sandbox\n" + " Compiling tests/bpf_metadata_config_test.cc; 131s processwrapper-sandbox\n" + " Compiling tests/bpf_metadata_integration_test.cc; 126s processwrapper-sandbox\n" + " Compiling tests/cilium_http_upstream_integration_test.cc; " + "125s processwrapper-sandbox\n" + " Compiling tests/cilium_tls_tcp_integration_test.cc", + now=0.0, + ) + + self.assertTrue(parser.all_reported_actions_over(100, now=0.0)) + + def test_incomplete_progress_frame_ages_visible_actions_by_wall_clock(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed("[1 / 8] 4 actions running\n", now=0.0) + parser.feed( + " Rustc //crate:lib; 80s remote\n" + " GoCompilePkg //pkg:go_default_library", + now=0.0, + ) + + self.assertFalse(parser.all_reported_actions_over(100, now=10.0)) + self.assertTrue(parser.all_reported_actions_over(100, now=21.0)) + + def test_complete_progress_frame_ages_visible_actions_during_silence(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[10,538 / 10,574] 2 / 14 tests; 7 actions, 6 running\n" + " Compiling tests/cilium_network_policy_test.cc; 76s processwrapper-sandbox\n" + " Compiling tests/bpf_metadata_config_test.cc; 76s processwrapper-sandbox\n" + " Compiling 
tests/bpf_metadata_integration_test.cc; 76s processwrapper-sandbox\n" + " Compiling tests/cilium_tcp_integration_test.cc; 76s processwrapper-sandbox\n" + " Compiling tests/cilium_tcp_integration.cc; 76s processwrapper-sandbox\n" + " Compiling tests/cilium_tcp_integration.cc; 76s processwrapper-sandbox\n", + now=0.0, + ) + + self.assertFalse(parser.all_reported_actions_over(100, now=24.0)) + self.assertTrue(parser.all_reported_actions_over(100, now=25.0)) + + def test_paused_time_does_not_count_toward_action_timeout(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " Compiling a.cc; 90s processwrapper-sandbox\n" + " Compiling b.cc; 90s processwrapper-sandbox\n", + now=90.0, + ) + + parser.note_actions_paused(95.0) + parser.note_actions_resumed(125.0) + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " Compiling a.cc; 130s processwrapper-sandbox\n" + " Compiling b.cc; 130s processwrapper-sandbox\n", + now=130.0, + ) + + self.assertFalse(parser.all_reported_actions_over(100, now=130.0)) + + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " Compiling a.cc; 131s processwrapper-sandbox\n" + " Compiling b.cc; 131s processwrapper-sandbox\n", + now=131.0, + ) + + self.assertTrue(parser.all_reported_actions_over(100, now=131.0)) + + def test_paused_time_does_not_age_actions_during_silent_output(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " Compiling a.cc; 95s processwrapper-sandbox\n" + " Compiling b.cc; 95s processwrapper-sandbox\n", + now=95.0, + ) + + parser.note_actions_paused(96.0) + self.assertFalse(parser.all_reported_actions_over(100, now=200.0)) + parser.note_actions_resumed(200.0) + + self.assertFalse(parser.all_reported_actions_over(100, now=204.0)) + self.assertTrue(parser.all_reported_actions_over(100, now=205.1)) + + def test_incomplete_progress_frame_without_visible_durations_uses_frame_age(self) -> None: + parser = 
bazel_adaptive.ProgressFrameParser() + parser.feed("[1 / 8] 4 actions running\n", now=0.0) + parser.feed(" Compiling tests/cilium_network_policy_test.cc", now=0.0) + + self.assertFalse(parser.all_reported_actions_over(100, now=100.0)) + self.assertTrue(parser.all_reported_actions_over(100, now=101.0)) + + def test_actions_running_header_counts_all_running_actions(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 10] 3 actions running\n" + " Compiling a.cc; 12s processwrapper-sandbox\n" + " Compiling b.cc; 12s processwrapper-sandbox\n" + "[2 / 10] 3 actions running\n" + " Compiling a.cc; 12s processwrapper-sandbox\n" + " Compiling b.cc; 12s processwrapper-sandbox\n" + ) + + self.assertTrue(parser.has_running_actions()) + self.assertTrue(parser.all_reported_actions_over(10)) + self.assertIsNone(parser.upscale_action_skip_reason(15, 2)) + + def test_comma_actions_header_uses_running_count(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[10,501 / 10,575] 1 / 15 tests; 7 actions, 6 running\n" + " Compiling a.cc; 12s processwrapper-sandbox\n" + " Compiling b.cc; 12s processwrapper-sandbox\n" + " Compiling c.cc; 12s processwrapper-sandbox\n" + " Compiling d.cc; 12s processwrapper-sandbox\n" + " Compiling e.cc; 12s processwrapper-sandbox\n" + " Compiling f.cc; 12s processwrapper-sandbox\n" + ) + + self.assertEqual(parser.running_count, 6) + self.assertTrue(parser.all_reported_actions_over(10)) + + def test_non_pty_one_line_progress_extracts_summary_state(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[9,890 / 10,553] Compiling cilium/api/nphds.pb.cc; " + "2s processwrapper-sandbox ... 
(13 actions, 12 running)\n", + now=0.0, + ) + + self.assertEqual(parser.completed_count, 9890) + self.assertEqual(parser.total_count, 10553) + self.assertEqual(parser.running_count, 12) + self.assertEqual(parser.current_action_durations(now=0.0), [2.0]) + self.assertFalse(parser.all_reported_actions_over(100, now=0.0)) + + def test_non_pty_one_line_progress_can_trigger_timeout_downscale(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[9,890 / 10,553] Compiling cilium/api/nphds.pb.cc; " + "101s processwrapper-sandbox ... (13 actions, 12 running)\n", + now=0.0, + ) + + self.assertTrue(parser.all_reported_actions_over(100, now=0.0)) + + def test_non_pty_one_line_progress_can_be_a_cheap_upscale_point(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[9,890 / 10,553] Compiling cilium/api/nphds.pb.cc; " + "0s processwrapper-sandbox ... (12 actions, 11 running)\n" + "[9,891 / 10,553] Compiling cilium/api/npds.pb.cc; " + "5s processwrapper-sandbox ... (13 actions, 12 running)\n", + now=0.0, + ) + + self.assertTrue(parser.meaningful_work_done) + self.assertIsNone(parser.upscale_action_skip_reason(15, 2, now=0.0)) + + def test_recent_completed_progress_is_tracked(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[9,890 / 10,553] Compiling a.cc; 1s processwrapper-sandbox " + "... (13 actions, 12 running)\n", + now=10.0, + ) + parser.feed( + "[9,891 / 10,553] Compiling b.cc; 1s processwrapper-sandbox " + "... (13 actions, 12 running)\n", + now=20.0, + ) + + self.assertTrue(parser.completed_progress_recent(now=50.0, window_seconds=31.0)) + self.assertFalse(parser.completed_progress_recent(now=52.0, window_seconds=31.0)) + + def test_mid_build_running_count_fluctuation_is_not_winding_down(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[10,429 / 10,553] Compiling source/common/upstream/upstream_impl.cc; " + "11s processwrapper-sandbox ... 
(12 actions, 11 running)\n" + "[10,429 / 10,553] Compiling source/common/upstream/upstream_impl.cc; " + "13s processwrapper-sandbox ... (13 actions, 12 running)\n" + "[10,430 / 10,553] Compiling source/common/upstream/upstream_impl.cc; " + "15s processwrapper-sandbox ... (12 actions, 11 running)\n" + "[10,430 / 10,553] Compiling source/common/upstream/upstream_impl.cc; " + "16s processwrapper-sandbox ... (13 actions, 12 running)\n" + "[10,441 / 10,553] Compiling source/extensions/upstreams/http/generic/config.cc; " + "9s processwrapper-sandbox ... (13 actions, 12 running)\n" + "[10,442 / 10,553] Compiling source/extensions/clusters/logical_dns/" + "logical_dns_cluster.cc; 10s processwrapper-sandbox ... " + "(12 actions, 11 running)\n", + now=0.0, + ) + + self.assertEqual(parser.running_count, 11) + self.assertTrue(parser.running_count_decreased) + self.assertIsNone(parser.upscale_action_skip_reason(15, 2, now=0.0)) + + def test_non_pty_one_line_test_progress_extracts_summary_state(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[8,182 / 8,369] Compiling source/common/listener_manager/" + "listener_manager_impl.cc; 18s processwrapper-sandbox ... " + "(12 actions, 11 running)\n" + "[8,183 / 8,369] Compiling source/common/listener_manager/" + "listener_manager_impl.cc; 19s processwrapper-sandbox ... " + "(13 actions, 12 running)\n" + "[8,190 / 8,369] 1 / 15 tests; Compiling test/mocks/upstream/" + "host_set.cc; 15s processwrapper-sandbox ... (13 actions, 12 running)\n" + "[8,191 / 8,369] 1 / 15 tests; Compiling source/common/tcp_proxy/" + "upstream.cc; 15s processwrapper-sandbox ... (13 actions, 12 running)\n" + "[8,192 / 8,369] 1 / 15 tests; Compiling test/mocks/upstream/" + "cluster_info.cc; 15s processwrapper-sandbox ... (12 actions, 11 running)\n" + "[8,195 / 8,369] 1 / 15 tests; Compiling test/mocks/upstream/" + "cluster_info.cc; 23s processwrapper-sandbox ... 
(13 actions, 12 running)\n", + now=0.0, + ) + + self.assertEqual(parser.completed_count, 8195) + self.assertEqual(parser.total_count, 8369) + self.assertEqual(parser.running_count, 12) + self.assertEqual(parser.current_action_durations(now=0.0), [23.0]) + self.assertTrue(parser.meaningful_work_done) + self.assertTrue(parser.current_frame_has_summary_duration) + self.assertTrue(parser.all_reported_actions_over(20, now=0.0)) + + def test_upscale_action_age_guard(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 10] 2 actions, 2 running\n" + " Compiling a.cc; 14s processwrapper-sandbox\n" + " Compiling b.cc; 3s processwrapper-sandbox\n" + "[2 / 10] 2 actions, 2 running\n" + " Compiling a.cc; 14s remote\n" + " Compiling b.cc; 3s remote\n" + ) + self.assertIsNone(parser.upscale_action_skip_reason(15, 2)) + self.assertEqual( + parser.current_action_age_description(), + "oldest current action 14s", + ) + + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " Compiling a.cc; 15s processwrapper-sandbox\n" + " Compiling b.cc; 3s processwrapper-sandbox\n" + ) + self.assertIn( + "oldest current running action is 15s", + parser.upscale_action_skip_reason(15, 2), + ) + + def test_upscale_action_age_uses_wall_clock_when_bazel_is_silent(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 10] 2 actions, 2 running\n" + " Compiling a.cc; 0s processwrapper-sandbox\n" + " Compiling b.cc; 0s processwrapper-sandbox\n" + "[2 / 10] 2 actions, 2 running\n" + " Compiling a.cc; 0s processwrapper-sandbox\n" + " Compiling b.cc; 0s processwrapper-sandbox\n", + now=0.0, + ) + + self.assertIsNone(parser.upscale_action_skip_reason(15, 2, now=14.0)) + self.assertIn( + "oldest current running action is 15s", + parser.upscale_action_skip_reason(15, 2, now=15.0), + ) + + def test_upscale_requires_meaningful_work(self) -> None: + parser = bazel_adaptive.ProgressFrameParser() + parser.feed( + "[1 / 4] 2 actions, 2 running\n" + " 
def test_upscale_skips_when_actions_are_winding_down(self) -> None:
    """Expect a skip reason while running actions drop, then reach zero.

    The first chunk shows the running count going from 3 to 2; the second
    reports no running actions. Each state must yield its own reason text.
    """
    shrinking_frames = (
        "[1 / 10] 3 actions, 3 running\n"
        " Compiling a.cc; 1s remote\n"
        " Compiling b.cc; 1s remote\n"
        " Compiling c.cc; 1s remote\n"
        "[2 / 10] 2 actions, 2 running\n"
        " Compiling b.cc; 1s remote\n"
        " Compiling c.cc; 1s remote\n"
    )
    idle_frame = "[3 / 4] no actions running\n"
    # (chunk to feed, fragment the skip reason must contain afterwards)
    steps = (
        (shrinking_frames, "running action count is decreasing"),
        (idle_frame, "no actions are currently running"),
    )

    progress = bazel_adaptive.ProgressFrameParser()
    for chunk, expected_fragment in steps:
        progress.feed(chunk)
        self.assertIn(expected_fragment, progress.upscale_action_skip_reason(15, 2))
def test_upscale_skips_compact_near_finish_without_running_count(self) -> None:
    """Expect the near-finish skip reason from frames lacking a running count.

    Neither compact frame carries an "N running" summary, so the parser's
    ``running_count`` must stay ``None`` while the skip reason still reports
    that only one action remains.
    """
    linking_frame = (
        "[5,641 / 5,642] 14 / 15 tests; [Prepa] "
        "Linking tests/cilium_websocket_codec_integration_test\n"
    )
    testing_frame = (
        "[5,642 / 5,643] 14 / 15 tests; "
        "Testing //tests:cilium_websocket_codec_integration_test; "
        "0s processwrapper-sandbox\n"
    )
    progress = bazel_adaptive.ProgressFrameParser()
    progress.feed(linking_frame + testing_frame)

    self.assertIsNone(progress.running_count)
    skip_reason = progress.upscale_action_skip_reason(15, 2)
    self.assertIn("only 1 action(s) remain", skip_reason)
class StreamForwardingTest(unittest.TestCase):
    """Exercise ``drain_ready_streams`` against a real OS pipe."""

    def _drain_once(self, payload: bytes, *drain_args) -> tuple:
        """Write *payload* to a pipe, drain it once, and tear everything down.

        The read end is registered with a fresh selector and a ``StreamTarget``
        that writes into an in-memory buffer. Extra positional arguments are
        appended to the ``drain_ready_streams`` call.

        Returns ``(forwarded_bytes, parser)``: the buffer contents captured
        before teardown and the ``ProgressFrameParser`` that saw the data.
        """
        read_fd, write_fd = os.pipe()
        selector = None
        try:
            sink = io.BytesIO()
            selector = bazel_adaptive.selectors.DefaultSelector()
            target = bazel_adaptive.StreamTarget(sink, lambda: os.close(read_fd))
            selector.register(read_fd, bazel_adaptive.selectors.EVENT_READ, target)
            os.write(write_fd, payload)
            frames = bazel_adaptive.ProgressFrameParser()

            bazel_adaptive.drain_ready_streams(selector, frames, 1.0, *drain_args)

            return sink.getvalue(), frames
        finally:
            os.close(write_fd)
            if selector is not None:
                selector.close()
            # The StreamTarget callback may already have closed the read end.
            try:
                os.close(read_fd)
            except OSError:
                pass

    def test_drain_ready_streams_forwards_partial_lines_immediately(self) -> None:
        """A chunk without a newline is forwarded and kept in the parser buffer."""
        forwarded, frames = self._drain_once(b"partial Bazel progress without newline")

        self.assertEqual(forwarded, b"partial Bazel progress without newline")
        self.assertEqual(frames._buffer, "partial Bazel progress without newline")

    def test_drain_ready_streams_rewrites_paused_action_counts(self) -> None:
        """Forwarded summaries gain a paused count when the callback returns 10.

        The parser itself must still see the unmodified running count (12).
        """
        payload = (
            b"[1 / 4] Compiling a.cc; 1s processwrapper-sandbox ... "
            b"(12 actions, 11 running)\n"
            b"[2 / 4] Compiling b.cc; 1s processwrapper-sandbox ... "
            b"(8 actions running)\n"
            b"[3 / 4] 8 actions running\n"
            b"[4 / 4] 13 actions, 12 running\n"
        )
        forwarded, frames = self._drain_once(payload, lambda: 10)

        displayed = forwarded.decode("utf-8")
        expected_fragments = (
            "(12 actions, 10 paused, 1 running)",
            "(8 actions, 8 paused, 0 running)",
            "[3 / 4] 8 actions, 8 paused, 0 running",
            "[4 / 4] 13 actions, 10 paused, 2 running",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, displayed)
        self.assertEqual(frames.running_count, 12)
def test_recent_average_controls_upscale_decision(self) -> None:
    """Upscale and same-jobs retry flip once a low-memory sample arrives.

    With two samples at 5 GiB available (of 8 GiB) there is no skip reason
    and a retry at the same job count is allowed. Appending a 512 MiB sample
    taken at ``now`` must produce the low-memory skip reason and forbid it.
    """
    total_kb = 8 * 1024 * 1024
    now = 100.0
    monitor = bazel_adaptive.MemoryPressureMonitor(poll_interval=1.0)

    def sample(age: float, available_kb: int) -> tuple:
        # One (timestamp, MemInfo) entry taken `age` seconds before `now`.
        reading = bazel_adaptive.MemInfo(total_kb=total_kb, available_kb=available_kb)
        return (now - age, reading)

    def skip_reason():
        return monitor.upscale_skip_reason(
            now,
            running_actions_seconds=30.0,
            required_running_actions_seconds=30.0,
        )

    monitor.samples = [sample(age, 5 * 1024 * 1024) for age in (29, 10)]
    monitor.last = monitor.samples[-1][1]

    self.assertIsNone(skip_reason())
    self.assertTrue(monitor.retry_same_jobs_after_failure(now))

    monitor.samples.append(sample(0, 512 * 1024))
    monitor.last = monitor.samples[-1][1]

    self.assertIn("memory dipped below low-memory threshold", skip_reason())
    self.assertFalse(monitor.retry_same_jobs_after_failure(now))
def test_renice_build_processes_updates_detected_children(self) -> None:
    """Only processes not already at the configured nice value get reniced.

    ``dangling_build_processes`` is stubbed to report three processes whose
    fourth field is 0, 5 and ``None`` (presumably the current nice value --
    confirm against ``ProcessInfo``). With ``BAZEL_ADAPTIVE_BAZEL_NICE=5`` the
    test expects ``os.setpriority`` calls for pids 101 and 103 only.

    The environment variable is set through ``temporary_env``, the helper the
    other tests in this file use, instead of a hand-written save/restore.
    NOTE(review): this assumes ``temporary_env`` restores the previous state
    (including "unset") on exit -- confirm against its definition.
    """
    saved_dangling = bazel_adaptive.dangling_build_processes
    saved_setpriority = bazel_adaptive.os.setpriority
    calls: list[tuple[int, int, int]] = []
    try:
        bazel_adaptive.dangling_build_processes = lambda _context: [
            bazel_adaptive.ProcessInfo(101, "clang", "/tmp/out", 0, 10),
            bazel_adaptive.ProcessInfo(102, "clang", "/tmp/out", 5, 20),
            bazel_adaptive.ProcessInfo(103, "clang", "/tmp/out", None, 30),
        ]
        # Record the requested priority changes instead of applying them.
        bazel_adaptive.os.setpriority = (
            lambda which, pid, priority: calls.append((which, pid, priority))
        )

        with temporary_env("BAZEL_ADAPTIVE_BAZEL_NICE", "5"):
            bazel_adaptive.renice_build_processes(bazel_adaptive.BuildContext("/tmp/work"))

        self.assertEqual(
            calls,
            [
                (os.PRIO_PROCESS, 101, 5),
                (os.PRIO_PROCESS, 103, 5),
            ],
        )
    finally:
        bazel_adaptive.dangling_build_processes = saved_dangling
        bazel_adaptive.os.setpriority = saved_setpriority
def test_bazel_servers_for_workspace_are_scoped_to_wrapper_cgroup(self) -> None:
    """Only the server sharing the wrapper's cgroup is returned.

    Two fake server processes use the same workspace directory but different
    output bases and cgroups; the lookup for the wrapper's cgroup must return
    pid 401 with output base ``/tmp/out-a``.
    """
    wrapper_cgroup = (("", "/docker/build-a"),)
    foreign_cgroup = (("", "/docker/build-b"),)
    fake_procs = {
        401: bazel_adaptive.ProcessInfo(
            401,
            "java -jar A-server.jar --workspace_directory=/tmp/work "
            "--output_base=/tmp/out-a",
            None,
            0,
            10,
            cgroups=wrapper_cgroup,
        ),
        402: bazel_adaptive.ProcessInfo(
            402,
            "java -jar A-server.jar --workspace_directory=/tmp/work "
            "--output_base=/tmp/out-b",
            None,
            0,
            20,
            cgroups=foreign_cgroup,
        ),
    }
    # Cleanups run last-in-first-out, so proc_pids is restored first.
    self.addCleanup(setattr, bazel_adaptive, "process_info", bazel_adaptive.process_info)
    self.addCleanup(setattr, bazel_adaptive, "proc_pids", bazel_adaptive.proc_pids)
    bazel_adaptive.proc_pids = lambda: sorted(fake_procs)
    bazel_adaptive.process_info = lambda pid: fake_procs.get(pid)

    found = bazel_adaptive.bazel_servers_for_workspace("/tmp/work", wrapper_cgroup)

    self.assertEqual([server.pid for server in found], [401])
    self.assertEqual(found[0].output_base, "/tmp/out-a")
def test_action_throttler_pauses_youngest_and_resumes_oldest(self) -> None:
    """Falling memory stops only the "young" group and prints nothing.

    Available memory steps 5, 4, 3, 2 GiB (of 8 GiB) through
    ``pause_if_needed`` and then 3 GiB through ``resume_if_needed``. The test
    expects exactly one SIGSTOP, sent to pid 304, and that group to remain
    paused afterwards.
    """
    gib_kb = 1024 * 1024

    def mem(available_gib: int):
        return bazel_adaptive.MemInfo(total_kb=8 * gib_kb, available_kb=available_gib * gib_kb)

    action_groups = [
        bazel_adaptive.ActionProcessGroup("old", [301, 302], 100),
        bazel_adaptive.ActionProcessGroup("middle", [303], 200),
        bazel_adaptive.ActionProcessGroup("young", [304], 300, {"tests/young.cc"}),
    ]
    sent: list[tuple[int, int]] = []
    saved = (bazel_adaptive.build_process_groups, bazel_adaptive.os.kill, sys.stderr)
    try:
        captured = io.StringIO()
        sys.stderr = captured
        bazel_adaptive.build_process_groups = lambda _context: action_groups
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        # NOTE(review): the flattened source does not show where this `with`
        # block ends; the calls and assertions are kept inside it -- confirm.
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "2048"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )
            for available_gib in (5, 4, 3, 2):
                throttler.pause_if_needed(mem(available_gib))
            throttler.resume_if_needed(mem(3))

            self.assertEqual(sent, [(304, signal.SIGSTOP)])
            self.assertEqual(throttler.paused_keys, {"young"})
            self.assertEqual(throttler.paused_pids, {"young": {304}})
            self.assertEqual(throttler.paused_labels(), {"tests/young.cc"})
            self.assertEqual(captured.getvalue(), "")
    finally:
        bazel_adaptive.build_process_groups, bazel_adaptive.os.kill, sys.stderr = saved
def test_action_throttler_waits_for_swap_in_budget_before_resume(self) -> None:
    """A paused group with swapped-out memory is resumed exactly once.

    ``process_swap_kb`` is stubbed to report 3 GiB for every pid. After a
    pause at 4 GiB available, resume attempts at 5 GiB and 6 GiB must together
    produce a single SIGCONT for pid 702 and leave nothing paused.
    """
    gib_kb = 1024 * 1024

    def mem(available_gib: int):
        return bazel_adaptive.MemInfo(total_kb=8 * gib_kb, available_kb=available_gib * gib_kb)

    action_groups = [
        bazel_adaptive.ActionProcessGroup("old", [701], 100),
        bazel_adaptive.ActionProcessGroup("young", [702], 200),
    ]
    sent: list[tuple[int, int]] = []
    saved = (
        bazel_adaptive.build_process_groups,
        bazel_adaptive.process_swap_kb,
        bazel_adaptive.os.kill,
        sys.stderr,
    )
    try:
        sys.stderr = io.StringIO()
        bazel_adaptive.build_process_groups = lambda _context: action_groups
        bazel_adaptive.process_swap_kb = lambda _pid: 3 * 1024 * 1024
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        # NOTE(review): the flattened source does not show where this `with`
        # block ends; the calls and assertions are kept inside it -- confirm.
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "2048"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )
            throttler.pause_if_needed(mem(4))
            for available_gib in (5, 6):
                throttler.resume_if_needed(mem(available_gib))

            self.assertEqual(sent, [(702, signal.SIGSTOP), (702, signal.SIGCONT)])
            self.assertEqual(throttler.paused_keys, set())
    finally:
        (
            bazel_adaptive.build_process_groups,
            bazel_adaptive.process_swap_kb,
            bazel_adaptive.os.kill,
            sys.stderr,
        ) = saved
def test_action_throttler_waits_for_resumed_memory_to_settle(self) -> None:
    """Two paused groups are resumed by three attempts, not all at once.

    With all 8 GiB available, three ``resume_if_needed`` calls are made. The
    reported groups change after the first call ("middle" no longer in state
    "T"), and ``next_normal_resume_at`` is moved
    ``RESUME_MEMORY_SETTLE_SECONDS`` into the past before the third. The test
    expects SIGCONT for pid 714 and then pid 715, with nothing left paused.
    """
    def plenty():
        # A fresh reading per call: all 8 GiB available.
        return bazel_adaptive.MemInfo(
            total_kb=8 * 1024 * 1024,
            available_kb=8 * 1024 * 1024,
        )

    # Rebound below; the stub looks the name up on every call.
    live_groups = [
        bazel_adaptive.ActionProcessGroup("old", [713], 100),
        bazel_adaptive.ActionProcessGroup("middle", [714], 200, states={"T"}),
        bazel_adaptive.ActionProcessGroup("young", [715], 300, states={"T"}),
    ]
    sent: list[tuple[int, int]] = []
    saved = (
        bazel_adaptive.build_process_groups,
        bazel_adaptive.process_swap_kb,
        bazel_adaptive.os.kill,
        sys.stderr,
    )
    try:
        sys.stderr = io.StringIO()
        bazel_adaptive.build_process_groups = lambda _context: live_groups
        bazel_adaptive.process_swap_kb = lambda _pid: 0
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        # NOTE(review): the flattened source does not show where this `with`
        # block ends; the calls and assertions are kept inside it -- confirm.
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "2048"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )
            throttler.paused_keys = {"middle", "young"}
            throttler.paused_pids = {"middle": {714}, "young": {715}}

            throttler.resume_if_needed(plenty())
            live_groups = [
                bazel_adaptive.ActionProcessGroup("old", [713], 100),
                bazel_adaptive.ActionProcessGroup("middle", [714], 200),
                bazel_adaptive.ActionProcessGroup("young", [715], 300, states={"T"}),
            ]
            throttler.resume_if_needed(plenty())
            settle_seconds = bazel_adaptive.RESUME_MEMORY_SETTLE_SECONDS
            throttler.next_normal_resume_at = time.monotonic() - settle_seconds
            throttler.resume_if_needed(plenty())

            self.assertEqual(sent, [(714, signal.SIGCONT), (715, signal.SIGCONT)])
            self.assertEqual(throttler.paused_keys, set())
    finally:
        (
            bazel_adaptive.build_process_groups,
            bazel_adaptive.process_swap_kb,
            bazel_adaptive.os.kill,
            sys.stderr,
        ) = saved
def test_action_throttler_resumes_one_group_if_all_current_groups_are_paused(self) -> None:
    """With every group marked paused, one is resumed despite low memory.

    Both groups start in ``paused_keys``. A resume attempt at 1 GiB available
    (of 8 GiB) must send SIGCONT to pid 721, leave only "young" paused, and
    explain the decision on stderr.
    """
    action_groups = [
        bazel_adaptive.ActionProcessGroup("old", [721], 100),
        bazel_adaptive.ActionProcessGroup("young", [722], 200),
    ]
    sent: list[tuple[int, int]] = []
    saved = (bazel_adaptive.build_process_groups, bazel_adaptive.os.kill, sys.stderr)
    try:
        captured = io.StringIO()
        sys.stderr = captured
        bazel_adaptive.build_process_groups = lambda _context: action_groups
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        # NOTE(review): the flattened source does not show where this `with`
        # block ends; the calls and assertions are kept inside it -- confirm.
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "2048"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )
            throttler.paused_keys = {"old", "young"}
            throttler.paused_pids = {"old": {721}, "young": {722}}

            tight = bazel_adaptive.MemInfo(
                total_kb=8 * 1024 * 1024,
                available_kb=1 * 1024 * 1024,
            )
            throttler.resume_if_needed(tight)

            self.assertEqual(sent, [(721, signal.SIGCONT)])
            self.assertEqual(throttler.paused_keys, {"young"})
            self.assertIn(
                "to keep at least one action group running",
                captured.getvalue(),
            )
    finally:
        bazel_adaptive.build_process_groups, bazel_adaptive.os.kill, sys.stderr = saved
def test_action_throttler_ignores_non_action_helpers_for_running_floor(self) -> None:
    """A sleeping ``pid:`` group does not count as a running action.

    The only sandbox group is reported in state "T" next to a ``pid:900``
    group in state "S". A pause check at 1 GiB available (of 8 GiB) must send
    SIGCONT to pid 741 and explain the decision on stderr.
    """
    action_groups = [
        bazel_adaptive.ActionProcessGroup(
            "processwrapper-sandbox/1",
            [741],
            100,
            states={"T"},
        ),
        bazel_adaptive.ActionProcessGroup("pid:900", [900], 200, states={"S"}),
    ]
    sent: list[tuple[int, int]] = []
    saved = (bazel_adaptive.build_process_groups, bazel_adaptive.os.kill, sys.stderr)
    try:
        captured = io.StringIO()
        sys.stderr = captured
        bazel_adaptive.build_process_groups = lambda _context: action_groups
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        # NOTE(review): the flattened source does not show where this `with`
        # block ends; the calls and assertions are kept inside it -- confirm.
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "2048"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )
            tight = bazel_adaptive.MemInfo(
                total_kb=8 * 1024 * 1024,
                available_kb=1 * 1024 * 1024,
            )
            throttler.pause_if_needed(tight)

            self.assertEqual(sent, [(741, signal.SIGCONT)])
            self.assertIn(
                "to keep at least one action group running",
                captured.getvalue(),
            )
    finally:
        bazel_adaptive.build_process_groups, bazel_adaptive.os.kill, sys.stderr = saved
def test_timeout_downscale_defers_after_recent_progress_without_io_stall(self) -> None:
    """Check the defer reason for a timeout downscale right after progress.

    Two progress frames are fed ten seconds apart, the second with a higher
    completed-action count. At the time of the second frame the test expects
    timeout evidence to be reported and a defer reason to be returned; once
    ``current_io_stall_observed`` is set on the throttler, the defer reason
    must be ``None``.
    """
    frames = (
        (
            "[5,617 / 5,640] 3 / 15 tests; Compiling a.cc; "
            "101s processwrapper-sandbox ... (9 actions, 6 running)\n",
            1000.0,
        ),
        (
            "[5,618 / 5,640] 3 / 15 tests; Compiling b.cc; "
            "101s processwrapper-sandbox ... (9 actions, 6 running)\n",
            1010.0,
        ),
    )
    parser = bazel_adaptive.ProgressFrameParser()
    for text, stamp in frames:
        parser.feed(text, now=stamp)
    context = bazel_adaptive.BuildContext("/tmp/work")
    throttler = bazel_adaptive.ActionThrottler(context)

    evidence = bazel_adaptive.action_timeout_evidence(parser, throttler, 100, 1010.0)
    self.assertTrue(evidence[0])
    reason_without_stall = bazel_adaptive.timeout_downscale_defer_reason(
        parser, throttler, 100, 1010.0
    )
    self.assertIsNotNone(reason_without_stall)

    # Same inputs, but with an I/O stall flagged on the throttler.
    throttler.current_io_stall_observed = True
    reason_with_stall = bazel_adaptive.timeout_downscale_defer_reason(
        parser, throttler, 100, 1010.0
    )
    self.assertIsNone(reason_with_stall)
def test_action_throttler_repeatedly_halves_floor_while_io_stall_remains(self) -> None:
    """Check that the pause floor steps 6 -> 3 -> 2 -> 1 as the fake clock advances.

    Twelve groups in state ``D`` are reported on every poll and available
    memory stays at 512 MiB against a 1024 MiB threshold. The fake monotonic
    clock is moved to 11 s, 21 s and 31 s; after twenty polls at each step the
    test checks ``io_stall_floor_groups`` and the number of unpaused groups,
    then checks that each lowering was announced on stderr.
    """
    saved_groups_fn = bazel_adaptive.build_process_groups
    saved_kill = bazel_adaptive.os.kill
    saved_stderr = sys.stderr
    saved_monotonic = bazel_adaptive.time.monotonic
    sent: list[tuple[int, int]] = []
    # Mutable cell read by the patched time.monotonic.
    clock = {"now": 0.0}
    blocked_groups = [
        bazel_adaptive.ActionProcessGroup(f"group-{n}", [920 + n], n, states={"D"})
        for n in range(12)
    ]
    # (fake time, expected floor, expected running groups)
    schedule = (
        (11.0, 3, 3),
        (21.0, 2, 2),
        (31.0, 1, 1),
    )
    expected_notices = (
        "lowering pause floor from 6 to 3 running action group(s)",
        "lowering pause floor from 3 to 2 running action group(s)",
        "lowering pause floor from 2 to 1 running action group(s)",
    )
    try:
        captured = io.StringIO()
        sys.stderr = captured
        bazel_adaptive.build_process_groups = lambda _context: blocked_groups
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        bazel_adaptive.time.monotonic = lambda: clock["now"]
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "1024"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )
            throttler.io_stall_started_at = 0.0

            for tick, floor, still_running in schedule:
                clock["now"] = tick
                for _ in range(20):
                    throttler.pause_if_needed(
                        bazel_adaptive.MemInfo(
                            total_kb=8 * 1024 * 1024,
                            available_kb=512 * 1024,
                        )
                    )
                self.assertEqual(throttler.io_stall_floor_groups, floor)
                self.assertEqual(
                    len(blocked_groups) - len(throttler.paused_keys),
                    still_running,
                )

        for notice in expected_notices:
            self.assertIn(notice, captured.getvalue())
    finally:
        sys.stderr = saved_stderr
        bazel_adaptive.time.monotonic = saved_monotonic
        bazel_adaptive.os.kill = saved_kill
        bazel_adaptive.build_process_groups = saved_groups_fn
def test_action_throttler_halves_floor_after_sustained_swap_io(self) -> None:
    """Check that the pause floor drops from 6 to 3 while fake swap-in keeps growing.

    ``read_swap_io`` is stubbed so ``pages_in`` equals ``int(now * 20000)``
    for the fake monotonic clock. After a first poll at 0 s and twenty polls
    at 1 s, six of the twelve groups must still be running. The clock then
    starts at 11.5 s and advances 0.1 s per poll for twenty polls, after which
    ``io_stall_floor_groups`` must be 3 with three groups running.
    """
    saved_groups_fn = bazel_adaptive.build_process_groups
    saved_kill = bazel_adaptive.os.kill
    saved_stderr = sys.stderr
    saved_monotonic = bazel_adaptive.time.monotonic
    saved_read_swap_io = bazel_adaptive.read_swap_io
    sent: list[tuple[int, int]] = []
    # Mutable cell shared by the fake clock and the fake swap counter.
    clock = {"now": 0.0}
    fake_groups = [
        bazel_adaptive.ActionProcessGroup(f"group-{n}", [960 + n], n)
        for n in range(12)
    ]

    def fake_swap_io() -> bazel_adaptive.SwapIo:
        return bazel_adaptive.SwapIo(pages_in=int(clock["now"] * 20000))

    def poll(throttler: object) -> None:
        # Every poll reports 512 MiB available out of 8 GiB.
        throttler.pause_if_needed(
            bazel_adaptive.MemInfo(
                total_kb=8 * 1024 * 1024,
                available_kb=512 * 1024,
            )
        )

    try:
        sys.stderr = io.StringIO()
        bazel_adaptive.build_process_groups = lambda _context: fake_groups
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        bazel_adaptive.time.monotonic = lambda: clock["now"]
        bazel_adaptive.read_swap_io = fake_swap_io
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "1024"):
            throttler = bazel_adaptive.ActionThrottler(
                bazel_adaptive.BuildContext("/tmp/work")
            )

            clock["now"] = 0.0
            poll(throttler)
            clock["now"] = 1.0
            for _ in range(20):
                poll(throttler)
            self.assertEqual(len(fake_groups) - len(throttler.paused_keys), 6)

            clock["now"] = 11.5
            for _ in range(20):
                poll(throttler)
                clock["now"] += 0.1

        self.assertEqual(throttler.io_stall_floor_groups, 3)
        self.assertEqual(len(fake_groups) - len(throttler.paused_keys), 3)
    finally:
        sys.stderr = saved_stderr
        bazel_adaptive.read_swap_io = saved_read_swap_io
        bazel_adaptive.time.monotonic = saved_monotonic
        bazel_adaptive.os.kill = saved_kill
        bazel_adaptive.build_process_groups = saved_groups_fn
def test_action_throttler_lowers_threshold_after_stalls_clear(self) -> None:
    """Check that a raised threshold steps down from 1536 to 1280 MiB on resume.

    Both reported groups are in state ``R``. The throttler is primed with
    ``effective_threshold_kb`` at 1536 MiB and ``next_threshold_lower_at`` at
    0.0, then ``resume_if_needed`` is called with 4 GiB available. The test
    expects ``low_memory_threshold_kb()`` to return 1280 MiB and the lowering
    to be announced on stderr.
    """
    saved_groups_fn = bazel_adaptive.build_process_groups
    saved_kill = bazel_adaptive.os.kill
    saved_stderr = sys.stderr
    running_groups = [
        bazel_adaptive.ActionProcessGroup("old", [811], 100, states={"R"}),
        bazel_adaptive.ActionProcessGroup("young", [812], 200, states={"R"}),
    ]
    try:
        captured = io.StringIO()
        sys.stderr = captured
        # Signals are swallowed; this test does not inspect them.
        bazel_adaptive.os.kill = lambda _pid, _sig: None
        bazel_adaptive.build_process_groups = lambda _context: running_groups
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "1024"):
            context = bazel_adaptive.BuildContext("/tmp/work")
            throttler = bazel_adaptive.ActionThrottler(context)
            throttler.effective_threshold_kb = 1536 * 1024
            throttler.next_threshold_lower_at = 0.0

            plenty_available = bazel_adaptive.MemInfo(
                total_kb=8 * 1024 * 1024,
                available_kb=4 * 1024 * 1024,
            )
            throttler.resume_if_needed(plenty_available)

        self.assertEqual(throttler.low_memory_threshold_kb(), 1280 * 1024)
        diagnostics = captured.getvalue()
        self.assertIn(
            "lowering low-memory threshold from 1536 to 1280 MiB",
            diagnostics,
        )
    finally:
        sys.stderr = saved_stderr
        bazel_adaptive.os.kill = saved_kill
        bazel_adaptive.build_process_groups = saved_groups_fn
def test_action_throttler_resume_all_uses_remembered_pids(self) -> None:
    """Check that ``resume_all`` continues a paused pid after the listing goes empty.

    One ``pause_if_needed`` call (threshold 2048 MiB, 4 GiB available) is
    expected to stop pid 502. ``build_process_groups`` is then replaced with a
    stub returning no groups before ``resume_all`` runs. The test expects pid
    502 to receive ``SIGCONT`` anyway, ``resume_all`` to return 1, and both
    ``paused_keys`` and ``paused_pids`` to end up empty.
    """
    saved_groups_fn = bazel_adaptive.build_process_groups
    saved_kill = bazel_adaptive.os.kill
    saved_stderr = sys.stderr
    sent: list[tuple[int, int]] = []
    initial_groups = [
        bazel_adaptive.ActionProcessGroup("old", [501], 100),
        bazel_adaptive.ActionProcessGroup("young", [502], 200),
    ]
    expected_signals = [
        (502, signal.SIGSTOP),
        (502, signal.SIGCONT),
    ]
    try:
        sys.stderr = io.StringIO()
        bazel_adaptive.os.kill = lambda pid, sig: sent.append((pid, sig))
        bazel_adaptive.build_process_groups = lambda _context: initial_groups
        with temporary_env("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB", "2048"):
            context = bazel_adaptive.BuildContext("/tmp/work")
            throttler = bazel_adaptive.ActionThrottler(context)

            half_available = bazel_adaptive.MemInfo(
                total_kb=8 * 1024 * 1024,
                available_kb=4 * 1024 * 1024,
            )
            throttler.pause_if_needed(half_available)
            # From here on the process scan reports nothing at all.
            bazel_adaptive.build_process_groups = lambda _context: []
            resumed = throttler.resume_all("before test exit")

        self.assertEqual(sent, expected_signals)
        self.assertEqual(resumed, 1)
        self.assertEqual(throttler.paused_keys, set())
        self.assertEqual(throttler.paused_pids, {})
    finally:
        sys.stderr = saved_stderr
        bazel_adaptive.os.kill = saved_kill
        bazel_adaptive.build_process_groups = saved_groups_fn
def test_user_signal_resumes_paused_groups_before_bazel_exits(self) -> None:
    """Check that ``run_once`` resumes paused groups when a user signal arrives.

    A throw-away script stands in for Bazel: it prints one progress frame,
    sleeps, and exits with status 130 on ``SIGINT``. A timer fires after 0.2 s,
    sets ``bazel_adaptive.USER_TERMINATING`` and interrupts the process group
    of ``bazel_adaptive.ACTIVE_PROCESS``. ``ActionThrottler`` is replaced by a
    fake that reports one paused group and records the reason passed to
    ``resume_all``. The test expects exit code 130 and the first recorded
    reason to be "because the wrapper received a user signal".
    """
    old_action_throttler = bazel_adaptive.ActionThrottler
    old_stderr = sys.stderr
    old_user_terminating = bazel_adaptive.USER_TERMINATING
    meminfo_env = "BAZEL_ADAPTIVE_MEMINFO"
    old_meminfo_env = os.environ.get(meminfo_env)
    resume_reasons: list[str | None] = []

    class CapturingStderr(io.StringIO):
        # Text stream that also exposes a bytes ``buffer`` and has no fd.
        # NOTE(review): presumably run_once writes raw bytes via
        # ``sys.stderr.buffer`` and tolerates ``fileno()`` failing -- confirm.
        def __init__(self) -> None:
            super().__init__()
            self.buffer = io.BytesIO()

        def fileno(self) -> int:
            raise OSError("test stderr has no file descriptor")

    class FakeActionThrottler:
        # Stand-in throttler: starts with one paused group until resume_all runs.
        def __init__(self, _context: bazel_adaptive.BuildContext) -> None:
            self.paused = True

        def update(self, _meminfo: bazel_adaptive.MemInfo | None) -> None:
            pass

        def pause_if_needed(self, _meminfo: bazel_adaptive.MemInfo | None) -> None:
            pass

        def resume_if_needed(self, _meminfo: bazel_adaptive.MemInfo | None) -> None:
            pass

        def paused_count(self) -> int:
            return 1 if self.paused else 0

        def paused_labels(self) -> set[str]:
            return {"tests/example.cc"} if self.paused else set()

        def resume_all(self, reason: str | None = None) -> int:
            resume_reasons.append(reason)
            was_paused = self.paused
            self.paused = False
            return 1 if was_paused else 0

    def send_user_signal() -> None:
        # Mimic the wrapper's signal handler: flag termination, then interrupt
        # the child's process group if it is still alive.
        bazel_adaptive.USER_TERMINATING = True
        process = bazel_adaptive.ACTIVE_PROCESS
        if process is not None and process.poll() is None:
            try:
                os.killpg(process.pid, signal.SIGINT)
            except ProcessLookupError:
                # The child exited between poll() and killpg(); nothing to do.
                pass

    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        meminfo = tmpdir / "meminfo"
        script = tmpdir / "bazel"
        write_meminfo(meminfo, total_kb=8 * 1024 * 1024, available_kb=7 * 1024 * 1024)
        script.write_text(
            "#!/usr/bin/env python3\n"
            "import signal, sys, time\n"
            "def on_sigint(signum, frame):\n"
            " sys.exit(130)\n"
            "signal.signal(signal.SIGINT, on_sigint)\n"
            "print('[1 / 2] 1 action running', flush=True)\n"
            "time.sleep(30)\n",
            encoding="utf-8",
        )
        script.chmod(0o755)

        timer = threading.Timer(0.2, send_user_signal)
        try:
            sys.stderr = CapturingStderr()
            os.environ[meminfo_env] = str(meminfo)
            bazel_adaptive.ActionThrottler = FakeActionThrottler
            bazel_adaptive.USER_TERMINATING = False
            parsed = bazel_adaptive.parse_bazel_args(
                ["build", "--jobs=2", "//:target"],
                action_timeout=1,
            )

            timer.start()
            result = bazel_adaptive.run_once(
                str(script),
                parsed,
                jobs=2,
                max_jobs=2,
                context=bazel_adaptive.BuildContext(str(tmpdir)),
            )

            self.assertEqual(result.exit_code, 130)
            self.assertEqual(
                resume_reasons[0],
                "because the wrapper received a user signal",
            )
        finally:
            timer.cancel()
            # cancel() cannot stop a callback that has already started. Wait
            # for it so it cannot set USER_TERMINATING after the restore below
            # and leak into later tests. is_alive() is False for a timer that
            # never started, which avoids join() raising RuntimeError.
            if timer.is_alive():
                timer.join()
            sys.stderr = old_stderr
            bazel_adaptive.ActionThrottler = old_action_throttler
            bazel_adaptive.USER_TERMINATING = old_user_terminating
            if old_meminfo_env is None:
                os.environ.pop(meminfo_env, None)
            else:
                os.environ[meminfo_env] = old_meminfo_env
import time + from pathlib import Path + + log = Path(os.environ["FAKE_BAZEL_LOG"]) + meminfo = Path(os.environ["BAZEL_ADAPTIVE_MEMINFO"]) + mode = os.environ.get("FAKE_BAZEL_MODE", "scale") + count_path = Path(os.environ["FAKE_BAZEL_COUNT"]) + + def append(message): + with log.open("a", encoding="utf-8") as output: + output.write(message + "\\n") + + def count_invocation(): + try: + count = int(count_path.read_text(encoding="utf-8")) + except FileNotFoundError: + count = 0 + count += 1 + count_path.write_text(str(count), encoding="utf-8") + return count + + def write_meminfo_text(text): + tmp = meminfo.with_suffix(".tmp") + tmp.write_text(text, encoding="utf-8") + tmp.replace(meminfo) + + def write_high_mem(): + write_meminfo_text( + "MemTotal: 4194304 kB\\n" + "MemFree: 3145728 kB\\n" + "MemAvailable: 3145728 kB\\n" + ) + + def write_high_mem_later(): + subprocess.Popen( + [ + sys.executable, + "-c", + "import os, time; from pathlib import Path; " + "time.sleep(0.5); " + "Path(os.environ['BAZEL_ADAPTIVE_MEMINFO']).write_text(" + "'MemTotal: 4194304 kB\\\\n'" + "'MemFree: 3145728 kB\\\\n'" + "'MemAvailable: 3145728 kB\\\\n', encoding='utf-8')", + ], + env=os.environ.copy(), + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + + def write_recovered_mem(): + write_meminfo_text( + "MemTotal: 4194304 kB\\n" + "MemFree: 2200000 kB\\n" + "MemAvailable: 2200000 kB\\n" + ) + + def write_low_mem(): + write_meminfo_text( + "MemTotal: 4194304 kB\\n" + "MemFree: 1048576 kB\\n" + "MemAvailable: 1048576 kB\\n" + ) + + def write_mid_large_mem(): + write_meminfo_text( + "MemTotal: 8388608 kB\\n" + "MemFree: 3145728 kB\\n" + "MemAvailable: 3145728 kB\\n" + ) + + def write_high_large_mem(): + write_meminfo_text( + "MemTotal: 8388608 kB\\n" + "MemFree: 6291456 kB\\n" + "MemAvailable: 6291456 kB\\n" + ) + + if len(sys.argv) > 1 and sys.argv[1] == "shutdown": + append("shutdown") + write_high_mem() + sys.exit(0) + + if 
len(sys.argv) == 1: + print("[bazel release fake]") + print("Usage: bazel ...") + sys.exit(0) + + invocation = count_invocation() + append("argv " + " ".join(sys.argv[1:])) + append(f"nice {os.nice(0)}") + print("stdout-marker", flush=True) + print("stderr-marker", file=sys.stderr, flush=True) + + def exit_on_sigint(signum, frame): + append(f"sigint {invocation}") + print( + "Bazel caught interrupt signal; cancelling pending invocation.", + file=sys.stderr, + flush=True, + ) + print("ERROR: build interrupted", file=sys.stderr, flush=True) + if mode == "upscale_failure" and invocation == 2: + print( + "ERROR: /tmp/example: Compiling example.cc failed: " + "error executing CppCompile", + file=sys.stderr, + flush=True, + ) + sys.exit(1) + if mode == "scale" and invocation == 2: + write_high_mem() + sys.exit(0) + if mode == "skip_upscale": + write_recovered_mem() + else: + write_high_mem() + sys.exit(130) + + def print_running_actions(seconds=1, completed=2, running=2, total=100): + actions = [ + "GoCompilePkg //proxylib:go_default_library", + "Rustc //crate:lib", + "ProtoCompile //api:v1_proto", + "GoLink //cmd:proxy", + ] + print( + f"[{completed} / {total}] {running} actions, " + f"{running} running", + flush=True, + ) + for index in range(running): + print( + f" {actions[index % len(actions)]}; {seconds}s remote", + flush=True, + ) + + def print_meaningful_running_actions(seconds=1, running=2): + print_running_actions(seconds, completed=2, running=running) + time.sleep(0.3) + print_running_actions(seconds, completed=3, running=running) + + if mode == "ignore": + signal.signal(signal.SIGINT, signal.SIG_IGN) + else: + signal.signal(signal.SIGINT, exit_on_sigint) + + if mode == "signal": + time.sleep(30) + sys.exit(0) + + if mode == "killed_exit" and invocation == 1: + print( + "ERROR: Compiling tests/example.cc failed: " + "(Killed): clang failed", + file=sys.stderr, + ) + write_high_mem_later() + sys.exit(1) + + if mode == "killed_high" and invocation == 1: + print( 
+ "ERROR: Compiling tests/example.cc failed: " + "(Killed): clang failed", + file=sys.stderr, + ) + write_high_mem() + sys.exit(1) + + if mode == "terminated_exit" and invocation == 1: + print( + "ERROR: Compiling tests/example.cc failed: " + "(Terminated): clang failed", + file=sys.stderr, + ) + write_high_mem_later() + sys.exit(1) + + if mode == "terminated_high_twice" and invocation <= 2: + print( + "ERROR: Compiling tests/example.cc failed: " + "(Terminated): clang failed", + file=sys.stderr, + ) + write_high_mem() + sys.exit(1) + + if mode == "terminated_high_always": + print( + "ERROR: Compiling tests/example.cc failed: " + "(Terminated): clang failed", + file=sys.stderr, + ) + write_high_mem() + sys.exit(1) + + if mode == "server_abrupt" and invocation == 1: + print( + "Server terminated abruptly " + "(error code: 14, error message: 'Socket closed')", + file=sys.stderr, + ) + write_high_mem_later() + sys.exit(37) + + if mode == "internal_interrupted_crash" and invocation == 1: + print( + "FATAL: bazel crashed due to an internal error. " + "Printing stack trace:", + file=sys.stderr, + ) + print( + "Caused by: java.lang.InterruptedException", + file=sys.stderr, + ) + write_high_mem() + sys.exit(37) + + if mode == "internal_interrupted_crash_twice" and invocation <= 2: + print( + "FATAL: bazel crashed due to an internal error. 
" + "Printing stack trace:", + file=sys.stderr, + ) + print( + "Caused by: java.lang.InterruptedException", + file=sys.stderr, + ) + write_high_mem() + sys.exit(37) + + if mode == "bazel_user_interrupt" and invocation == 1: + print( + "ERROR: Compiling tests/example.cc failed: " + "(Terminated): clang failed", + file=sys.stderr, + ) + print( + "Bazel caught interrupt signal; cancelling pending invocation.", + file=sys.stderr, + ) + print("ERROR: build interrupted", file=sys.stderr) + write_high_mem() + sys.exit(8) + + if mode == "server_abrupt_stall" and invocation == 1: + print("[10,535 / 10,567] 13 actions, 12 running", flush=True) + print(" Compiling tests/a.cc; 110s processwrapper-sandbox", flush=True) + print(" Compiling tests/b.cc; 110s processwrapper-sandbox", flush=True) + print( + "Server terminated abruptly " + "(error code: 14, error message: 'Connection reset by peer')", + file=sys.stderr, + ) + write_high_mem() + sys.exit(37) + + if mode == "leak_child" and invocation == 1: + output_base = Path(os.environ["FAKE_BAZEL_OUTPUT_BASE"]) + child_cwd = output_base / "execroot" / "cilium" + child_cwd.mkdir(parents=True, exist_ok=True) + child = subprocess.Popen( + [ + sys.executable, + "-c", + "import os, time; " + "os.chdir(os.environ['FAKE_CHILD_CWD']); " + "time.sleep(60)", + ], + env={**os.environ, "FAKE_CHILD_CWD": str(child_cwd)}, + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + append(f"leak_child_pid {child.pid}") + print( + "Server terminated abruptly " + "(error code: 14, error message: 'Socket closed', " + f"log file: '{output_base}/server/jvm.out')", + file=sys.stderr, + ) + sys.exit(37) + + if mode == "leak_child_then_success" and invocation == 1: + output_base = Path(os.environ["FAKE_BAZEL_OUTPUT_BASE"]) + child_cwd = output_base / "execroot" / "cilium" + child_cwd.mkdir(parents=True, exist_ok=True) + child = subprocess.Popen( + [ + sys.executable, + "-c", + "import os, time; " 
+ "os.chdir(os.environ['FAKE_CHILD_CWD']); " + "time.sleep(60)", + ], + env={**os.environ, "FAKE_CHILD_CWD": str(child_cwd)}, + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + start_new_session=True, + ) + append(f"leak_child_pid {child.pid}") + print("[10,535 / 10,567] 13 actions, 12 running", flush=True) + print(" Compiling tests/a.cc; 67s processwrapper-sandbox", flush=True) + print( + "Server terminated abruptly " + "(error code: 14, error message: 'Connection reset by peer', " + f"log file: '{output_base}/server/jvm.out')", + file=sys.stderr, + ) + sys.exit(37) + + if mode == "silent_wall_clock" and invocation == 1: + print("[1 / 4] 2 actions, 2 running", flush=True) + print(" Compiling a.cc; 0s processwrapper-sandbox", flush=True) + print(" Compiling b.cc; 0s processwrapper-sandbox", flush=True) + time.sleep(30) + sys.exit(1) + + if invocation == 1: + print("[1 / 4] 2 actions, 2 running", flush=True) + print(" Compiling a.cc; 2s processwrapper-sandbox", flush=True) + print(" Compiling b.cc; 2s processwrapper-sandbox", flush=True) + time.sleep(30) + elif invocation == 2 and mode == "scale": + print_meaningful_running_actions() + time.sleep(2) + sys.exit(0) + elif invocation == 2 and mode == "skip_upscale": + write_low_mem() + print_meaningful_running_actions() + time.sleep(2) + sys.exit(0) + elif invocation == 2 and mode == "long_action_skip_upscale": + print_meaningful_running_actions(16) + time.sleep(2) + sys.exit(0) + elif invocation == 2 and mode == "postpone_then_upscale": + print_running_actions(16, completed=2) + time.sleep(0.3) + print_running_actions(16, completed=3) + time.sleep(0.8) + print_running_actions(1, completed=4) + time.sleep(2) + sys.exit(0) + elif invocation == 2 and mode == "postpone_memory_then_upscale": + print_running_actions(16, completed=2) + time.sleep(0.3) + print_running_actions(16, completed=3) + time.sleep(0.8) + write_mid_large_mem() + print_running_actions(1, completed=4) + 
def run_wrapper(
    self,
    tmpdir: Path,
    fake_bazel: Path,
    mode: str = "scale",
    extra_args: list[str] | None = None,
    extra_env: dict[str, str] | None = None,
    initial_available_kb: int = 512 * 1024,
    jobs: int = 4,
    build_timeout: int = 1,
) -> subprocess.CompletedProcess:
    """Run the wrapper script in a subprocess against a fake Bazel executable.

    A meminfo file reporting 4 GiB total is written under ``tmpdir`` and the
    wrapper is started as ``build --jobs=<jobs>`` from ``tmpdir/workspace``
    with short polling, upscale-check and settle intervals.

    Args:
        tmpdir: Scratch directory holding the log, counter, meminfo file,
            output base and workspace.
        fake_bazel: Path exported as ``BAZEL`` for the wrapper to execute.
        mode: Value exported as ``FAKE_BAZEL_MODE``.
        extra_args: Extra command-line arguments appended after ``--jobs``.
        extra_env: Environment overrides applied after the defaults.
        initial_available_kb: ``available_kb`` written to the meminfo file.
        jobs: Value used for the initial ``--jobs`` argument.
        build_timeout: Value exported as ``BAZEL_ADAPTIVE_BUILD_TIMEOUT``.

    Returns:
        The completed process with stdout and stderr captured as text. The
        return code is not checked here.

    Raises:
        subprocess.TimeoutExpired: If the wrapper runs longer than 30 seconds.
    """
    log = tmpdir / "fake.log"
    count = tmpdir / "count"
    meminfo = tmpdir / "meminfo"
    output_base = tmpdir / "output_base"
    workspace = tmpdir / "workspace"
    # exist_ok lets a test call this helper more than once per tmpdir.
    workspace.mkdir(exist_ok=True)
    write_meminfo(meminfo, total_kb=4 * 1024 * 1024, available_kb=initial_available_kb)

    env = os.environ.copy()
    env.update(
        {
            "BAZEL": str(fake_bazel),
            "BAZEL_ADAPTIVE_MEMINFO": str(meminfo),
            "BAZEL_ADAPTIVE_BUILD_TIMEOUT": str(build_timeout),
            "BAZEL_ADAPTIVE_MEMORY_POLL_INTERVAL": "0.05",
            "BAZEL_ADAPTIVE_UPSCALE_CHECK_INTERVAL": "0.5",
            "BAZEL_ADAPTIVE_RESTART_SETTLE_DELAY": "0.05",
            "FAKE_BAZEL_LOG": str(log),
            "FAKE_BAZEL_COUNT": str(count),
            "FAKE_BAZEL_MODE": mode,
            "FAKE_BAZEL_OUTPUT_BASE": str(output_base),
        }
    )
    # Caller overrides go last so they win over the defaults above.
    if extra_env:
        env.update(extra_env)
    args = [sys.executable, str(WRAPPER), "build", f"--jobs={jobs}"]
    if extra_args:
        args.extend(extra_args)
    return subprocess.run(
        args,
        env=env,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=30,
        check=False,
        cwd=workspace,
    )
self.assertIn("interrupting Bazel at a cheap upscale point", result.stderr) + self.assertNotIn("restarting Bazel with", result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn("argv build --jobs=4", log) + self.assertIn("argv build --jobs=2", log) + self.assertIn("argv build --jobs=3", log) + self.assertEqual(log.count("argv build --jobs=4"), 1) + + def test_bazel_child_priority_can_be_lowered(self) -> None: + current_nice = os.nice(0) + if current_nice > 16: + self.skipTest("test process is already too nice to assert an increment") + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper( + tmpdir, + fake, + mode="killed_high", + extra_env={"BAZEL_ADAPTIVE_BAZEL_NICE": "3"}, + ) + + self.assertEqual(result.returncode, 0, result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn(f"nice {current_nice + 3}", log) + + def test_bazel_child_priority_can_be_left_unchanged(self) -> None: + current_nice = os.nice(0) + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper( + tmpdir, + fake, + mode="killed_high", + extra_env={"BAZEL_ADAPTIVE_BAZEL_NICE": "0"}, + ) + + self.assertEqual(result.returncode, 0, result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn(f"nice {current_nice}", log) + + def test_silent_bazel_output_downscales_by_wall_clock(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper(tmpdir, fake, mode="silent_wall_clock") + + self.assertEqual(result.returncode, 0, result.stderr) + self.assertIn("Compiling a.cc; 0s", result.stdout) + self.assertIn("action timeout and low memory detected", result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn("argv build --jobs=4", log) + self.assertIn("argv 
build --jobs=2", log) + + def test_internal_interrupted_crash_retries_once_with_same_jobs(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper(tmpdir, fake, mode="internal_interrupted_crash") + + self.assertEqual(result.returncode, 0, result.stderr) + self.assertIn( + "Bazel crashed internally after java.lang.InterruptedException; " + "retrying with same --jobs=4", + result.stderr, + ) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertEqual(log.count("argv build --jobs=4"), 2) + self.assertNotIn("argv build --jobs=2", log) + + def test_internal_interrupted_crash_retry_is_capped(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper(tmpdir, fake, mode="internal_interrupted_crash_twice") + + self.assertEqual(result.returncode, 37, result.stderr) + self.assertIn( + "Bazel crashed internally after java.lang.InterruptedException again; " + "not retrying", + result.stderr, + ) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertEqual(log.count("argv build --jobs=4"), 2) + self.assertNotIn("argv build --jobs=2", log) + + def test_bazel_user_interrupt_is_not_retried(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper(tmpdir, fake, mode="bazel_user_interrupt") + + self.assertEqual(result.returncode, 130, result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertEqual(log.count("argv build --jobs=4"), 1) + self.assertNotIn("retrying with same", result.stderr) + self.assertNotIn("retrying with fewer", result.stderr) + + def test_upscale_cancelled_if_bazel_reports_failure_while_stopping(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = 
self.run_wrapper(tmpdir, fake, mode="upscale_failure") + + self.assertEqual(result.returncode, 1, result.stderr) + self.assertIn("upscale cancelled because Bazel reported a build failure", result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn("argv build --jobs=4", log) + self.assertIn("argv build --jobs=2", log) + self.assertNotIn("argv build --jobs=3", log) + + def test_upscale_skip_reports_average_memory_reason(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper( + tmpdir, + fake, + mode="skip_upscale", + build_timeout=5, + extra_env={"BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB": "2048"}, + ) + + self.assertEqual(result.returncode, 0, result.stderr) + self.assertRegex( + result.stderr, + r"upscale watch skipped after 1 scheduled attempt and [1-9][0-9]* reevaluations", + ) + self.assertRegex(result.stderr, r"memory skips: [1-9][0-9]*; job-runtime skips: 0") + self.assertIn("memory dipped below low-memory threshold", result.stderr) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn("argv build --jobs=4", log) + self.assertIn("argv build --jobs=2", log) + self.assertEqual(log.count("argv build --jobs=4"), 1) + + def test_upscale_waits_for_young_current_actions(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + tmpdir = Path(tmp) + fake = self.make_fake_bazel(tmpdir) + result = self.run_wrapper(tmpdir, fake, mode="long_action_skip_upscale") + + self.assertEqual(result.returncode, 0, result.stderr) + self.assertRegex(result.stderr, r"oldest current running action is 1[56]s") + self.assertRegex( + result.stderr, + r"memory skips: 0; job-runtime skips: [1-9][0-9]*", + ) + log = (tmpdir / "fake.log").read_text(encoding="utf-8") + self.assertIn("argv build --jobs=4", log) + self.assertIn("argv build --jobs=2", log) + self.assertNotIn("argv build --jobs=3", log) + + def 
# NOTE(review): the definitions below take ``self`` and call unittest assertion
# helpers, so they presumably belong to a TestCase class whose header lies
# outside this chunk; re-indent one level when splicing into that class.


def test_pending_upscale_runs_when_current_actions_become_young(self) -> None:
    # A postponed upscale must still fire: expect the "watch active" notice,
    # the interrupt notice, and fake invocations at --jobs 4, 2 and 3.
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        stub = self.make_fake_bazel(root)
        outcome = self.run_wrapper(root, stub, mode="postpone_then_upscale")

        self.assertEqual(outcome.returncode, 0, outcome.stderr)
        for needle in (
            "upscale watch active: oldest current running action is 16s",
            "interrupting Bazel at a cheap upscale point",
        ):
            self.assertIn(needle, outcome.stderr)
        journal = (root / "fake.log").read_text(encoding="utf-8")
        for needle in (
            "argv build --jobs=4",
            "argv build --jobs=2",
            "argv build --jobs=3",
        ):
            self.assertIn(needle, journal)


def test_pending_upscale_rechecks_memory_before_running(self) -> None:
    # The postponed upscale must log a fresh memory reading before the
    # --jobs=3 restart appears in the fake's log.
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        stub = self.make_fake_bazel(root)
        outcome = self.run_wrapper(root, stub, mode="postpone_memory_then_upscale")

        self.assertEqual(outcome.returncode, 0, outcome.stderr)
        for needle in (
            "upscale watch active: oldest current running action is 16s",
            "upscale: memory latest",
        ):
            self.assertIn(needle, outcome.stderr)
        journal = (root / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=3", journal)


def test_downscale_watch_takes_priority_over_pending_upscale(self) -> None:
    # With a 2048 MiB threshold, a pending upscale must give way to a
    # downscale: --jobs=1 is seen and --jobs=3 never is.
    threshold_override = {"BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB": "2048"}
    with tempfile.TemporaryDirectory() as scratch:
        root = Path(scratch)
        stub = self.make_fake_bazel(root)
        outcome = self.run_wrapper(
            root,
            stub,
            mode="pending_upscale_then_downscale",
            extra_env=threshold_override,
        )

        self.assertEqual(outcome.returncode, 0, outcome.stderr)
        for needle in (
            "upscale watch active",
            "action timeout and low memory detected",
        ):
            self.assertIn(needle, outcome.stderr)
        journal = (root / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=1", journal)
        self.assertNotIn("argv build --jobs=3", journal)
# NOTE(review): the definitions below take ``self`` and call unittest assertion
# helpers, so they presumably belong to a TestCase class whose header lies
# outside this chunk; re-indent one level when splicing into that class.


def test_upscale_requires_meaningful_work_before_restarting(self) -> None:
    # If the completed-action count never advances, no --jobs=3 restart occurs.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="no_meaningful_work")

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("completed action count has not advanced", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=4", log)
        self.assertIn("argv build --jobs=2", log)
        self.assertNotIn("argv build --jobs=3", log)


def test_upscale_lets_winding_down_bazel_finish(self) -> None:
    # A build that is winding down is left alone instead of being restarted.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="winding_down")

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("letting Bazel finish", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=4", log)
        self.assertIn("argv build --jobs=2", log)
        self.assertNotIn("argv build --jobs=3", log)


def test_upscale_skips_when_all_remaining_actions_are_running_in_fake_bazel(self) -> None:
    # With only two actions left, the wrapper must not restart at --jobs=3.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="near_complete")

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("only 2 action(s) remain", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=4", log)
        self.assertIn("argv build --jobs=2", log)
        self.assertNotIn("argv build --jobs=3", log)


def test_user_signal_is_forwarded(self) -> None:
    # SIGINT sent to the wrapper must reach the fake (logged as "sigint 1")
    # and the wrapper must exit with 130.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        workspace = tmpdir / "workspace"
        workspace.mkdir()
        log = tmpdir / "fake.log"
        count = tmpdir / "count"
        meminfo = tmpdir / "meminfo"
        write_meminfo(meminfo, total_kb=4 * 1024 * 1024, available_kb=3 * 1024 * 1024)
        # Drop wrapper/fake knobs inherited from the caller's shell so the
        # scenario depends only on the values set below.
        env = {
            key: value
            for key, value in os.environ.items()
            if not key.startswith(("BAZEL_ADAPTIVE_", "FAKE_BAZEL_"))
        }
        env.update(
            {
                "BAZEL": str(fake),
                "BAZEL_ADAPTIVE_MEMINFO": str(meminfo),
                "BAZEL_ADAPTIVE_BUILD_TIMEOUT": "10",
                "FAKE_BAZEL_LOG": str(log),
                "FAKE_BAZEL_COUNT": str(count),
                "FAKE_BAZEL_MODE": "signal",
            }
        )
        process = subprocess.Popen(
            [sys.executable, str(WRAPPER), "build", "--jobs=1"],
            env=env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            cwd=workspace,
        )
        try:
            # Give the wrapper and the fake time to start before interrupting.
            time.sleep(0.5)
            process.send_signal(signal.SIGINT)
            stdout, stderr = process.communicate(timeout=10)
        finally:
            # On a timeout (or any earlier error) the wrapper is still alive;
            # kill and reap it so a failing test does not leak a process or
            # keep the temporary directory busy.
            if process.poll() is None:
                process.kill()
                process.communicate()

        self.assertEqual(process.returncode, 130, stderr)
        self.assertIn("stdout-marker", stdout)
        self.assertIn("sigint 1", log.read_text(encoding="utf-8"))


def test_cleanup_escalates_to_shutdown(self) -> None:
    # In "ignore" mode the fake's log must record a shutdown request and the
    # follow-up run at --jobs=2.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="ignore")

        self.assertEqual(result.returncode, 0, result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("shutdown", log)
        self.assertIn("argv build --jobs=2", log)


def test_killed_action_exit_retries_with_fewer_jobs(self) -> None:
    # A "(Killed)" action under the default (low) memory figure: retry at 2.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="killed_exit")

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("(Killed): clang failed", result.stderr)
        self.assertIn("memory pressure:", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=4", log)
        self.assertIn("argv build --jobs=2", log)


def test_killed_action_retries_same_jobs_when_memory_average_is_high(self) -> None:
    # With 3 GiB reported available, the retry keeps --jobs=4 and no shutdown
    # is issued.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="killed_high",
            initial_available_kb=3 * 1024 * 1024,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("retrying with same --jobs=4", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertEqual(log.count("argv build --jobs=4"), 2)
        self.assertNotIn("shutdown", log)
        self.assertNotIn("argv build --jobs=2", log)


def test_terminated_action_exit_retries_with_fewer_jobs(self) -> None:
    # A "(Terminated)" action is treated like a killed one: retry at 2.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="terminated_exit")

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("(Terminated): clang failed", result.stderr)
        self.assertIn("killed or terminated action; retrying with fewer jobs", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=4", log)
        self.assertIn("argv build --jobs=2", log)


def test_repeated_same_job_terminated_action_waits_before_retry(self) -> None:
    # Two same-jobs failures in a row: expect the wait/settle/shutdown
    # narration, three runs at --jobs=4, and exactly one shutdown.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="terminated_high_twice",
            initial_available_kb=3 * 1024 * 1024,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        for expected in (
            "retrying with same --jobs=4",
            (
                "waiting up to 0.05s for Bazel build processes to exit "
                "before retrying with same --jobs=4"
            ),
            "settling for 0.05s before retrying with same --jobs=4",
            "asking Bazel server to shut down before retrying",
            "bazel shutdown completed with exit code 0",
        ):
            self.assertIn(expected, result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertEqual(log.count("argv build --jobs=4"), 3)
        self.assertEqual(log.count("shutdown"), 1)
        self.assertNotIn("argv build --jobs=2", log)
# NOTE(review): the test definitions below take ``self`` and call unittest
# assertion helpers, so they presumably belong to a TestCase class whose header
# lies outside this chunk; re-indent them one level when splicing into that
# class. The ``__main__`` guard at the end stays at module level.


def test_repeated_same_job_terminated_action_retry_is_capped(self) -> None:
    # With BAZEL_ADAPTIVE_SAME_JOB_RETRY_LIMIT=2 the wrapper gives up with
    # exit code 1 after three runs at --jobs=4 and one shutdown.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="terminated_high_always",
            extra_env={"BAZEL_ADAPTIVE_SAME_JOB_RETRY_LIMIT": "2"},
            initial_available_kb=3 * 1024 * 1024,
        )

        self.assertEqual(result.returncode, 1, result.stderr)
        self.assertIn(
            "after 2 same-job retry attempt(s); not retrying",
            result.stderr,
        )
        self.assertIn("bazel shutdown completed with exit code 0", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertEqual(log.count("argv build --jobs=4"), 3)
        self.assertEqual(log.count("shutdown"), 1)
        self.assertNotIn("argv build --jobs=2", log)


def test_killed_action_at_one_job_does_not_retry(self) -> None:
    # At --jobs=1 there is nothing to scale down to; the failure is returned.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="killed_exit", jobs=1)

        self.assertEqual(result.returncode, 1, result.stderr)
        self.assertIn("(Killed): clang failed", result.stderr)
        self.assertIn("memory pressure:", result.stderr)
        self.assertNotIn("retrying with fewer jobs", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=1", log)
        self.assertNotIn("argv build --jobs=0", log)


def test_abrupt_server_exit_after_recent_memory_pressure_retries(self) -> None:
    # Abrupt server exit under the default (low) memory figure: retry at 2.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="server_abrupt")

        self.assertEqual(result.returncode, 0, result.stderr)
        for expected in (
            "Server terminated abruptly",
            "memory pressure:",
            "recent memory pressure",
        ):
            self.assertIn(expected, result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=4", log)
        self.assertIn("argv build --jobs=2", log)


def test_abrupt_server_exit_after_visible_stall_retries_without_low_memory(self) -> None:
    # With 3 GiB reported available, a stall followed by a connection reset
    # is retried at the same --jobs=4.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="server_abrupt_stall",
            initial_available_kb=3 * 1024 * 1024,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        for expected in (
            "Connection reset by peer",
            "memory pressure:",
            "recent visible action stall over 1s",
            "retrying with same --jobs=4",
        ):
            self.assertIn(expected, result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertEqual(log.count("argv build --jobs=4"), 2)
        self.assertNotIn("argv build --jobs=2", log)


def test_abrupt_server_exit_without_memory_pressure_retries(self) -> None:
    # Same abrupt exit but with 3 GiB available: retry keeps --jobs=4.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="server_abrupt",
            initial_available_kb=3 * 1024 * 1024,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        for expected in (
            "Server terminated abruptly",
            "memory pressure:",
            "retrying with same --jobs=4",
        ):
            self.assertIn(expected, result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertEqual(log.count("argv build --jobs=4"), 2)
        self.assertNotIn("argv build --jobs=2", log)


def test_abrupt_server_exit_at_one_job_does_not_retry(self) -> None:
    # At --jobs=1 the fake's exit code 37 is returned without a retry.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(tmpdir, fake, mode="server_abrupt", jobs=1)

        self.assertEqual(result.returncode, 37, result.stderr)
        self.assertIn("Server terminated abruptly", result.stderr)
        self.assertNotIn("retrying with fewer jobs", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertIn("argv build --jobs=1", log)
        self.assertNotIn("argv build --jobs=0", log)


def test_dangling_build_process_is_reported_and_killed(self) -> None:
    # The child the fake leaves behind must be reported and gone afterwards.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="leak_child",
            initial_available_kb=3 * 1024 * 1024,
            jobs=1,
        )

        self.assertEqual(result.returncode, 37, result.stderr)
        self.assertIn("dangling Bazel build process", result.stderr)
        self.assertNotIn("dangling build process: pid=", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        # First "leak_child_pid <pid>" line written by the fake, if any.
        child_pid = next(
            (
                int(entry.split()[1])
                for entry in log.splitlines()
                if entry.startswith("leak_child_pid ")
            ),
            None,
        )
        self.assertIsNotNone(child_pid)
        self.assertFalse(process_exists(child_pid))


def test_dangling_build_process_after_failure_triggers_retry(self) -> None:
    # A leaked child after a failed run is cleaned up and the build retried
    # at the same --jobs=4.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        result = self.run_wrapper(
            tmpdir,
            fake,
            mode="leak_child_then_success",
            initial_available_kb=3 * 1024 * 1024,
            build_timeout=100,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("dangling Bazel build process", result.stderr)
        self.assertNotIn("dangling build process: pid=", result.stderr)
        self.assertIn("retrying with same --jobs=4", result.stderr)
        log = (tmpdir / "fake.log").read_text(encoding="utf-8")
        self.assertEqual(log.count("argv build --jobs=4"), 2)
        self.assertNotIn("argv build --jobs=2", log)
        # First "leak_child_pid <pid>" line written by the fake, if any.
        child_pid = next(
            (
                int(entry.split()[1])
                for entry in log.splitlines()
                if entry.startswith("leak_child_pid ")
            ),
            None,
        )
        self.assertIsNotNone(child_pid)
        self.assertFalse(process_exists(child_pid))


def test_no_args_prints_bazel_usage_then_wrapper_hint(self) -> None:
    # With no arguments, the fake's usage goes to stdout and the wrapper's
    # environment-variable hints go to stderr.
    with tempfile.TemporaryDirectory() as tmp:
        tmpdir = Path(tmp)
        fake = self.make_fake_bazel(tmpdir)
        log = tmpdir / "fake.log"
        count = tmpdir / "count"
        meminfo = tmpdir / "meminfo"
        write_meminfo(meminfo, total_kb=4 * 1024 * 1024, available_kb=3 * 1024 * 1024)
        # Drop wrapper/fake knobs inherited from the caller's shell so the
        # run depends only on the values set below.
        env = {
            key: value
            for key, value in os.environ.items()
            if not key.startswith(("BAZEL_ADAPTIVE_", "FAKE_BAZEL_"))
        }
        env.update(
            {
                "BAZEL": str(fake),
                "BAZEL_ADAPTIVE_MEMINFO": str(meminfo),
                "FAKE_BAZEL_LOG": str(log),
                "FAKE_BAZEL_COUNT": str(count),
                "FAKE_BAZEL_MODE": "scale",
            }
        )

        result = subprocess.run(
            [sys.executable, str(WRAPPER)],
            env=env,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=10,
            check=False,
        )

        self.assertEqual(result.returncode, 0, result.stderr)
        self.assertIn("Usage: bazel ...", result.stdout)
        self.assertIn("BAZEL_ADAPTIVE_BUILD_TIMEOUT=", result.stderr)
        self.assertIn("BAZEL_ADAPTIVE_LOW_MEMORY_THRESHOLD_MB=", result.stderr)


if __name__ == "__main__":
    unittest.main(verbosity=2)