Skip to content

Commit 5700859

Browse files
authored
perf(autoreload): skip stdlib/site-packages on per-cell check (#9629)
**This pull request was authored by a coding agent.**

Fixes #9628.

With `auto_reload` set to `lazy` or `autorun`, every cell run was calling `ModuleReloader.check(sys.modules, reload=True)`, which iterates *all* of `sys.modules` and does `os.stat` on each entry. With ~1000 modules in scope (typical), that adds 16–80ms per cell — compounded across the dozen cells re-running on a UI interaction, it becomes a >1s lag.

This change adds an opt-in `skip_non_user_modules=True` flag on `ModuleReloader.check`. When set, stdlib and site-packages module names are recorded in a persistent skip cache (classified by `sysconfig` prefixes) and short-circuited on subsequent calls. `AutoreloadManager.cell_scope` (the hot per-cell path) opts in.

The background `ModuleWatcher` keeps the default behavior and continues to scan every module on its 1s loop, so edits inside an installed package are still detected — just at watcher latency rather than cell-entry latency.

Editable installs (`pip install -e .`, `uv add --editable`) have `__file__` outside site-packages, so they are correctly classified as user code and reload with no latency change.

### Benchmark

Driving `ModuleReloader.check()` directly, 200 iterations post-warmup. Issue-shaped workload: ~2.5k modules (heavy stdlib + numpy/pandas/etc.) + 5 user files in a tmp dir.

| path | median | p95 |
|------|-------:|----:|
| before | 4.88 ms | 6.15 ms |
| after | 0.91 ms | 1.01 ms |

**~4 ms saved per cell run, 5.4× median speedup.**

Scale curve (median µs, varying user-module count):

| user mods | sys.modules | before | after | speedup |
|----------:|------------:|-------:|------:|--------:|
| 0 | 2514 | 5037 | 873 | 5.8× |
| 5 | 2519 | 5245 | 802 | 6.5× |
| 25 | 2539 | 6082 | 1693 | 3.6× |
| 100 | 2614 | 8342 | 4421 | 1.9× |
| 500 | 3014 | 12489 | 8398 | 1.5× |

The win narrows as the amount of user code grows, by design: the optimization only filters out non-user code.
1 parent cd40f73 commit 5700859

3 files changed

Lines changed: 251 additions & 2 deletions

File tree

marimo/_runtime/reload/autoreload.py

Lines changed: 84 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99

1010
from __future__ import annotations
1111

12+
import functools
1213
import gc
1314
import io
1415
import modulefinder
1516
import os
1617
import sys
18+
import sysconfig
1719
import threading
1820
import traceback
1921
import types
@@ -73,6 +75,32 @@ def safe_hasattr(obj: M, attr: str) -> bool:
7375
return False
7476

7577

78+
@functools.cache
def _non_user_module_roots() -> tuple[str, ...]:
    """Return the directory prefixes under which non-user modules live.

    The prefixes are the stdlib and site-packages locations reported by
    `sysconfig`, plus the directory that holds the running interpreter's
    `os` module. Each prefix goes through `realpath` and `normcase` and is
    terminated with a path separator, so a plain `str.startswith` against a
    normalized path only matches on whole directory boundaries (e.g.
    `/usr/lib/python3.13/` does not match `/usr/lib/python3.13-mine/`).

    The result is computed on the first call and cached afterwards.
    """
    candidates = {
        location
        for location in map(
            sysconfig.get_path, ("stdlib", "platstdlib", "purelib", "platlib")
        )
        if location
    }
    # sysconfig's stdlib entry can be missing, or can differ from where the
    # stdlib is actually loaded from at runtime, so also use `os.__file__`.
    candidates.add(os.path.dirname(os.__file__))

    def _as_prefix(directory: str) -> str:
        # Normalize, then guarantee a trailing separator.
        resolved = os.path.normcase(os.path.realpath(directory))
        return resolved if resolved.endswith(os.sep) else resolved + os.sep

    # A set collapses locations that normalize to the same directory.
    return tuple({_as_prefix(directory) for directory in candidates})
102+
103+
76104
def modules_imported_by_cell(
77105
cell: CellImpl, sys_modules: dict[str, types.ModuleType]
78106
) -> set[str]:
@@ -160,10 +188,29 @@ def __init__(self) -> None:
160188
# for thread-safety
161189
self.lock = threading.Lock()
162190
self._module_dependency_finder = ModuleDependencyFinder()
191+
# modname -> cached `__file__` for modules classified as non-user.
192+
# Populated by every `check()` call (memoizing `_is_user_module`);
193+
# consumed only when `skip_non_user_modules=True`. Stored value is
194+
# used to invalidate the entry if `sys.modules[modname]` is later
195+
# rebound to a module with a different `__file__` (e.g. a user
196+
# module shadowing an installed package).
197+
self._skip: dict[str, str | None] = {}
163198

164199
# Timestamp existing modules
165200
self.check(modules=sys.modules, reload=False)
166201

202+
def _is_user_module(self, module: types.ModuleType) -> bool:
    """Report whether `module` is loaded from outside stdlib/site-packages.

    A module with no (or an empty) `__file__` is reported as non-user.
    Editable installs (e.g. `pip install -e .`) point `__file__` at the
    source tree, so they are classified as user code.
    """
    source_file = safe_getattr(module, "__file__", None)
    if source_file:
        resolved = os.path.normcase(os.path.realpath(source_file))
        # `str.startswith` takes the whole tuple of root prefixes at once.
        return not resolved.startswith(_non_user_module_roots())
    return False
213+
167214
def filename_and_mtime(
168215
self, module: types.ModuleType
169216
) -> ModuleMTime | None:
@@ -206,12 +253,24 @@ def cell_uses_stale_modules(self, cell: CellImpl) -> bool:
206253
)
207254

208255
def check(
209-
self, modules: dict[str, types.ModuleType], reload: bool
256+
self,
257+
modules: dict[str, types.ModuleType],
258+
reload: bool,
259+
*,
260+
skip_non_user_modules: bool = False,
210261
) -> set[types.ModuleType]:
211262
"""Check timestamps of modules, optionally reload them.
212263
213264
Also patches existing objects with hot-reloaded ones.
214265
266+
When `skip_non_user_modules` is True, modules whose `__file__` is
267+
under stdlib/site-packages are skipped — intended for the per-cell
268+
hot path. The background `ModuleWatcher` leaves it False so it still
269+
stats every module on its 1s loop, which is what keeps edits inside
270+
installed packages detectable. Both paths populate the same skip
271+
cache, so the hot path benefits from classifications the watcher
272+
has already done.
273+
215274
Returns a set of modules that were found to have been modified.
216275
"""
217276

@@ -228,6 +287,30 @@ def check(
228287
m = modules.get(modname, None)
229288
if m is None:
230289
continue
290+
# Classify (memoized via `_skip`). The hot path uses the
291+
# cache to short-circuit; the watcher always falls through
292+
# to the stat check so that edits inside installed packages
293+
# are still picked up. The cached entry stores `__file__`,
294+
# so a module rebound to a new location gets reclassified.
295+
current_file = safe_getattr(m, "__file__", None)
296+
if modname in self._skip:
297+
if self._skip[modname] == current_file:
298+
is_non_user = True
299+
else:
300+
# Rebound to a different file — drop all cached
301+
# state for this name so the new module starts
302+
# from a clean mtime baseline.
303+
del self._skip[modname]
304+
self.modules_mtimes.pop(modname, None)
305+
self.stale_modules.discard(modname)
306+
is_non_user = False
307+
else:
308+
is_non_user = False
309+
if not is_non_user and not self._is_user_module(m):
310+
self._skip[modname] = current_file
311+
is_non_user = True
312+
if is_non_user and skip_non_user_modules:
313+
continue
231314

232315
module_mtime = self.filename_and_mtime(m)
233316
if module_mtime is None:

marimo/_runtime/reload/manager.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,18 @@ def cell_scope(self) -> Iterator[None]:
8585
yield
8686
return
8787
snapshot = set(sys.modules)
88-
self._reloader.check(modules=sys.modules, reload=True)
88+
# Entry: skip stdlib/site-packages so cells don't pay for stat-ing
89+
# them. This is the perf-critical call.
90+
self._reloader.check(
91+
modules=sys.modules, reload=True, skip_non_user_modules=True
92+
)
8993
try:
9094
yield
9195
finally:
96+
# Exit: record mtimes for modules the cell just imported. Don't
97+
# skip here — `new_modules` is small (typically 0-3) and we need
98+
# an mtime baseline for newly-imported installed packages so the
99+
# next edit isn't silently treated as the initial state.
92100
new_modules = set(sys.modules) - snapshot
93101
self._reloader.check(
94102
modules={m: sys.modules[m] for m in new_modules},

tests/_runtime/reload/test_autoreload.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import gc
44
import importlib
5+
import os
56
import pathlib
67
import sys
78
import textwrap
@@ -425,6 +426,163 @@ def test_check_reload_clears_stale_modules(
425426
assert len(reloader.stale_modules) == 0
426427

427428

429+
class TestSkipCache:
    """Tests for the non-user-module skip cache on `ModuleReloader`.

    `ModuleReloader.__init__` itself runs a watcher-style
    `check(sys.modules, reload=False)`, which already fills `_skip`. Tests
    that want to prove a *particular* `check()` call populates the cache
    therefore empty `_skip` first; otherwise the assertion would pass
    regardless of what that call did.
    """

    def test_is_user_module_stdlib(self):
        reloader = ModuleReloader()
        assert reloader._is_user_module(sys.modules["os"]) is False
        assert reloader._is_user_module(sys.modules["pathlib"]) is False

    def test_is_user_module_builtin_has_no_file(self):
        reloader = ModuleReloader()
        assert reloader._is_user_module(sys.modules["sys"]) is False
        assert reloader._is_user_module(sys.modules["builtins"]) is False

    def test_is_user_module_user_code(
        self, tmp_path: pathlib.Path, py_modname: str
    ):
        sys.path.append(str(tmp_path))
        py_file = tmp_path / pathlib.Path(py_modname + ".py")
        py_file.write_text("x = 1")
        mod = importlib.import_module(py_modname)
        reloader = ModuleReloader()
        assert reloader._is_user_module(mod) is True

    def test_both_paths_populate_skip(self):
        # The cache is shared memoization for the classification step;
        # whichever path sees a module first records the verdict.
        #
        # The constructor has already classified everything, so clear the
        # cache before each call to make sure the call under test is the
        # one that records the verdict.
        reloader = ModuleReloader()
        reloader._skip.clear()
        reloader.check(sys.modules, reload=False)
        assert "os" in reloader._skip

        reloader2 = ModuleReloader()
        reloader2._skip.clear()
        reloader2.check(sys.modules, reload=False, skip_non_user_modules=True)
        assert "os" in reloader2._skip

    def test_user_module_not_skipped_on_hot_path(
        self, tmp_path: pathlib.Path, py_modname: str
    ):
        sys.path.append(str(tmp_path))
        py_file = tmp_path / pathlib.Path(py_modname + ".py")
        py_file.write_text("x = 1")
        importlib.import_module(py_modname)
        reloader = ModuleReloader()
        reloader.check(sys.modules, reload=False, skip_non_user_modules=True)
        assert py_modname not in reloader._skip

    def test_skipped_modules_are_not_restated(self, monkeypatch):
        reloader = ModuleReloader()
        reloader.check(sys.modules, reload=False, skip_non_user_modules=True)
        assert "os" in reloader._skip

        # Record every module that reaches the stat step on the next call.
        calls: list[str] = []
        orig = reloader.filename_and_mtime

        def spy(module):
            calls.append(getattr(module, "__name__", "?"))
            return orig(module)

        monkeypatch.setattr(reloader, "filename_and_mtime", spy)
        reloader.check(sys.modules, reload=False, skip_non_user_modules=True)
        assert "os" not in calls
        assert "pathlib" not in calls

    def test_watcher_path_still_sees_installed_packages(
        self, tmp_path: pathlib.Path, py_modname: str, monkeypatch
    ):
        # Regression guard: even after the hot path has classified a module
        # as non-user (and cached it in `_skip`), the watcher's
        # `skip_non_user_modules=False` call must still stat it and detect
        # edits. Without this, `auto_reload` users editing files inside an
        # installed package would silently stop getting hot reloads.
        import marimo._runtime.reload.autoreload as autoreload_mod

        sys.path.append(str(tmp_path))
        py_file = tmp_path / pathlib.Path(py_modname + ".py")
        py_file.write_text("x = 1")
        mod = importlib.import_module(py_modname)

        # Pretend `tmp_path` is an install root so the module is classified
        # as non-user code.
        real_roots = autoreload_mod._non_user_module_roots()
        tmp_root = os.path.normcase(os.path.realpath(str(tmp_path))) + os.sep
        monkeypatch.setattr(
            autoreload_mod,
            "_non_user_module_roots",
            lambda: (tmp_root,) + real_roots,
        )

        reloader = ModuleReloader()
        assert reloader._is_user_module(mod) is False

        # Hot path classifies and caches. The constructor's own `check()`
        # has already cached the verdict, so drop it first to make the hot
        # path do the classification. The mtime baseline recorded by the
        # constructor is left untouched.
        reloader._skip.clear()
        reloader.check(sys.modules, reload=False, skip_non_user_modules=True)
        assert py_modname in reloader._skip

        # Watcher path falls through to the stat check despite the cache.
        update_file(py_file, "x = 2")
        assert any(m is mod for m in reloader.check(sys.modules, reload=False))

    def test_skip_cache_invalidates_when_module_rebound(
        self, tmp_path: pathlib.Path, py_modname: str
    ):
        # If `sys.modules[modname]` is rebound to a module with a different
        # `__file__` (e.g. a user file shadows an installed package), the
        # cached non-user verdict must not stick — and any cached mtime
        # from the old module must be cleared too, otherwise an older user
        # file would be silently treated as unchanged.
        sys.path.append(str(tmp_path))
        user_file = tmp_path / pathlib.Path(py_modname + ".py")
        user_file.write_text("x = 1")
        user_mod = importlib.import_module(py_modname)

        # Plant a fake "installed" version under the same name first.
        fake_installed = types.ModuleType(py_modname)
        fake_installed.__file__ = os.path.join(
            os.path.dirname(os.__file__), py_modname + ".py"
        )
        sys.modules[py_modname] = fake_installed

        reloader = ModuleReloader()
        # Watcher-style call: classifies as non-user AND records a
        # (synthetic) far-future mtime so we can detect stale-cache leakage.
        reloader.check(sys.modules, reload=False)
        assert py_modname in reloader._skip
        reloader.modules_mtimes[py_modname] = 1e12

        # Rebind to the real user module.
        sys.modules[py_modname] = user_mod
        reloader.check(sys.modules, reload=False, skip_non_user_modules=True)
        assert py_modname not in reloader._skip
        # Stale mtime is cleared so the next edit isn't masked by it.
        assert reloader.modules_mtimes.get(py_modname, 0) < 1e12

    def test_user_module_reload_still_works(
        self, tmp_path: pathlib.Path, py_modname: str
    ):
        sys.path.append(str(tmp_path))
        py_file = tmp_path / pathlib.Path(py_modname + ".py")
        py_file.write_text(
            textwrap.dedent(
                """
                def foo():
                    return 1
                """
            )
        )
        mod = importlib.import_module(py_modname)
        reloader = ModuleReloader()
        reloader.check(sys.modules, reload=False)
        assert mod.foo() == 1

        update_file(
            py_file,
            """
            def foo():
                return 2
            """,
        )
        reloader.check(sys.modules, reload=True)
        assert mod.foo() == 2
584+
585+
428586
class TestUpdateFunctions:
429587
"""Tests for update_* functions"""
430588

0 commit comments

Comments
 (0)