Skip to content

Commit 67109f5

Browse files
move wait for dom to cdp from js
1 parent 511be19 commit 67109f5

5 files changed

Lines changed: 386 additions & 80 deletions

File tree

stagehand/domScripts.js

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -291,21 +291,6 @@
291291
};
292292

293293
// lib/dom/utils.ts
294-
async function waitForDomSettle() {
295-
return new Promise((resolve) => {
296-
const createTimeout = () => {
297-
return setTimeout(() => {
298-
resolve();
299-
}, 2e3);
300-
};
301-
let timeout = createTimeout();
302-
const observer = new MutationObserver(() => {
303-
clearTimeout(timeout);
304-
timeout = createTimeout();
305-
});
306-
observer.observe(window.document.body, { childList: true, subtree: true });
307-
});
308-
}
309294
function calculateViewportHeight() {
310295
return Math.ceil(window.innerHeight * 0.75);
311296
}
@@ -1046,7 +1031,6 @@
10461031
}
10471032
return boundingBoxes;
10481033
}
1049-
window.waitForDomSettle = waitForDomSettle;
10501034
window.processDom = processDom;
10511035
window.processAllOfDom = processAllOfDom;
10521036
window.storeDOM = storeDOM;

stagehand/page.py

Lines changed: 198 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -398,8 +398,6 @@ async def send_cdp(self, method: str, params: Optional[dict] = None) -> dict:
398398
self._stagehand.logger.debug(
399399
f"CDP command '{method}' failed: {e}. Attempting to reconnect..."
400400
)
401-
# Try to reconnect
402-
await self._ensure_cdp_session()
403401
# Handle specific errors if needed (e.g., session closed)
404402
if "Target closed" in str(e) or "Session closed" in str(e):
405403
# Attempt to reset the client if the session closed unexpectedly
@@ -441,71 +439,213 @@ async def detach_cdp_client(self):
441439
async def _wait_for_settled_dom(self, timeout_ms: int = None):
442440
"""
443441
Wait for the DOM to settle (stop changing) before proceeding.
442+
443+
**Definition of "settled"**
444+
• No in-flight network requests (except WebSocket / Server-Sent-Events).
445+
• That idle state lasts for at least **500 ms** (the "quiet-window").
446+
447+
**How it works**
448+
1. Subscribes to CDP Network and Page events for the main target and all
449+
out-of-process iframes (via `Target.setAutoAttach { flatten:true }`).
450+
2. Every time `Network.requestWillBeSent` fires, the request ID is added
451+
to an **`inflight`** set.
452+
3. When the request finishes—`loadingFinished`, `loadingFailed`,
453+
`requestServedFromCache`, or a *data:* response—the request ID is
454+
removed.
455+
4. *Document* requests are also mapped **frameId → requestId**; when
456+
`Page.frameStoppedLoading` fires the corresponding Document request is
457+
removed immediately (covers iframes whose network events never close).
458+
5. A **stalled-request sweep timer** runs every 500 ms. If any tracked
459+
request has been open for ≥ 2 s it is forcibly removed; this prevents
460+
ad/analytics iframes from blocking the wait forever.
461+
6. When `inflight` becomes empty the helper starts a 500 ms timer.
462+
If no new request appears before the timer fires, the internal
463+
`asyncio.Event` is set → **DOM is considered settled**.
464+
7. A global guard (`timeout_ms` or `stagehand.dom_settle_timeout_ms`,
465+
default ≈ 30 s) ensures we always resolve; if it fires we log how many
466+
requests were still outstanding.
444467
445468
Args:
446469
timeout_ms (int, optional): Maximum time to wait in milliseconds.
447470
If None, uses the stagehand client's dom_settle_timeout_ms.
448471
"""
472+
import asyncio
473+
import time
474+
475+
timeout = timeout_ms or getattr(self._stagehand, "dom_settle_timeout_ms", 30000)
476+
client = await self.get_cdp_client()
477+
478+
# Check if document exists
449479
try:
450-
timeout = timeout_ms or getattr(
451-
self._stagehand, "dom_settle_timeout_ms", 30000
452-
)
453-
import asyncio
454-
455-
# Wait for domcontentloaded first
480+
await self._page.title()
481+
except Exception:
456482
await self._page.wait_for_load_state("domcontentloaded")
457-
458-
# Create a timeout promise that resolves after the specified time
459-
timeout_task = asyncio.create_task(asyncio.sleep(timeout / 1000))
460-
461-
# Try to check if the DOM has settled
462-
try:
463-
# Create a task for evaluating the DOM settling
464-
eval_task = asyncio.create_task(
465-
self._page.evaluate(
466-
"""
467-
() => {
468-
return new Promise((resolve) => {
469-
if (typeof window.waitForDomSettle === 'function') {
470-
window.waitForDomSettle().then(resolve);
471-
} else {
472-
console.warn('waitForDomSettle is not defined, considering DOM as settled');
473-
resolve();
474-
}
475-
});
476-
}
477-
"""
478-
)
479-
)
480-
481-
# Create tasks for other ways to determine page readiness
482-
dom_task = asyncio.create_task(
483-
self._page.wait_for_load_state("domcontentloaded")
484-
)
485-
body_task = asyncio.create_task(self._page.wait_for_selector("body"))
486-
487-
# Wait for the first task to complete
488-
done, pending = await asyncio.wait(
489-
[eval_task, dom_task, body_task, timeout_task],
490-
return_when=asyncio.FIRST_COMPLETED,
491-
)
492-
493-
# Cancel any pending tasks
494-
for task in pending:
495-
task.cancel()
496-
497-
# If the timeout was hit, log a warning
498-
if timeout_task in done:
483+
484+
# Enable CDP domains
485+
await client.send("Network.enable")
486+
await client.send("Page.enable")
487+
await client.send("Target.setAutoAttach", {
488+
"autoAttach": True,
489+
"waitForDebuggerOnStart": False,
490+
"flatten": True
491+
})
492+
493+
# Set up tracking structures
494+
inflight = set() # Set of request IDs
495+
meta = {} # Dict of request ID -> {"url": str, "start": float}
496+
doc_by_frame = {} # Dict of frame ID -> request ID
497+
498+
# Event tracking
499+
quiet_timer = None
500+
stalled_request_sweep_task = None
501+
loop = asyncio.get_event_loop()
502+
done_event = asyncio.Event()
503+
504+
def clear_quiet():
505+
nonlocal quiet_timer
506+
if quiet_timer:
507+
quiet_timer.cancel()
508+
quiet_timer = None
509+
510+
def resolve_done():
511+
"""Cleanup and mark as done"""
512+
clear_quiet()
513+
if stalled_request_sweep_task and not stalled_request_sweep_task.done():
514+
stalled_request_sweep_task.cancel()
515+
done_event.set()
516+
517+
def maybe_quiet():
518+
"""Start quiet timer if no requests are in flight"""
519+
nonlocal quiet_timer
520+
if len(inflight) == 0 and not quiet_timer:
521+
quiet_timer = loop.call_later(0.5, resolve_done)
522+
523+
def finish_req(request_id: str):
524+
"""Mark a request as finished"""
525+
if request_id not in inflight:
526+
return
527+
inflight.remove(request_id)
528+
meta.pop(request_id, None)
529+
# Remove from frame mapping
530+
for fid, rid in list(doc_by_frame.items()):
531+
if rid == request_id:
532+
doc_by_frame.pop(fid)
533+
clear_quiet()
534+
maybe_quiet()
535+
536+
# Event handlers
537+
def on_request(params):
538+
"""Handle Network.requestWillBeSent"""
539+
if params.get("type") in ["WebSocket", "EventSource"]:
540+
return
541+
542+
request_id = params["requestId"]
543+
inflight.add(request_id)
544+
meta[request_id] = {
545+
"url": params["request"]["url"],
546+
"start": time.time()
547+
}
548+
549+
if params.get("type") == "Document" and params.get("frameId"):
550+
doc_by_frame[params["frameId"]] = request_id
551+
552+
clear_quiet()
553+
554+
def on_finish(params):
555+
"""Handle Network.loadingFinished"""
556+
finish_req(params["requestId"])
557+
558+
def on_failed(params):
559+
"""Handle Network.loadingFailed"""
560+
finish_req(params["requestId"])
561+
562+
def on_cached(params):
563+
"""Handle Network.requestServedFromCache"""
564+
finish_req(params["requestId"])
565+
566+
def on_data_url(params):
567+
"""Handle Network.responseReceived for data: URLs"""
568+
if params.get("response", {}).get("url", "").startswith("data:"):
569+
finish_req(params["requestId"])
570+
571+
def on_frame_stop(params):
572+
"""Handle Page.frameStoppedLoading"""
573+
frame_id = params["frameId"]
574+
if frame_id in doc_by_frame:
575+
finish_req(doc_by_frame[frame_id])
576+
577+
# Register event handlers
578+
client.on("Network.requestWillBeSent", on_request)
579+
client.on("Network.loadingFinished", on_finish)
580+
client.on("Network.loadingFailed", on_failed)
581+
client.on("Network.requestServedFromCache", on_cached)
582+
client.on("Network.responseReceived", on_data_url)
583+
client.on("Page.frameStoppedLoading", on_frame_stop)
584+
585+
async def sweep_stalled_requests():
586+
"""Remove stalled document requests after 2 seconds"""
587+
while not done_event.is_set():
588+
await asyncio.sleep(0.5)
589+
now = time.time()
590+
for request_id, request_meta in list(meta.items()):
591+
if now - request_meta["start"] > 2.0:
592+
inflight.discard(request_id)
593+
meta.pop(request_id, None)
594+
self._stagehand.logger.debug(
595+
"⏳ forcing completion of stalled iframe document",
596+
extra={
597+
"url": request_meta["url"][:120]
598+
}
599+
)
600+
maybe_quiet()
601+
602+
# Start stalled request sweeper
603+
stalled_request_sweep_task = asyncio.create_task(sweep_stalled_requests())
604+
605+
# Set up timeout guard
606+
async def timeout_guard():
607+
await asyncio.sleep(timeout / 1000)
608+
if not done_event.is_set():
609+
if len(inflight) > 0:
499610
self._stagehand.logger.debug(
500-
"DOM settle timeout exceeded, continuing anyway",
501-
extra={"timeout_ms": timeout},
611+
"⚠️ DOM-settle timeout reached – network requests still pending",
612+
extra={
613+
"count": len(inflight)
614+
}
502615
)
503-
504-
except Exception as e:
505-
self._stagehand.logger.debug(f"Error waiting for DOM to settle: {e}")
506-
507-
except Exception as e:
508-
self._stagehand.logger.error(f"Error in _wait_for_settled_dom: {e}")
616+
resolve_done()
617+
618+
timeout_task = asyncio.create_task(timeout_guard())
619+
620+
# Initial check
621+
maybe_quiet()
622+
623+
try:
624+
# Wait for completion
625+
await done_event.wait()
626+
finally:
627+
# Cleanup
628+
client.remove_listener("Network.requestWillBeSent", on_request)
629+
client.remove_listener("Network.loadingFinished", on_finish)
630+
client.remove_listener("Network.loadingFailed", on_failed)
631+
client.remove_listener("Network.requestServedFromCache", on_cached)
632+
client.remove_listener("Network.responseReceived", on_data_url)
633+
client.remove_listener("Page.frameStoppedLoading", on_frame_stop)
634+
635+
if quiet_timer:
636+
quiet_timer.cancel()
637+
if stalled_request_sweep_task and not stalled_request_sweep_task.done():
638+
stalled_request_sweep_task.cancel()
639+
try:
640+
await stalled_request_sweep_task
641+
except asyncio.CancelledError:
642+
pass
643+
if timeout_task and not timeout_task.done():
644+
timeout_task.cancel()
645+
try:
646+
await timeout_task
647+
except asyncio.CancelledError:
648+
pass
509649

510650
# Forward other Page methods to underlying Playwright page
511651
def __getattr__(self, name):

tests/conftest.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -395,10 +395,6 @@ def mock_dom_scripts():
395395
return ['//body', '//div[@class="content"]'];
396396
};
397397
398-
window.waitForDomSettle = function() {
399-
return Promise.resolve();
400-
};
401-
402398
window.getElementInfo = function(selector) {
403399
return {
404400
selector: selector,

tests/mocks/mock_browser.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,6 @@ async def evaluate(self, script: str, *args):
7272
# Return different results based on script content
7373
if "getScrollableElementXpaths" in script:
7474
return ["//body", "//div[@class='content']"]
75-
elif "waitForDomSettle" in script:
76-
return True
7775
elif "getElementInfo" in script:
7876
return {
7977
"selector": args[0] if args else "#test",

0 commit comments

Comments
 (0)