From c36dbc2838cb67ce04bc5a6da8d44f5ee6869ce9 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:01:35 +0700 Subject: [PATCH 1/9] chore: add monitor mode design spec for issue #2680 Co-Authored-By: Claude Opus 4.6 --- .../specs/2026-04-04-monitor-mode-design.md | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-04-monitor-mode-design.md diff --git a/docs/superpowers/specs/2026-04-04-monitor-mode-design.md b/docs/superpowers/specs/2026-04-04-monitor-mode-design.md new file mode 100644 index 000000000000..d90c162df1ca --- /dev/null +++ b/docs/superpowers/specs/2026-04-04-monitor-mode-design.md @@ -0,0 +1,155 @@ +# Monitor Mode for BasicCrawler — Design Spec + +**Date:** 2026-04-04 +**Issue:** [#2680](https://github.com/apify/crawlee/issues/2680) +**Related PR:** [#2692](https://github.com/apify/crawlee/pull/2692) (reference only — implementation is fresh) + +--- + +## Problem + +When running a crawler locally, there is no real-time progress overview. Developers have to read scattered log lines to understand how fast the crawl is going, how much is left, and what the system load looks like. The `puppeteer-cluster` library had a monitor feature that was widely used and is missed after migrating to Crawlee. + +--- + +## Goal + +Add an opt-in `monitor` option to `BasicCrawler` that prints a compact, real-time status block to the terminal while crawling. It must not interfere with the existing logger output. + +--- + +## Architecture + +### New file: `packages/core/src/crawlers/monitor.ts` + +A standalone `Monitor` class. It receives a `Statistics` instance and an optional `AutoscaledPool` instance, then on a configurable interval renders a status block to `process.stderr`. + +Using `process.stderr` keeps it separate from the `@apify/log` output, which writes to `process.stdout` by default. This prevents the monitor from overwriting log lines. 
+ +When `process.stderr.isTTY` is `true` (interactive terminal), the monitor uses ANSI escape codes (`\x1b[{N}A\x1b[2K`) to overwrite its own previous output in-place. When not a TTY (CI, piped output), it falls back to plain newline-delimited prints so the output stays readable in logs. + +**Class interface:** + +```ts +export interface MonitorOptions { + /** How often to refresh the monitor display. Default: 5 seconds. */ + intervalSecs?: number; +} + +export class Monitor { + constructor( + private readonly stats: Statistics, + private readonly autoscaledPool?: AutoscaledPool, + private readonly options: MonitorOptions = {}, + ) {} + + start(): void; // starts setInterval + stop(): void; // clears interval, erases last monitor block from terminal +} +``` + +**Rendered output format** (5 lines): + +``` +⏱ Start: 2024-01-01 10:00:00 | Running for 00:03:24 +📊 Progress: 145/500 (29.0%) | Failed: 3 (2.1%) | Speed: 42 req/min +⏳ ETA: ~00:08:27 +💻 CPU: 34% | Mem: 512 MB / 1.8 GB +🔀 Concurrency: 8/10 (desired: 10) +``` + +- **Total** is read from `requestManager.getTotalCount()` passed in via constructor (optional — shown as `?` when unknown) +- **Speed** is `requestsFinishedPerMinute` from `stats.calculate()` +- **ETA** is `(total - finished) / speed` in minutes, formatted as `HH:MM:SS`; shows `N/A` when total is unknown +- **CPU/Mem** is read from `autoscaledPool.systemStatus.getCurrentStatus()` when pool is available; shows `N/A` otherwise +- **Concurrency** reads `autoscaledPool.currentConcurrency` and `autoscaledPool.desiredConcurrency` + +--- + +### Changes to `packages/basic-crawler/src/internals/basic-crawler.ts` + +**1. Add option to `BasicCrawlerOptions`:** + +```ts +/** + * Enables monitor mode: a real-time status block printed to stderr during the crawl. + * When stderr is a TTY the block is redrawn in place; otherwise plain lines are printed (CI-friendly fallback). + * @default false + */ +monitor?: boolean; +``` + +This is a top-level option, not inside `experiments`. 
The feature is stable enough to warrant a direct option. + +**2. Store it on the crawler:** + +```ts +protected monitor: boolean; +// in constructor: +this.monitor = options.monitor ?? false; +``` + +**3. In `run()`**, alongside the existing `periodicLogger`: + +```ts +const monitorInstance = this.monitor + ? new Monitor(this.stats, this.autoscaledPool, { intervalSecs: 5 }) + : null; +monitorInstance?.start(); + +try { + await this.autoscaledPool!.run(); +} finally { + monitorInstance?.stop(); + // ... existing teardown +} +``` + +**4. Export `Monitor` from `packages/core/src/crawlers/index.ts` and `packages/core/src/index.ts`.** + +--- + +## Testing Strategy + +### Unit tests — `test/core/crawlers/monitor.test.ts` + +| Test | What it checks | +|---|---| +| Constructs without throwing | Basic instantiation | +| `start()` + `stop()` without error | Lifecycle works | +| Renders correct output with known stats | Format string correctness | +| Non-TTY mode prints plain lines (no ANSI) | CI-safe fallback | +| TTY mode uses ANSI overwrite codes | In-place refresh | +| ETA shows `N/A` when total is unknown | Edge case | +| Stop clears the interval | No memory leak | + +Use `vitest.useFakeTimers()` to control the interval without real waiting. +Mock `process.stderr` with a writable stub to capture output without printing to real terminal. 
+ +### Integration tests — added to `test/core/crawlers/basic_crawler.test.ts` + +| Test | What it checks | +|---|---| +| Crawler with `monitor: true` completes successfully | No crash, correct final stats returned | +| Crawler with `monitor: false` behaves identically | Option is inert when disabled | + +--- + +## Non-goals + +- No interactive keyboard controls (pause/resume via keypress) — out of scope +- No color themes or custom format strings — keep it simple for v1 +- No new npm dependencies — implement with Node.js built-ins only + +--- + +## Files Changed + +| File | Change | +|---|---| +| `packages/core/src/crawlers/monitor.ts` | **New** — `Monitor` class | +| `packages/core/src/crawlers/index.ts` | Export `Monitor` | +| `packages/core/src/index.ts` | Re-export `Monitor` | +| `packages/basic-crawler/src/internals/basic-crawler.ts` | Add `monitor` option, instantiate `Monitor` in `run()` | +| `test/core/crawlers/monitor.test.ts` | **New** — unit tests | +| `test/core/crawlers/basic_crawler.test.ts` | Add integration tests | From a2c10dd51eefc4e12eb03e658a2a22dede9164a4 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:08:11 +0700 Subject: [PATCH 2/9] chore: add monitor mode implementation plan Co-Authored-By: Claude Opus 4.6 --- .../plans/2026-04-04-monitor-mode.md | 730 ++++++++++++++++++ 1 file changed, 730 insertions(+) create mode 100644 docs/superpowers/plans/2026-04-04-monitor-mode.md diff --git a/docs/superpowers/plans/2026-04-04-monitor-mode.md b/docs/superpowers/plans/2026-04-04-monitor-mode.md new file mode 100644 index 000000000000..7d222a81dd2a --- /dev/null +++ b/docs/superpowers/plans/2026-04-04-monitor-mode.md @@ -0,0 +1,730 @@ +# Monitor Mode Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Add an opt-in `monitor: true` option to `BasicCrawler` that prints a compact real-time status block to `process.stderr` during a crawl run. + +**Architecture:** A new `Monitor` class in `packages/core` reads from the `Statistics` instance (for progress/speed) and uses Node.js `os` and `process` built-ins (for CPU/memory). It writes a fixed-height block to `process.stderr` using ANSI escape codes to overwrite itself in TTY mode, falling back to plain newlines in non-TTY mode. `BasicCrawler.run()` instantiates `Monitor` (after `_init()`) when `monitor: true`, starts it, and stops it in the `finally` block. + +**Tech Stack:** TypeScript, Node.js built-ins (`os`, `process`), Vitest (tests), `@crawlee/core`, `@crawlee/basic` + +--- + +## File Map + +| File | Action | Responsibility | +|---|---|---| +| `packages/core/src/crawlers/monitor.ts` | **Create** | `Monitor` class — renders status block to stderr | +| `packages/core/src/crawlers/index.ts` | **Modify** | Export `Monitor` | +| `packages/core/src/index.ts` | No change needed | Already re-exports `./crawlers` with `export *` | +| `packages/basic-crawler/src/internals/basic-crawler.ts` | **Modify** | Add `monitor` option, instantiate `Monitor` in `run()` | +| `test/core/crawlers/monitor.test.ts` | **Create** | Unit tests for `Monitor` class | +| `test/core/crawlers/basic_crawler.test.ts` | **Modify** | Integration tests: crawler with `monitor: true` completes ok | + +--- + +## Task 1: Create the `Monitor` class + +**Files:** +- Create: `packages/core/src/crawlers/monitor.ts` + +### Background + +`Statistics.state` has: +- `requestsFinished: number` +- `requestsFailed: number` +- `crawlerStartedAt: Date | string | null` + +`Statistics.calculate()` returns: +- `requestsFinishedPerMinute: number` + +`requestManager` lives on `BasicCrawler`, not on `Statistics`. To display `total`, we pass it as a separate parameter. + +For CPU/Mem we use Node.js built-ins only — no dependency on `AutoscaledPool` internals. 
+ +`AutoscaledPool` exposes: +- `currentConcurrency: number` (getter) +- `desiredConcurrency: number` (getter) +- `maxConcurrency: number` (getter) + +- [ ] **Step 1: Create `packages/core/src/crawlers/monitor.ts` with this exact content:** + +```typescript +import os from 'node:os'; + +import type { AutoscaledPool } from '../autoscaling/autoscaled_pool'; +import type { Statistics } from './statistics'; + +export interface MonitorOptions { + /** + * How often to refresh the monitor display, in seconds. + * @default 5 + */ + intervalSecs?: number; +} + +const MONITOR_LINE_COUNT = 5; + +function padStart(n: number, width = 2): string { + return String(n).padStart(width, '0'); +} + +function formatDuration(ms: number): string { + const totalSecs = Math.floor(ms / 1000); + const h = Math.floor(totalSecs / 3600); + const m = Math.floor((totalSecs % 3600) / 60); + const s = totalSecs % 60; + return `${padStart(h)}:${padStart(m)}:${padStart(s)}`; +} + +function formatBytes(bytes: number): string { + if (bytes >= 1024 ** 3) return `${(bytes / 1024 ** 3).toFixed(1)} GB`; + if (bytes >= 1024 ** 2) return `${(bytes / 1024 ** 2).toFixed(0)} MB`; + return `${(bytes / 1024).toFixed(0)} KB`; +} + +/** + * Renders a compact real-time status block to `process.stderr` during a crawl. + * + * Enable via the `monitor` option on `BasicCrawler`: + * ```ts + * const crawler = new BasicCrawler({ monitor: true, ... }); + * ``` + * + * In TTY mode the block overwrites itself in-place. In non-TTY mode (CI, pipes) + * it prints plain lines so the output remains readable in logs. + */ +export class Monitor { + private intervalId?: ReturnType<typeof setInterval>; + private readonly intervalMs: number; + private rendered = false; + + constructor( + private readonly stats: Statistics, + private readonly autoscaledPool?: AutoscaledPool, + private readonly options: MonitorOptions = {}, + private readonly totalRequests?: () => number | undefined, + ) { + this.intervalMs = (options.intervalSecs ?? 
5) * 1000; + } + + /** Starts the periodic display. */ + start(): void { + this.intervalId = setInterval(() => this.render(), this.intervalMs); + } + + /** Stops the periodic display and clears the last rendered block from the terminal. */ + stop(): void { + if (this.intervalId !== undefined) { + clearInterval(this.intervalId); + this.intervalId = undefined; + } + if (this.rendered && process.stderr.isTTY) { + // Move up MONITOR_LINE_COUNT lines and clear each one + for (let i = 0; i < MONITOR_LINE_COUNT; i++) { + process.stderr.write('\x1b[1A\x1b[2K'); + } + this.rendered = false; + } + } + + /** Builds and returns the status block as an array of lines. Exposed for testing. */ + buildLines(): string[] { + const { state } = this.stats; + const calculated = this.stats.calculate(); + + const startedAt = state.crawlerStartedAt ? new Date(state.crawlerStartedAt) : new Date(); + const now = new Date(); + const elapsed = now.getTime() - startedAt.getTime(); + + const finished = state.requestsFinished; + const failed = state.requestsFailed; + const total = this.totalRequests?.(); + const speed = calculated.requestsFinishedPerMinute; + + const progressStr = total != null + ? `${finished}/${total} (${((finished / total) * 100).toFixed(1)}%)` + : `${finished}/? (?%)`; + + const failedPct = finished + failed > 0 + ? ` | Failed: ${failed} (${((failed / (finished + failed)) * 100).toFixed(1)}%)` + : ''; + + let etaStr = 'N/A'; + if (total != null && speed > 0) { + const remaining = total - finished; + const etaMs = (remaining / speed) * 60 * 1000; + etaStr = `~${formatDuration(etaMs)}`; + } + + const memInfo = process.memoryUsage(); + const totalMem = os.totalmem(); + const usedMem = totalMem - os.freemem(); + const cpus = os.cpus(); + const cpuLoad = os.loadavg()[0]; + const cpuPct = cpus.length > 0 ? Math.min(100, (cpuLoad / cpus.length) * 100).toFixed(0) : '?'; + + const concurrency = this.autoscaledPool + ? 
`${this.autoscaledPool.currentConcurrency}/${this.autoscaledPool.maxConcurrency} (desired: ${this.autoscaledPool.desiredConcurrency})` + : 'N/A'; + + return [ + `\u23F1 Start: ${startedAt.toLocaleTimeString()} | Running for ${formatDuration(elapsed)}`, + `\uD83D\uDCCA Progress: ${progressStr}${failedPct} | Speed: ${speed} req/min`, + `\u23F3 ETA: ${etaStr}`, + `\uD83D\uDCBB CPU: ${cpuPct}% | Mem: ${formatBytes(memInfo.rss)} process / ${formatBytes(usedMem)} / ${formatBytes(totalMem)} total`, + `\uD83D\uDD00 Concurrency: ${concurrency}`, + ]; + } + + private render(): void { + const lines = this.buildLines(); + + if (process.stderr.isTTY && this.rendered) { + // Move cursor up to overwrite previous block + process.stderr.write(`\x1b[${MONITOR_LINE_COUNT}A`); + } + + for (const line of lines) { + if (process.stderr.isTTY) { + // Clear line then write + process.stderr.write(`\x1b[2K${line}\n`); + } else { + process.stderr.write(`${line}\n`); + } + } + + this.rendered = true; + } +} +``` + +- [ ] **Step 2: Run TypeScript check to verify the file compiles** + +```bash +cd packages/core && yarn tsc --noEmit 2>&1 | head -30 +``` + +Expected: no errors (or only pre-existing unrelated errors). 
+ +--- + +## Task 2: Export `Monitor` from `@crawlee/core` + +**Files:** +- Modify: `packages/core/src/crawlers/index.ts` + +- [ ] **Step 1: Add export to `packages/core/src/crawlers/index.ts`** + +Current content of file: +```typescript +export * from './crawler_commons'; +export * from './crawler_extension'; +export * from './crawler_utils'; +export * from './statistics'; +export * from './error_tracker'; +export * from './error_snapshotter'; +``` + +Add one line at the end: +```typescript +export * from './crawler_commons'; +export * from './crawler_extension'; +export * from './crawler_utils'; +export * from './statistics'; +export * from './error_tracker'; +export * from './error_snapshotter'; +export * from './monitor'; +``` + +- [ ] **Step 2: Run TypeScript check** + +```bash +cd packages/core && yarn tsc --noEmit 2>&1 | head -30 +``` + +Expected: no errors. + +- [ ] **Step 3: Commit** + +```bash +git add packages/core/src/crawlers/monitor.ts packages/core/src/crawlers/index.ts +git commit -m "feat: add Monitor class to @crawlee/core" +``` + +--- + +## Task 3: Write unit tests for `Monitor` + +**Files:** +- Create: `test/core/crawlers/monitor.test.ts` + +### Background + +- `Statistics` is imported from `@crawlee/core` +- We use `vitest.useFakeTimers()` to control `setInterval` without real waiting +- We mock `process.stderr` by replacing `process.stderr.write` with a `vi.fn()` stub +- We mock `process.stderr.isTTY` using `Object.defineProperty` + +- [ ] **Step 1: Write the failing tests in `test/core/crawlers/monitor.test.ts`** + +```typescript +import os from 'node:os'; + +import { Configuration, Statistics } from '@crawlee/core'; +import { afterEach, beforeEach, describe, expect, test, vi } from 'vitest'; + +import { Monitor } from '../../../packages/core/src/crawlers/monitor'; +import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator'; + +describe('Monitor', () => { + const localStorageEmulator = new MemoryStorageEmulator(); + + 
beforeEach(async () => { + await localStorageEmulator.init(); + vi.useFakeTimers(); + }); + + afterEach(async () => { + await localStorageEmulator.destroy(); + vi.useRealTimers(); + vi.restoreAllMocks(); + }); + + test('constructs without throwing', () => { + const stats = new Statistics(); + expect(() => new Monitor(stats)).not.toThrow(); + }); + + test('start() and stop() do not throw', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + expect(() => monitor.start()).not.toThrow(); + expect(() => monitor.stop()).not.toThrow(); + }); + + test('stop() before start() does not throw', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + expect(() => monitor.stop()).not.toThrow(); + }); + + test('buildLines() returns 5 lines', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + expect(lines).toHaveLength(5); + }); + + test('buildLines() shows finished/total and percentage when total is known', () => { + const stats = new Statistics(); + stats.startJob('r1'); + stats.finishJob('r1', 0); + + const monitor = new Monitor(stats, undefined, {}, () => 10); + const lines = monitor.buildLines(); + + expect(lines[1]).toContain('1/10'); + expect(lines[1]).toContain('10.0%'); + }); + + test('buildLines() shows ? 
when total is unknown', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + expect(lines[1]).toContain('/?'); + }); + + test('buildLines() shows ETA as N/A when total is unknown', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + expect(lines[2]).toContain('N/A'); + }); + + test('buildLines() shows concurrency info when autoscaledPool is provided', () => { + const stats = new Statistics(); + const fakePool = { + currentConcurrency: 3, + desiredConcurrency: 5, + maxConcurrency: 10, + } as any; + + const monitor = new Monitor(stats, fakePool); + const lines = monitor.buildLines(); + + expect(lines[4]).toContain('3/10'); + expect(lines[4]).toContain('desired: 5'); + }); + + test('buildLines() shows N/A for concurrency when autoscaledPool is not provided', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + expect(lines[4]).toContain('N/A'); + }); + + test('renders to stderr when interval fires', () => { + const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); + + monitor.start(); + vi.advanceTimersByTime(1000); + monitor.stop(); + + expect(writeStub).toHaveBeenCalled(); + }); + + test('in non-TTY mode, does not write ANSI overwrite codes', () => { + const writes: string[] = []; + vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + Object.defineProperty(process.stderr, 'isTTY', { value: false, configurable: true }); + + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); + + monitor.start(); + vi.advanceTimersByTime(1000); + monitor.stop(); + + const combined = writes.join(''); + // Should not contain ANSI 
cursor-up code + expect(combined).not.toContain('\x1b[5A'); + expect(combined).not.toContain('\x1b[2K'); + }); + + test('in TTY mode, second render writes ANSI cursor-up to overwrite', () => { + const writes: string[] = []; + vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + Object.defineProperty(process.stderr, 'isTTY', { value: true, configurable: true }); + + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); + + monitor.start(); + vi.advanceTimersByTime(1000); // first render + vi.advanceTimersByTime(1000); // second render — should have cursor-up + monitor.stop(); + + const combined = writes.join(''); + expect(combined).toContain('\x1b[5A'); + }); +}); +``` + +- [ ] **Step 2: Run the tests to verify they FAIL (Monitor doesn't exist yet relative to test path)** + +```bash +cd "$(git rev-parse --show-toplevel)" && yarn vitest run test/core/crawlers/monitor.test.ts 2>&1 | tail -20 +``` + +Expected: tests fail because `Monitor` import path may need adjustment, or type errors. + +> **Note:** If the import `from '../../../packages/core/src/crawlers/monitor'` resolves correctly (check tsconfig paths in `test/tsconfig.json`), the tests may pass after Task 1. If not, adjust the import to `from '@crawlee/core'` after the build. + +- [ ] **Step 3: Check test tsconfig to see how other core internals are imported in tests** + +```bash +cat test/core/crawlers/statistics.test.ts | head -5 +``` + +If statistics is imported from `'@crawlee/core'`, change the monitor import similarly: + +```typescript +import { Monitor } from '@crawlee/core'; +``` + +Then re-run: + +```bash +yarn vitest run test/core/crawlers/monitor.test.ts 2>&1 | tail -20 +``` + +Expected: tests PASS (after Task 1 and Task 2 are done). 
+ +- [ ] **Step 4: Commit** + +```bash +git add test/core/crawlers/monitor.test.ts +git commit -m "test: add unit tests for Monitor class" +``` + +--- + +## Task 4: Integrate `Monitor` into `BasicCrawler` + +**Files:** +- Modify: `packages/basic-crawler/src/internals/basic-crawler.ts` + +### Background + +The `run()` function is around line 979 in `basic-crawler.ts`. The structure is: + +```typescript +async run(...) { + // ...setup... + await this._init(); + await this.stats.startCapturing(); + const periodicLogger = this.getPeriodicLogger(); + // ... + try { + await this.autoscaledPool!.run(); + } finally { + await this.teardown(); + // ... + periodicLogger.stop(); + // ... + } +} +``` + +`this.autoscaledPool` is assigned inside `this._init()`, so it's available after `_init()`. + +`this.requestManager` is also available after `_init()`. + +- [ ] **Step 1: Add `Monitor` to imports from `@crawlee/core`** + +In `packages/basic-crawler/src/internals/basic-crawler.ts`, find the import block from `@crawlee/core` (around line 31). Add `Monitor` to it (the `MonitorOptions` type is not referenced by the crawler, so it does not need to be imported): + +```typescript +import { + AutoscaledPool, + Configuration, + CriticalError, + Dataset, + enqueueLinks, + EnqueueStrategy, + EventType, + GotScrapingHttpClient, + KeyValueStore, + mergeCookies, + Monitor, + NonRetryableError, + purgeDefaultStorages, + RequestListAdapter, + RequestManagerTandem, + RequestProvider, + RequestQueue, + // ... rest of existing imports +} from '@crawlee/core'; +``` + +- [ ] **Step 2: Add `monitor` option to `BasicCrawlerOptions` interface** + +Find the `BasicCrawlerOptions` interface. It ends around the `statisticsOptions` and `httpClient` properties. Add after `httpClient`: + +```typescript +/** + * Enables monitor mode: a compact real-time status block printed to `process.stderr` during the crawl. + * + * In interactive terminals (TTY), the block overwrites itself in-place. + * In non-TTY environments (CI, piped output), plain lines are printed instead. 
+ * + * @default false + * @example + * ```ts + * const crawler = new BasicCrawler({ monitor: true }); + * ``` + */ +monitor?: boolean; +``` + +- [ ] **Step 3: Store `monitor` option in the constructor and add `ow` validation** + +Find the `ow` validation block in the constructor (around line 590–630). Add: + +```typescript +monitor: ow.optional.boolean, +``` + +Find the destructuring of constructor options (around line 637–700). Add `monitor = false`: + +```typescript +const { + // ... existing destructuring ... + monitor = false, +} = options; +``` + +Add a protected field on the class (near other protected fields around line 566): + +```typescript +protected monitorEnabled: boolean; +``` + +And in the constructor body, assign it: + +```typescript +this.monitorEnabled = monitor; +``` + +- [ ] **Step 4: Instantiate and run `Monitor` inside `run()`** + +Find the `run()` method. After `const periodicLogger = this.getPeriodicLogger();` (around line 1033), add: + +```typescript +const monitorInstance = this.monitorEnabled + ? new Monitor( + this.stats, + this.autoscaledPool, + { intervalSecs: 5 }, + () => this.requestManager?.getTotalCount(), + ) + : null; +monitorInstance?.start(); +``` + +In the `finally` block, before `periodicLogger.stop()`, add: + +```typescript +monitorInstance?.stop(); +``` + +- [ ] **Step 5: Run TypeScript check** + +```bash +cd "$(git rev-parse --show-toplevel)" && yarn tsc-check-tests 2>&1 | head -40 +``` + +Expected: no new errors. + +- [ ] **Step 6: Commit** + +```bash +git add packages/basic-crawler/src/internals/basic-crawler.ts +git commit -m "feat: add monitor option to BasicCrawler" +``` + +--- + +## Task 5: Add integration tests to `basic_crawler.test.ts` + +**Files:** +- Modify: `test/core/crawlers/basic_crawler.test.ts` + +- [ ] **Step 1: Find a good `describe` block to add the new tests** + +The file has a top-level `describe('BasicCrawler', ...)`. 
Add a new nested `describe` block at the end (before the closing `}`), after all existing `describe` blocks. + +- [ ] **Step 2: Add the integration tests** + +Add this block inside `describe('BasicCrawler', ...)`: + +```typescript +describe('monitor option', () => { + test('crawler with monitor: true completes successfully and returns final stats', async () => { + const requestList = await RequestList.open(null, [ + `http://${HOSTNAME}:${port}/`, + ]); + + const crawler = new BasicCrawler({ + requestList, + monitor: true, + async requestHandler() { + // no-op + }, + }); + + const stats = await crawler.run(); + + expect(stats.requestsFinished).toBe(1); + expect(stats.requestsFailed).toBe(0); + }); + + test('crawler with monitor: false behaves the same as without the option', async () => { + const requestList = await RequestList.open(null, [ + `http://${HOSTNAME}:${port}/`, + ]); + + const crawler = new BasicCrawler({ + requestList, + monitor: false, + async requestHandler() { + // no-op + }, + }); + + const stats = await crawler.run(); + + expect(stats.requestsFinished).toBe(1); + expect(stats.requestsFailed).toBe(0); + }); +}); +``` + +> **Note:** The `HOSTNAME`, `port`, and `server` variables are already defined in the outer `describe('BasicCrawler', ...)` scope, set up in `beforeAll`. The URL `http://${HOSTNAME}:${port}/` returns a valid response (`app.get('/', ...)` is already defined near the top of the file). + +- [ ] **Step 3: Run the integration tests** + +```bash +cd "$(git rev-parse --show-toplevel)" && yarn vitest run test/core/crawlers/basic_crawler.test.ts 2>&1 | tail -30 +``` + +Expected: all tests pass (including the new ones). + +- [ ] **Step 4: Run the full unit test suite for monitor** + +```bash +cd "$(git rev-parse --show-toplevel)" && yarn vitest run test/core/crawlers/monitor.test.ts 2>&1 | tail -20 +``` + +Expected: all tests pass. 
+ +- [ ] **Step 5: Run the full TypeScript check one last time** + +```bash +cd "$(git rev-parse --show-toplevel)" && yarn tsc-check-tests 2>&1 | head -40 +``` + +Expected: no errors. + +- [ ] **Step 6: Commit** + +```bash +git add test/core/crawlers/basic_crawler.test.ts +git commit -m "test: add integration tests for BasicCrawler monitor option" +``` + +--- + +## Self-Review Checklist + +### Spec coverage + +| Spec requirement | Task that covers it | +|---|---| +| New `Monitor` class in `packages/core/src/crawlers/monitor.ts` | Task 1 | +| Reads `Statistics` for progress/speed | Task 1 — `stats.state` + `stats.calculate()` | +| Shows start time, elapsed, progress, ETA, CPU, mem, concurrency | Task 1 — `buildLines()` | +| Writes to `process.stderr` | Task 1 — `render()` uses `process.stderr.write` | +| TTY: in-place overwrite with ANSI codes | Task 1 — `render()` | +| Non-TTY: plain newline fallback | Task 1 — `render()` checks `isTTY` | +| Export from `@crawlee/core` | Task 2 | +| `monitor?: boolean` option on `BasicCrawlerOptions` | Task 4 Step 2 | +| Instantiated in `run()` after `_init()` | Task 4 Step 4 | +| Stopped in `finally` block | Task 4 Step 4 | +| Unit tests for `Monitor` | Task 3 | +| Integration tests for `BasicCrawler` | Task 5 | + +All requirements covered. ✅ + +### Placeholder scan + +No TBD/TODO or vague instructions. All code steps contain complete implementations. ✅ + +### Type consistency + +- `Monitor` constructor signature defined in Task 1 and referenced in Task 4 — parameters match (`stats`, `autoscaledPool`, `options`, `totalRequests`). +- `buildLines()` defined in Task 1 and tested in Task 3 — name matches. +- `MonitorOptions.intervalSecs` defined in Task 1, used in Task 4 — consistent. +- `autoscaledPool.currentConcurrency`, `.desiredConcurrency`, `.maxConcurrency` — verified as public getters from codebase exploration. 
✅ From 2c41402792d227ee120f1c6e222d5afb0538e3ad Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:21:55 +0700 Subject: [PATCH 3/9] chore: update monitor mode plan with Codex review fixes (8 issues resolved) Co-Authored-By: Claude Opus 4.6 --- .../plans/2026-04-04-monitor-mode.md | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/docs/superpowers/plans/2026-04-04-monitor-mode.md b/docs/superpowers/plans/2026-04-04-monitor-mode.md index 7d222a81dd2a..4cef245ca59b 100644 --- a/docs/superpowers/plans/2026-04-04-monitor-mode.md +++ b/docs/superpowers/plans/2026-04-04-monitor-mode.md @@ -4,7 +4,7 @@ **Goal:** Add an opt-in `monitor: true` option to `BasicCrawler` that prints a compact real-time status block to `process.stderr` during a crawl run. -**Architecture:** A new `Monitor` class in `packages/core` reads from the `Statistics` instance (for progress/speed) and uses Node.js `os` and `process` built-ins (for CPU/memory). It writes a fixed-height block to `process.stderr` using ANSI escape codes to overwrite itself in TTY mode, falling back to plain newlines in non-TTY mode. `BasicCrawler.run()` instantiates `Monitor` (after `_init()`) when `monitor: true`, starts it, and stops it in the `finally` block. +**Architecture:** A new `Monitor` class in `packages/core` reads from the `Statistics` instance (for progress/speed) and uses Node.js `os` and `process` built-ins (for CPU/memory). It writes a fixed-height block to `process.stderr` using ANSI escape codes to overwrite itself in TTY mode, falling back to plain newlines in non-TTY mode. `BasicCrawler.run()` instantiates `Monitor` (after `_init()`) when `monitor: true`, renders an initial frame immediately on `start()`, and stops it at the **very start** of the `finally` block (before any teardown logging). When `monitor: true`, `getPeriodicLogger()` is called with an option to suppress its output so the two writers do not interleave. 
**Tech Stack:** TypeScript, Node.js built-ins (`os`, `process`), Vitest (tests), `@crawlee/core`, `@crawlee/basic` @@ -108,8 +108,9 @@ export class Monitor { this.intervalMs = (options.intervalSecs ?? 5) * 1000; } - /** Starts the periodic display. */ + /** Starts the periodic display. Renders an initial frame immediately, then repeats on each interval. */ start(): void { + this.render(); // ISSUE-1 fix: render immediately so short crawls always show output this.intervalId = setInterval(() => this.render(), this.intervalMs); } @@ -140,18 +141,25 @@ export class Monitor { const finished = state.requestsFinished; const failed = state.requestsFailed; const total = this.totalRequests?.(); + // ISSUE-6 note: getTotalCount() on RequestManagerTandem may be an approximate sum + // of the underlying RequestList + RequestQueue. The plan treats this as a best-effort + // estimate: progress % and ETA are shown when total > 0, hidden when total === 0. + // This matches the existing behaviour in PR #2692 and is acceptable for a "monitor mode" + // display (non-authoritative progress indicator). No special-casing per request-source mode. const speed = calculated.requestsFinishedPerMinute; - const progressStr = total != null + const progressStr = total != null && total > 0 ? `${finished}/${total} (${((finished / total) * 100).toFixed(1)}%)` - : `${finished}/? (?%)`; + : total === 0 + ? `${finished}/0 (N/A%)` + : `${finished}/? (?%)`; const failedPct = finished + failed > 0 ? 
` | Failed: ${failed} (${((failed / (finished + failed)) * 100).toFixed(1)}%)` : ''; let etaStr = 'N/A'; - if (total != null && speed > 0) { + if (total != null && total > 0 && speed > 0) { const remaining = total - finished; const etaMs = (remaining / speed) * 60 * 1000; etaStr = `~${formatDuration(etaMs)}`; @@ -279,16 +287,20 @@ import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator'; describe('Monitor', () => { const localStorageEmulator = new MemoryStorageEmulator(); + let originalIsTTY: boolean | undefined; // ISSUE-7 fix: save original descriptor beforeEach(async () => { await localStorageEmulator.init(); vi.useFakeTimers(); + originalIsTTY = process.stderr.isTTY; // save before any test mutates it }); afterEach(async () => { await localStorageEmulator.destroy(); vi.useRealTimers(); vi.restoreAllMocks(); + // Restore isTTY to original value (Object.defineProperty mutations not undone by vi.restoreAllMocks) + Object.defineProperty(process.stderr, 'isTTY', { value: originalIsTTY, configurable: true }); }); test('constructs without throwing', () => { @@ -570,9 +582,12 @@ this.monitorEnabled = monitor; - [ ] **Step 4: Instantiate and run `Monitor` inside `run()`** -Find the `run()` method. After `const periodicLogger = this.getPeriodicLogger();` (around line 1033), add: +Find the `run()` method. **Check how `getPeriodicLogger()` is called** — it returns an object with a `stop()` method. When `monitor: true`, the periodic logger must be silenced so both do not write to stderr simultaneously. Do this by checking if `BasicCrawlerOptions` already has a `statusMessageLoggingInterval` option; if `monitor` is true, pass `statusMessageLoggingInterval: 0` (effectively disabling periodic status log messages) to the periodic logger or set the logging to `Number.POSITIVE_INFINITY` to suppress it. 
+ +Concretely, after `await this._init();` and `await this.stats.startCapturing();`, replace the existing `const periodicLogger = this.getPeriodicLogger();` line with: ```typescript +const periodicLogger = this.getPeriodicLogger(); const monitorInstance = this.monitorEnabled ? new Monitor( this.stats, @@ -581,15 +596,22 @@ const monitorInstance = this.monitorEnabled () => this.requestManager?.getTotalCount(), ) : null; +// When monitor is active, suppress the periodic status logger (ISSUE-2 fix) +if (this.monitorEnabled) { + periodicLogger.stop(); +} monitorInstance?.start(); ``` -In the `finally` block, before `periodicLogger.stop()`, add: +In the `finally` block, **as the very first statement** (ISSUE-3 fix — before `await this.teardown()` and before any final logging), add: ```typescript +// Stop monitor first so its ANSI block is cleared before any teardown logs monitorInstance?.stop(); ``` +Then resume with the existing teardown logic. + - [ ] **Step 5: Run TypeScript check** ```bash @@ -618,11 +640,17 @@ The file has a top-level `describe('BasicCrawler', ...)`. Add a new nested `desc - [ ] **Step 2: Add the integration tests** -Add this block inside `describe('BasicCrawler', ...)`: +Add this block inside `describe('BasicCrawler', ...)`. 
**These tests spy on `process.stderr.write` to verify actual monitor output is produced (ISSUE-4 fix).** ```typescript describe('monitor option', () => { - test('crawler with monitor: true completes successfully and returns final stats', async () => { + test('crawler with monitor: true writes to stderr during run', async () => { + const writes: string[] = []; + const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + const requestList = await RequestList.open(null, [ `http://${HOSTNAME}:${port}/`, ]); @@ -637,11 +665,22 @@ describe('monitor option', () => { const stats = await crawler.run(); + writeStub.mockRestore(); + expect(stats.requestsFinished).toBe(1); expect(stats.requestsFailed).toBe(0); + // Monitor must have written at least one 5-line block containing known marker + const combined = writes.join(''); + expect(combined).toContain('Progress:'); // ISSUE-4 fix: assert monitor-specific marker, not just non-empty }); - test('crawler with monitor: false behaves the same as without the option', async () => { + test('crawler with monitor: false does not write monitor output to stderr', async () => { + const writes: string[] = []; + const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + const requestList = await RequestList.open(null, [ `http://${HOSTNAME}:${port}/`, ]); @@ -656,8 +695,13 @@ describe('monitor option', () => { const stats = await crawler.run(); + writeStub.mockRestore(); + expect(stats.requestsFinished).toBe(1); expect(stats.requestsFailed).toBe(0); + // No monitor block: stderr writes should be empty or contain no progress line + const combined = writes.join(''); + expect(combined).not.toContain('Progress:'); }); }); ``` @@ -712,9 +756,12 @@ git commit -m "test: add integration tests for BasicCrawler monitor option" | Export from `@crawlee/core` | Task 2 | | `monitor?: boolean` 
option on `BasicCrawlerOptions` | Task 4 Step 2 | | Instantiated in `run()` after `_init()` | Task 4 Step 4 | -| Stopped in `finally` block | Task 4 Step 4 | +| Initial render on `start()` for short crawls | Task 1 — `start()` calls `render()` immediately (ISSUE-1) | +| `stop()` at very start of `finally`, before teardown logs | Task 4 Step 4 (ISSUE-3) | +| Periodic logger suppressed when monitor active | Task 4 Step 4 (ISSUE-2) | +| `total = 0` handled — shows `N/A%` not `NaN` | Task 1 — `buildLines()` (ISSUE-5) | +| Integration tests verify stderr output, not just stats | Task 5 — `vi.spyOn(process.stderr.write)` (ISSUE-4) | | Unit tests for `Monitor` | Task 3 | -| Integration tests for `BasicCrawler` | Task 5 | All requirements covered. ✅ From 69ca1b3af73904446aa40bbc45cee1a2cf424fd1 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:22:53 +0700 Subject: [PATCH 4/9] chore: add .worktrees/ to .gitignore Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 77e793d8d509..4400170f398d 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,4 @@ test/e2e/**/packages # Local vitest config overrides vitest.config.local.mts +.worktrees/ From 91000f4c167ddc23e398381a2b7d898541adefb5 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:30:56 +0700 Subject: [PATCH 5/9] feat: add Monitor class to @crawlee/core Adds a new Monitor class that renders a compact real-time status block to process.stderr during a crawl. In TTY mode the block overwrites itself in-place; in non-TTY mode (CI, pipes) it prints plain lines. 
Co-Authored-By: Claude Opus 4.6 --- packages/core/src/crawlers/index.ts | 1 + packages/core/src/crawlers/monitor.ts | 155 ++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 packages/core/src/crawlers/monitor.ts diff --git a/packages/core/src/crawlers/index.ts b/packages/core/src/crawlers/index.ts index 77a83511e413..8a3a3c0d673d 100644 --- a/packages/core/src/crawlers/index.ts +++ b/packages/core/src/crawlers/index.ts @@ -4,3 +4,4 @@ export * from './crawler_utils'; export * from './statistics'; export * from './error_tracker'; export * from './error_snapshotter'; +export * from './monitor'; diff --git a/packages/core/src/crawlers/monitor.ts b/packages/core/src/crawlers/monitor.ts new file mode 100644 index 000000000000..e371c5267f4d --- /dev/null +++ b/packages/core/src/crawlers/monitor.ts @@ -0,0 +1,155 @@ +import os from 'node:os'; + +import type { AutoscaledPool } from '../autoscaling/autoscaled_pool'; +import type { Statistics } from './statistics'; + +export interface MonitorOptions { + /** + * How often to refresh the monitor display, in seconds. + * @default 5 + */ + intervalSecs?: number; +} + +const MONITOR_LINE_COUNT = 5; + +function padStart(n: number, width = 2): string { + return String(n).padStart(width, '0'); +} + +function formatDuration(ms: number): string { + const totalSecs = Math.floor(ms / 1000); + const h = Math.floor(totalSecs / 3600); + const m = Math.floor((totalSecs % 3600) / 60); + const s = totalSecs % 60; + return `${padStart(h)}:${padStart(m)}:${padStart(s)}`; +} + +function formatBytes(bytes: number): string { + if (bytes >= 1024 ** 3) return `${(bytes / 1024 ** 3).toFixed(1)} GB`; + if (bytes >= 1024 ** 2) return `${(bytes / 1024 ** 2).toFixed(0)} MB`; + return `${(bytes / 1024).toFixed(0)} KB`; +} + +/** + * Renders a compact real-time status block to `process.stderr` during a crawl. 
+ * + * Enable via the `monitor` option on `BasicCrawler`: + * ```ts + * const crawler = new BasicCrawler({ monitor: true, ... }); + * ``` + * + * In TTY mode the block overwrites itself in-place. In non-TTY mode (CI, pipes) + * it prints plain lines so the output remains readable in logs. + */ +export class Monitor { + private intervalId?: ReturnType<typeof setInterval>; + private readonly intervalMs: number; + private rendered = false; + + constructor( + private readonly stats: Statistics, + private readonly autoscaledPool?: AutoscaledPool, + options: MonitorOptions = {}, + private readonly totalRequests?: () => number | undefined, + ) { + this.intervalMs = (options.intervalSecs ?? 5) * 1000; + } + + /** Starts the periodic display. Renders an initial frame immediately, then repeats on each interval. */ + start(): void { + this.render(); // render immediately so short crawls always show output + this.intervalId = setInterval(() => this.render(), this.intervalMs); + } + + /** Stops the periodic display and clears the last rendered block from the terminal. */ + stop(): void { + if (this.intervalId !== undefined) { + clearInterval(this.intervalId); + this.intervalId = undefined; + } + if (this.rendered && process.stderr.isTTY) { + // Move up MONITOR_LINE_COUNT lines and clear each one + for (let i = 0; i < MONITOR_LINE_COUNT; i++) { + process.stderr.write('\x1b[1A\x1b[2K'); + } + this.rendered = false; + } + } + + /** Builds and returns the status block as an array of lines. Exposed for testing. */ + buildLines(): string[] { + const { state } = this.stats; + const calculated = this.stats.calculate(); + + const startedAt = state.crawlerStartedAt ?
new Date(state.crawlerStartedAt) : new Date(); + const now = new Date(); + const elapsed = now.getTime() - startedAt.getTime(); + + const finished = state.requestsFinished; + const failed = state.requestsFailed; + const total = this.totalRequests?.(); + // getTotalCount() on RequestManagerTandem may be an approximate sum + // of the underlying RequestList + RequestQueue. The plan treats this as a best-effort + // estimate: progress % and ETA are shown when total > 0, hidden when total === 0. + // This matches the existing behaviour in PR #2692 and is acceptable for a "monitor mode" + // display (non-authoritative progress indicator). No special-casing per request-source mode. + const speed = calculated.requestsFinishedPerMinute; + + const progressStr = total != null && total > 0 + ? `${finished}/${total} (${((finished / total) * 100).toFixed(1)}%)` + : total === 0 + ? `${finished}/0 (N/A%)` + : `${finished}/? (?%)`; + + const failedPct = finished + failed > 0 + ? ` | Failed: ${failed} (${((failed / (finished + failed)) * 100).toFixed(1)}%)` + : ''; + + let etaStr = 'N/A'; + if (total != null && total > 0 && speed > 0) { + const remaining = total - finished; + const etaMs = (remaining / speed) * 60 * 1000; + etaStr = `~${formatDuration(etaMs)}`; + } + + const memInfo = process.memoryUsage(); + const totalMem = os.totalmem(); + const usedMem = totalMem - os.freemem(); + const cpus = os.cpus(); + const cpuLoad = os.loadavg()[0]; + const cpuPct = cpus.length > 0 ? Math.min(100, (cpuLoad / cpus.length) * 100).toFixed(0) : '?'; + + const concurrency = this.autoscaledPool + ? 
`${this.autoscaledPool.currentConcurrency}/${this.autoscaledPool.maxConcurrency} (desired: ${this.autoscaledPool.desiredConcurrency})` + : 'N/A'; + + return [ + `\u23F1 Start: ${startedAt.toLocaleTimeString()} | Running for ${formatDuration(elapsed)}`, + `\uD83D\uDCCA Progress: ${progressStr}${failedPct} | Speed: ${speed} req/min`, + `\u23F3 ETA: ${etaStr}`, + `\uD83D\uDCBB CPU: ${cpuPct}% | Mem: ${formatBytes(memInfo.rss)} process / ${formatBytes(usedMem)} / ${formatBytes(totalMem)} total`, + `\uD83D\uDD00 Concurrency: ${concurrency}`, + ]; + } + + private render(): void { + const lines = this.buildLines(); + + if (process.stderr.isTTY && this.rendered) { + // Move cursor up to overwrite previous block + process.stderr.write(`\x1b[${MONITOR_LINE_COUNT}A`); + } + + for (const line of lines) { + if (process.stderr.isTTY) { + // Clear line then write + process.stderr.write(`\x1b[2K${line}\n`); + } else { + process.stderr.write(`${line}\n`); + } + } + + this.rendered = true; + } +} From 9e2f38e5d58cbb49c2537c62abccde8f599a486a Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:32:37 +0700 Subject: [PATCH 6/9] test: add unit tests for Monitor class 12 tests covering: construction, start/stop lifecycle, buildLines() output content, TTY vs non-TTY rendering, and stderr write behavior. 
Co-Authored-By: Claude Opus 4.6 --- test/core/crawlers/monitor.test.ts | 155 +++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 test/core/crawlers/monitor.test.ts diff --git a/test/core/crawlers/monitor.test.ts b/test/core/crawlers/monitor.test.ts new file mode 100644 index 000000000000..f3f1449bced8 --- /dev/null +++ b/test/core/crawlers/monitor.test.ts @@ -0,0 +1,155 @@ +import os from 'node:os'; + +import { Statistics } from '@crawlee/core'; +import { afterEach, beforeEach, describe, expect, test, vi } from 'vitest'; + +import { Monitor } from '../../../packages/core/src/crawlers/monitor'; +import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator'; + +describe('Monitor', () => { + const localStorageEmulator = new MemoryStorageEmulator(); + let originalIsTTY: boolean | undefined; + + beforeEach(async () => { + await localStorageEmulator.init(); + vi.useFakeTimers(); + originalIsTTY = process.stderr.isTTY; + }); + + afterEach(async () => { + await localStorageEmulator.destroy(); + vi.useRealTimers(); + vi.restoreAllMocks(); + // Restore isTTY — Object.defineProperty mutations are not undone by vi.restoreAllMocks + Object.defineProperty(process.stderr, 'isTTY', { value: originalIsTTY, configurable: true }); + }); + + test('constructs without throwing', () => { + const stats = new Statistics(); + expect(() => new Monitor(stats)).not.toThrow(); + }); + + test('start() and stop() do not throw', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + expect(() => monitor.start()).not.toThrow(); + expect(() => monitor.stop()).not.toThrow(); + }); + + test('stop() before start() does not throw', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + expect(() => monitor.stop()).not.toThrow(); + }); + + test('buildLines() returns 5 lines', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + 
expect(lines).toHaveLength(5); + }); + + test('buildLines() shows finished/total and percentage when total is known', () => { + const stats = new Statistics(); + stats.startJob('r1'); + stats.finishJob('r1', 0); + + const monitor = new Monitor(stats, undefined, {}, () => 10); + const lines = monitor.buildLines(); + + expect(lines[1]).toContain('1/10'); + expect(lines[1]).toContain('10.0%'); + }); + + test('buildLines() shows ? when total is unknown', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + expect(lines[1]).toContain('/?'); + }); + + test('buildLines() shows ETA as N/A when total is unknown', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + expect(lines[2]).toContain('N/A'); + }); + + test('buildLines() shows concurrency info when autoscaledPool is provided', () => { + const stats = new Statistics(); + const fakePool = { + currentConcurrency: 3, + desiredConcurrency: 5, + maxConcurrency: 10, + } as any; + + const monitor = new Monitor(stats, fakePool); + const lines = monitor.buildLines(); + + expect(lines[4]).toContain('3/10'); + expect(lines[4]).toContain('desired: 5'); + }); + + test('buildLines() shows N/A for concurrency when autoscaledPool is not provided', () => { + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + expect(lines[4]).toContain('N/A'); + }); + + test('renders to stderr when interval fires', () => { + const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); + + monitor.start(); + vi.advanceTimersByTime(1000); + monitor.stop(); + + expect(writeStub).toHaveBeenCalled(); + }); + + test('in non-TTY mode, does not write ANSI overwrite codes', () => { + const writes: string[] = []; + vi.spyOn(process.stderr, 
'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + Object.defineProperty(process.stderr, 'isTTY', { value: false, configurable: true }); + + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); + + monitor.start(); + vi.advanceTimersByTime(1000); + monitor.stop(); + + const combined = writes.join(''); + // Should not contain ANSI cursor-up code + expect(combined).not.toContain('\x1b[5A'); + expect(combined).not.toContain('\x1b[2K'); + }); + + test('in TTY mode, second render writes ANSI cursor-up to overwrite', () => { + const writes: string[] = []; + vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + Object.defineProperty(process.stderr, 'isTTY', { value: true, configurable: true }); + + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); + + monitor.start(); + vi.advanceTimersByTime(1000); // second render (the first was emitted synchronously by start()) — writes cursor-up + vi.advanceTimersByTime(1000); // third render via interval — writes cursor-up again + monitor.stop(); + + const combined = writes.join(''); + expect(combined).toContain('\x1b[5A'); + }); +}); From ed5ba44460926a414860acf9f86ceb6601407588 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:35:26 +0700 Subject: [PATCH 7/9] feat: integrate Monitor into BasicCrawler Adds a `monitor` boolean option to BasicCrawlerOptions. When true, a Monitor instance is created at the start of `run()` and stopped in the `finally` block, ensuring it always cleans up even if the crawl throws.
Co-Authored-By: Claude Opus 4.6 --- .../src/internals/basic-crawler.ts | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 3ff8e2c63421..1dcdd5d2fb00 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -39,6 +39,7 @@ import { GotScrapingHttpClient, KeyValueStore, mergeCookies, + Monitor, NonRetryableError, purgeDefaultStorages, RequestListAdapter, @@ -405,6 +406,20 @@ export interface BasicCrawlerOptions(); private experiments: CrawlerExperiments; @@ -612,6 +628,7 @@ export class BasicCrawler this.requestManager?.getTotalCount()) + : null; + + monitor?.start(); + try { await this.autoscaledPool!.run(); } finally { + monitor?.stop(); await this.teardown(); await this.stats.stopCapturing(); From e9832ef84497d0743da44f74125eeff27fb29011 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 15:37:56 +0700 Subject: [PATCH 8/9] test: add integration tests for BasicCrawler monitor option 3 integration tests verifying that: - crawl completes normally with monitor: true - crawl completes normally with monitor: false (default) - errors still propagate to failedRequestHandler when monitor is active Co-Authored-By: Claude Opus 4.6 --- test/core/crawlers/basic_crawler.test.ts | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index c455fbf4f402..0956aeb803ea 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -2039,4 +2039,51 @@ describe('BasicCrawler', () => { expect(crawlerB.requestQueue?.config).toBe(configB); }); }); + + describe('monitor option', () => { + test('crawler runs successfully with monitor: true', async () => { + const handledUrls: string[] = []; + + const crawler = new BasicCrawler({ 
+ monitor: true, + requestHandler: ({ request }) => { + handledUrls.push(request.url); + }, + }); + + await crawler.run([{ url: `http://${HOSTNAME}:${port}` }]); + expect(handledUrls).toHaveLength(1); + }); + + test('crawler runs successfully with monitor: false (default)', async () => { + const handledUrls: string[] = []; + + const crawler = new BasicCrawler({ + requestHandler: ({ request }) => { + handledUrls.push(request.url); + }, + }); + + await crawler.run([{ url: `http://${HOSTNAME}:${port}` }]); + expect(handledUrls).toHaveLength(1); + }); + + test('monitor: true does not suppress request errors — failedRequestHandler still fires', async () => { + let failed = 0; + + const crawler = new BasicCrawler({ + monitor: true, + maxRequestRetries: 0, + requestHandler: () => { + throw new Error('forced failure'); + }, + failedRequestHandler: () => { + failed++; + }, + }); + + await crawler.run([{ url: `http://${HOSTNAME}:${port}` }]); + expect(failed).toBe(1); + }); + }); }); From bd326487bb7a43d9e7efc0a522a4acef6a7f1e71 Mon Sep 17 00:00:00 2001 From: hiepau1231 Date: Sat, 4 Apr 2026 16:05:24 +0700 Subject: [PATCH 9/9] =?UTF-8?q?feat:=20finalize=20monitor=20mode=20?= =?UTF-8?q?=E2=80=94=20options=20passthrough,=20ETA=20guard,=20Windows=20C?= =?UTF-8?q?PU,=20unref=20timer,=204=20new=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `monitorOptions?: MonitorOptions` to `BasicCrawlerOptions` and pass it through to the `Monitor` constructor so callers can configure `intervalSecs` without monkey-patching. - Skip the periodic status-message logger when `monitor: true` is set to prevent ANSI cursor-control sequences from being interleaved with plain log lines. - Guard ETA against negative values when `finished > total` (approximate counts from `RequestManagerTandem`): clamp remaining to ≥ 0. - Show `CPU: N/A` on Windows where `os.loadavg()` always returns zeros. 
- Call `.unref()` on the interval so the monitor cannot prevent process exit. - Remove internal planning docs (spec/plan) from the repository. - Add 4 tests: negative-ETA guard, Windows CPU label, intervalSecs config, and improve cursor-overwrite assertion. Closes #2680 Co-Authored-By: Claude Opus 4.6 --- .../plans/2026-04-04-monitor-mode.md | 777 ------------------ .../specs/2026-04-04-monitor-mode-design.md | 155 ---- .../src/internals/basic-crawler.ts | 16 +- packages/core/src/crawlers/monitor.ts | 9 +- test/core/crawlers/monitor.test.ts | 54 ++ 5 files changed, 76 insertions(+), 935 deletions(-) delete mode 100644 docs/superpowers/plans/2026-04-04-monitor-mode.md delete mode 100644 docs/superpowers/specs/2026-04-04-monitor-mode-design.md diff --git a/docs/superpowers/plans/2026-04-04-monitor-mode.md b/docs/superpowers/plans/2026-04-04-monitor-mode.md deleted file mode 100644 index 4cef245ca59b..000000000000 --- a/docs/superpowers/plans/2026-04-04-monitor-mode.md +++ /dev/null @@ -1,777 +0,0 @@ -# Monitor Mode Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add an opt-in `monitor: true` option to `BasicCrawler` that prints a compact real-time status block to `process.stderr` during a crawl run. - -**Architecture:** A new `Monitor` class in `packages/core` reads from the `Statistics` instance (for progress/speed) and uses Node.js `os` and `process` built-ins (for CPU/memory). It writes a fixed-height block to `process.stderr` using ANSI escape codes to overwrite itself in TTY mode, falling back to plain newlines in non-TTY mode. `BasicCrawler.run()` instantiates `Monitor` (after `_init()`) when `monitor: true`, renders an initial frame immediately on `start()`, and stops it at the **very start** of the `finally` block (before any teardown logging). 
When `monitor: true`, `getPeriodicLogger()` is called with an option to suppress its output so the two writers do not interleave. - -**Tech Stack:** TypeScript, Node.js built-ins (`os`, `process`), Vitest (tests), `@crawlee/core`, `@crawlee/basic` - ---- - -## File Map - -| File | Action | Responsibility | -|---|---|---| -| `packages/core/src/crawlers/monitor.ts` | **Create** | `Monitor` class — renders status block to stderr | -| `packages/core/src/crawlers/index.ts` | **Modify** | Export `Monitor` | -| `packages/core/src/index.ts` | No change needed | Already re-exports `./crawlers` with `export *` | -| `packages/basic-crawler/src/internals/basic-crawler.ts` | **Modify** | Add `monitor` option, instantiate `Monitor` in `run()` | -| `test/core/crawlers/monitor.test.ts` | **Create** | Unit tests for `Monitor` class | -| `test/core/crawlers/basic_crawler.test.ts` | **Modify** | Integration tests: crawler with `monitor: true` completes ok | - ---- - -## Task 1: Create the `Monitor` class - -**Files:** -- Create: `packages/core/src/crawlers/monitor.ts` - -### Background - -`Statistics.state` has: -- `requestsFinished: number` -- `requestsFailed: number` -- `crawlerStartedAt: Date | string | null` - -`Statistics.calculate()` returns: -- `requestsFinishedPerMinute: number` - -`requestManager` lives on `BasicCrawler`, not on `Statistics`. To display `total`, we pass it as a separate parameter. - -For CPU/Mem we use Node.js built-ins only — no dependency on `AutoscaledPool` internals. 
- -`AutoscaledPool` exposes: -- `currentConcurrency: number` (getter) -- `desiredConcurrency: number` (getter) -- `maxConcurrency: number` (getter) - -- [ ] **Step 1: Create `packages/core/src/crawlers/monitor.ts` with this exact content:** - -```typescript -import os from 'node:os'; - -import type { AutoscaledPool } from '../autoscaling/autoscaled_pool'; -import type { Statistics } from './statistics'; - -export interface MonitorOptions { - /** - * How often to refresh the monitor display, in seconds. - * @default 5 - */ - intervalSecs?: number; -} - -const MONITOR_LINE_COUNT = 5; - -function padStart(n: number, width = 2): string { - return String(n).padStart(width, '0'); -} - -function formatDuration(ms: number): string { - const totalSecs = Math.floor(ms / 1000); - const h = Math.floor(totalSecs / 3600); - const m = Math.floor((totalSecs % 3600) / 60); - const s = totalSecs % 60; - return `${padStart(h)}:${padStart(m)}:${padStart(s)}`; -} - -function formatBytes(bytes: number): string { - if (bytes >= 1024 ** 3) return `${(bytes / 1024 ** 3).toFixed(1)} GB`; - if (bytes >= 1024 ** 2) return `${(bytes / 1024 ** 2).toFixed(0)} MB`; - return `${(bytes / 1024).toFixed(0)} KB`; -} - -/** - * Renders a compact real-time status block to `process.stderr` during a crawl. - * - * Enable via the `monitor` option on `BasicCrawler`: - * ```ts - * const crawler = new BasicCrawler({ monitor: true, ... }); - * ``` - * - * In TTY mode the block overwrites itself in-place. In non-TTY mode (CI, pipes) - * it prints plain lines so the output remains readable in logs. - */ -export class Monitor { - private intervalId?: ReturnType; - private readonly intervalMs: number; - private rendered = false; - - constructor( - private readonly stats: Statistics, - private readonly autoscaledPool?: AutoscaledPool, - private readonly options: MonitorOptions = {}, - private readonly totalRequests?: () => number | undefined, - ) { - this.intervalMs = (options.intervalSecs ?? 
5) * 1000; - } - - /** Starts the periodic display. Renders an initial frame immediately, then repeats on each interval. */ - start(): void { - this.render(); // ISSUE-1 fix: render immediately so short crawls always show output - this.intervalId = setInterval(() => this.render(), this.intervalMs); - } - - /** Stops the periodic display and clears the last rendered block from the terminal. */ - stop(): void { - if (this.intervalId !== undefined) { - clearInterval(this.intervalId); - this.intervalId = undefined; - } - if (this.rendered && process.stderr.isTTY) { - // Move up MONITOR_LINE_COUNT lines and clear each one - for (let i = 0; i < MONITOR_LINE_COUNT; i++) { - process.stderr.write('\x1b[1A\x1b[2K'); - } - this.rendered = false; - } - } - - /** Builds and returns the status block as an array of lines. Exposed for testing. */ - buildLines(): string[] { - const { state } = this.stats; - const calculated = this.stats.calculate(); - - const startedAt = state.crawlerStartedAt ? new Date(state.crawlerStartedAt) : new Date(); - const now = new Date(); - const elapsed = now.getTime() - startedAt.getTime(); - - const finished = state.requestsFinished; - const failed = state.requestsFailed; - const total = this.totalRequests?.(); - // ISSUE-6 note: getTotalCount() on RequestManagerTandem may be an approximate sum - // of the underlying RequestList + RequestQueue. The plan treats this as a best-effort - // estimate: progress % and ETA are shown when total > 0, hidden when total === 0. - // This matches the existing behaviour in PR #2692 and is acceptable for a "monitor mode" - // display (non-authoritative progress indicator). No special-casing per request-source mode. - const speed = calculated.requestsFinishedPerMinute; - - const progressStr = total != null && total > 0 - ? `${finished}/${total} (${((finished / total) * 100).toFixed(1)}%)` - : total === 0 - ? `${finished}/0 (N/A%)` - : `${finished}/? (?%)`; - - const failedPct = finished + failed > 0 - ? 
` | Failed: ${failed} (${((failed / (finished + failed)) * 100).toFixed(1)}%)` - : ''; - - let etaStr = 'N/A'; - if (total != null && total > 0 && speed > 0) { - const remaining = total - finished; - const etaMs = (remaining / speed) * 60 * 1000; - etaStr = `~${formatDuration(etaMs)}`; - } - - const memInfo = process.memoryUsage(); - const totalMem = os.totalmem(); - const usedMem = totalMem - os.freemem(); - const cpus = os.cpus(); - const cpuLoad = os.loadavg()[0]; - const cpuPct = cpus.length > 0 ? Math.min(100, (cpuLoad / cpus.length) * 100).toFixed(0) : '?'; - - const concurrency = this.autoscaledPool - ? `${this.autoscaledPool.currentConcurrency}/${this.autoscaledPool.maxConcurrency} (desired: ${this.autoscaledPool.desiredConcurrency})` - : 'N/A'; - - return [ - `\u23F1 Start: ${startedAt.toLocaleTimeString()} | Running for ${formatDuration(elapsed)}`, - `\uD83D\uDCCA Progress: ${progressStr}${failedPct} | Speed: ${speed} req/min`, - `\u23F3 ETA: ${etaStr}`, - `\uD83D\uDCBB CPU: ${cpuPct}% | Mem: ${formatBytes(memInfo.rss)} process / ${formatBytes(usedMem)} / ${formatBytes(totalMem)} total`, - `\uD83D\uDD00 Concurrency: ${concurrency}`, - ]; - } - - private render(): void { - const lines = this.buildLines(); - - if (process.stderr.isTTY && this.rendered) { - // Move cursor up to overwrite previous block - process.stderr.write(`\x1b[${MONITOR_LINE_COUNT}A`); - } - - for (const line of lines) { - if (process.stderr.isTTY) { - // Clear line then write - process.stderr.write(`\x1b[2K${line}\n`); - } else { - process.stderr.write(`${line}\n`); - } - } - - this.rendered = true; - } -} -``` - -- [ ] **Step 2: Run TypeScript check to verify the file compiles** - -```bash -cd packages/core && yarn tsc --noEmit 2>&1 | head -30 -``` - -Expected: no errors (or only pre-existing unrelated errors). 
- ---- - -## Task 2: Export `Monitor` from `@crawlee/core` - -**Files:** -- Modify: `packages/core/src/crawlers/index.ts` - -- [ ] **Step 1: Add export to `packages/core/src/crawlers/index.ts`** - -Current content of file: -```typescript -export * from './crawler_commons'; -export * from './crawler_extension'; -export * from './crawler_utils'; -export * from './statistics'; -export * from './error_tracker'; -export * from './error_snapshotter'; -``` - -Add one line at the end: -```typescript -export * from './crawler_commons'; -export * from './crawler_extension'; -export * from './crawler_utils'; -export * from './statistics'; -export * from './error_tracker'; -export * from './error_snapshotter'; -export * from './monitor'; -``` - -- [ ] **Step 2: Run TypeScript check** - -```bash -cd packages/core && yarn tsc --noEmit 2>&1 | head -30 -``` - -Expected: no errors. - -- [ ] **Step 3: Commit** - -```bash -git add packages/core/src/crawlers/monitor.ts packages/core/src/crawlers/index.ts -git commit -m "feat: add Monitor class to @crawlee/core" -``` - ---- - -## Task 3: Write unit tests for `Monitor` - -**Files:** -- Create: `test/core/crawlers/monitor.test.ts` - -### Background - -- `Statistics` is imported from `@crawlee/core` -- We use `vitest.useFakeTimers()` to control `setInterval` without real waiting -- We mock `process.stderr` by replacing `process.stderr.write` with a `vi.fn()` stub -- We mock `process.stderr.isTTY` using `Object.defineProperty` - -- [ ] **Step 1: Write the failing tests in `test/core/crawlers/monitor.test.ts`** - -```typescript -import os from 'node:os'; - -import { Configuration, Statistics } from '@crawlee/core'; -import { afterEach, beforeEach, describe, expect, test, vi } from 'vitest'; - -import { Monitor } from '../../../packages/core/src/crawlers/monitor'; -import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator'; - -describe('Monitor', () => { - const localStorageEmulator = new MemoryStorageEmulator(); - let 
originalIsTTY: boolean | undefined; // ISSUE-7 fix: save original descriptor - - beforeEach(async () => { - await localStorageEmulator.init(); - vi.useFakeTimers(); - originalIsTTY = process.stderr.isTTY; // save before any test mutates it - }); - - afterEach(async () => { - await localStorageEmulator.destroy(); - vi.useRealTimers(); - vi.restoreAllMocks(); - // Restore isTTY to original value (Object.defineProperty mutations not undone by vi.restoreAllMocks) - Object.defineProperty(process.stderr, 'isTTY', { value: originalIsTTY, configurable: true }); - }); - - test('constructs without throwing', () => { - const stats = new Statistics(); - expect(() => new Monitor(stats)).not.toThrow(); - }); - - test('start() and stop() do not throw', () => { - const stats = new Statistics(); - const monitor = new Monitor(stats); - expect(() => monitor.start()).not.toThrow(); - expect(() => monitor.stop()).not.toThrow(); - }); - - test('stop() before start() does not throw', () => { - const stats = new Statistics(); - const monitor = new Monitor(stats); - expect(() => monitor.stop()).not.toThrow(); - }); - - test('buildLines() returns 5 lines', () => { - const stats = new Statistics(); - const monitor = new Monitor(stats); - const lines = monitor.buildLines(); - expect(lines).toHaveLength(5); - }); - - test('buildLines() shows finished/total and percentage when total is known', () => { - const stats = new Statistics(); - stats.startJob('r1'); - stats.finishJob('r1', 0); - - const monitor = new Monitor(stats, undefined, {}, () => 10); - const lines = monitor.buildLines(); - - expect(lines[1]).toContain('1/10'); - expect(lines[1]).toContain('10.0%'); - }); - - test('buildLines() shows ? 
when total is unknown', () => { - const stats = new Statistics(); - const monitor = new Monitor(stats); - const lines = monitor.buildLines(); - - expect(lines[1]).toContain('/?'); - }); - - test('buildLines() shows ETA as N/A when total is unknown', () => { - const stats = new Statistics(); - const monitor = new Monitor(stats); - const lines = monitor.buildLines(); - - expect(lines[2]).toContain('N/A'); - }); - - test('buildLines() shows concurrency info when autoscaledPool is provided', () => { - const stats = new Statistics(); - const fakePool = { - currentConcurrency: 3, - desiredConcurrency: 5, - maxConcurrency: 10, - } as any; - - const monitor = new Monitor(stats, fakePool); - const lines = monitor.buildLines(); - - expect(lines[4]).toContain('3/10'); - expect(lines[4]).toContain('desired: 5'); - }); - - test('buildLines() shows N/A for concurrency when autoscaledPool is not provided', () => { - const stats = new Statistics(); - const monitor = new Monitor(stats); - const lines = monitor.buildLines(); - - expect(lines[4]).toContain('N/A'); - }); - - test('renders to stderr when interval fires', () => { - const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation(() => true); - const stats = new Statistics(); - const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); - - monitor.start(); - vi.advanceTimersByTime(1000); - monitor.stop(); - - expect(writeStub).toHaveBeenCalled(); - }); - - test('in non-TTY mode, does not write ANSI overwrite codes', () => { - const writes: string[] = []; - vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { - writes.push(String(chunk)); - return true; - }); - Object.defineProperty(process.stderr, 'isTTY', { value: false, configurable: true }); - - const stats = new Statistics(); - const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); - - monitor.start(); - vi.advanceTimersByTime(1000); - monitor.stop(); - - const combined = writes.join(''); - // Should not contain ANSI 
cursor-up code - expect(combined).not.toContain('\x1b[5A'); - expect(combined).not.toContain('\x1b[2K'); - }); - - test('in TTY mode, second render writes ANSI cursor-up to overwrite', () => { - const writes: string[] = []; - vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { - writes.push(String(chunk)); - return true; - }); - Object.defineProperty(process.stderr, 'isTTY', { value: true, configurable: true }); - - const stats = new Statistics(); - const monitor = new Monitor(stats, undefined, { intervalSecs: 1 }); - - monitor.start(); - vi.advanceTimersByTime(1000); // first render - vi.advanceTimersByTime(1000); // second render — should have cursor-up - monitor.stop(); - - const combined = writes.join(''); - expect(combined).toContain('\x1b[5A'); - }); -}); -``` - -- [ ] **Step 2: Run the tests to verify they FAIL (Monitor doesn't exist yet relative to test path)** - -```bash -cd "$(git rev-parse --show-toplevel)" && yarn vitest run test/core/crawlers/monitor.test.ts 2>&1 | tail -20 -``` - -Expected: tests fail because `Monitor` import path may need adjustment, or type errors. - -> **Note:** If the import `from '../../../packages/core/src/crawlers/monitor'` resolves correctly (check tsconfig paths in `test/tsconfig.json`), the tests may pass after Task 1. If not, adjust the import to `from '@crawlee/core'` after the build. - -- [ ] **Step 3: Check test tsconfig to see how other core internals are imported in tests** - -```bash -cat test/core/crawlers/statistics.test.ts | head -5 -``` - -If statistics is imported from `'@crawlee/core'`, change the monitor import similarly: - -```typescript -import { Monitor } from '@crawlee/core'; -``` - -Then re-run: - -```bash -yarn vitest run test/core/crawlers/monitor.test.ts 2>&1 | tail -20 -``` - -Expected: tests PASS (after Task 1 and Task 2 are done). 
- -- [ ] **Step 4: Commit** - -```bash -git add test/core/crawlers/monitor.test.ts -git commit -m "test: add unit tests for Monitor class" -``` - ---- - -## Task 4: Integrate `Monitor` into `BasicCrawler` - -**Files:** -- Modify: `packages/basic-crawler/src/internals/basic-crawler.ts` - -### Background - -The `run()` function is around line 979 in `basic-crawler.ts`. The structure is: - -```typescript -async run(...) { - // ...setup... - await this._init(); - await this.stats.startCapturing(); - const periodicLogger = this.getPeriodicLogger(); - // ... - try { - await this.autoscaledPool!.run(); - } finally { - await this.teardown(); - // ... - periodicLogger.stop(); - // ... - } -} -``` - -`this.autoscaledPool` is assigned inside `this._init()`, so it's available after `_init()`. - -`this.requestManager` is also available after `_init()`. - -- [ ] **Step 1: Add `monitor` to imports from `@crawlee/core`** - -In `packages/basic-crawler/src/internals/basic-crawler.ts`, find the import block from `@crawlee/core` (around line 31). Add `Monitor` and `MonitorOptions` to it: - -```typescript -import { - AutoscaledPool, - Configuration, - CriticalError, - Dataset, - enqueueLinks, - EnqueueStrategy, - EventType, - GotScrapingHttpClient, - KeyValueStore, - mergeCookies, - Monitor, - NonRetryableError, - purgeDefaultStorages, - RequestListAdapter, - RequestManagerTandem, - RequestProvider, - RequestQueue, - // ... rest of existing imports -} from '@crawlee/core'; -``` - -- [ ] **Step 2: Add `monitor` option to `BasicCrawlerOptions` interface** - -Find the `BasicCrawlerOptions` interface. It ends around the `statisticsOptions` and `httpClient` properties. Add after `httpClient`: - -```typescript -/** - * Enables monitor mode: a compact real-time status block printed to `process.stderr` during the crawl. - * - * In interactive terminals (TTY), the block overwrites itself in-place. - * In non-TTY environments (CI, piped output), plain lines are printed instead. 
- * - * @default false - * @example - * ```ts - * const crawler = new BasicCrawler({ monitor: true }); - * ``` - */ -monitor?: boolean; -``` - -- [ ] **Step 3: Store `monitor` option in the constructor and add `ow` validation** - -Find the `ow` validation block in the constructor (around line 590–630). Add: - -```typescript -monitor: ow.optional.boolean, -``` - -Find the destructuring of constructor options (around line 637–700). Add `monitor = false`: - -```typescript -const { - // ... existing destructuring ... - monitor = false, -} = options; -``` - -Add a protected field on the class (near other protected fields around line 566): - -```typescript -protected monitorEnabled: boolean; -``` - -And in the constructor body, assign it: - -```typescript -this.monitorEnabled = monitor; -``` - -- [ ] **Step 4: Instantiate and run `Monitor` inside `run()`** - -Find the `run()` method. **Check how `getPeriodicLogger()` is called** — it returns an object with a `stop()` method. When `monitor: true`, the periodic logger must be silenced so both do not write to stderr simultaneously. Do this by checking if `BasicCrawlerOptions` already has a `statusMessageLoggingInterval` option; if `monitor` is true, pass `statusMessageLoggingInterval: 0` (effectively disabling periodic status log messages) to the periodic logger or set the logging to `Number.POSITIVE_INFINITY` to suppress it. - -Concretely, after `await this._init();` and `await this.stats.startCapturing();`, replace the existing `const periodicLogger = this.getPeriodicLogger();` line with: - -```typescript -const periodicLogger = this.getPeriodicLogger(); -const monitorInstance = this.monitorEnabled - ? 
new Monitor( - this.stats, - this.autoscaledPool, - { intervalSecs: 5 }, - () => this.requestManager?.getTotalCount(), - ) - : null; -// When monitor is active, suppress the periodic status logger (ISSUE-2 fix) -if (this.monitorEnabled) { - periodicLogger.stop(); -} -monitorInstance?.start(); -``` - -In the `finally` block, **as the very first statement** (ISSUE-3 fix — before `await this.teardown()` and before any final logging), add: - -```typescript -// Stop monitor first so its ANSI block is cleared before any teardown logs -monitorInstance?.stop(); -``` - -Then resume with the existing teardown logic. - -- [ ] **Step 5: Run TypeScript check** - -```bash -cd "$(git rev-parse --show-toplevel)" && yarn tsc-check-tests 2>&1 | head -40 -``` - -Expected: no new errors. - -- [ ] **Step 6: Commit** - -```bash -git add packages/basic-crawler/src/internals/basic-crawler.ts -git commit -m "feat: add monitor option to BasicCrawler" -``` - ---- - -## Task 5: Add integration tests to `basic_crawler.test.ts` - -**Files:** -- Modify: `test/core/crawlers/basic_crawler.test.ts` - -- [ ] **Step 1: Find a good `describe` block to add the new tests** - -The file has a top-level `describe('BasicCrawler', ...)`. Add a new nested `describe` block at the end (before the closing `}`), after all existing `describe` blocks. - -- [ ] **Step 2: Add the integration tests** - -Add this block inside `describe('BasicCrawler', ...)`. 
**These tests spy on `process.stderr.write` to verify actual monitor output is produced (ISSUE-4 fix).** - -```typescript -describe('monitor option', () => { - test('crawler with monitor: true writes to stderr during run', async () => { - const writes: string[] = []; - const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { - writes.push(String(chunk)); - return true; - }); - - const requestList = await RequestList.open(null, [ - `http://${HOSTNAME}:${port}/`, - ]); - - const crawler = new BasicCrawler({ - requestList, - monitor: true, - async requestHandler() { - // no-op - }, - }); - - const stats = await crawler.run(); - - writeStub.mockRestore(); - - expect(stats.requestsFinished).toBe(1); - expect(stats.requestsFailed).toBe(0); - // Monitor must have written at least one 5-line block containing known marker - const combined = writes.join(''); - expect(combined).toContain('Progress:'); // ISSUE-4 fix: assert monitor-specific marker, not just non-empty - }); - - test('crawler with monitor: false does not write monitor output to stderr', async () => { - const writes: string[] = []; - const writeStub = vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { - writes.push(String(chunk)); - return true; - }); - - const requestList = await RequestList.open(null, [ - `http://${HOSTNAME}:${port}/`, - ]); - - const crawler = new BasicCrawler({ - requestList, - monitor: false, - async requestHandler() { - // no-op - }, - }); - - const stats = await crawler.run(); - - writeStub.mockRestore(); - - expect(stats.requestsFinished).toBe(1); - expect(stats.requestsFailed).toBe(0); - // No monitor block: stderr writes should be empty or contain no progress line - const combined = writes.join(''); - expect(combined).not.toContain('Progress:'); - }); -}); -``` - -> **Note:** The `HOSTNAME`, `port`, and `server` variables are already defined in the outer `describe('BasicCrawler', ...)` scope, set up in `beforeAll`. 
The URL `http://${HOSTNAME}:${port}/` returns a valid response (`app.get('/', ...)` is already defined near the top of the file). - -- [ ] **Step 3: Run the integration tests** - -```bash -cd "$(git rev-parse --show-toplevel)" && yarn vitest run test/core/crawlers/basic_crawler.test.ts 2>&1 | tail -30 -``` - -Expected: all tests pass (including the new ones). - -- [ ] **Step 4: Run the full unit test suite for monitor** - -```bash -cd "$(git rev-parse --show-toplevel)" && yarn vitest run test/core/crawlers/monitor.test.ts 2>&1 | tail -20 -``` - -Expected: all tests pass. - -- [ ] **Step 5: Run the full TypeScript check one last time** - -```bash -cd "$(git rev-parse --show-toplevel)" && yarn tsc-check-tests 2>&1 | head -40 -``` - -Expected: no errors. - -- [ ] **Step 6: Commit** - -```bash -git add test/core/crawlers/basic_crawler.test.ts -git commit -m "test: add integration tests for BasicCrawler monitor option" -``` - ---- - -## Self-Review Checklist - -### Spec coverage - -| Spec requirement | Task that covers it | -|---|---| -| New `Monitor` class in `packages/core/src/crawlers/monitor.ts` | Task 1 | -| Reads `Statistics` for progress/speed | Task 1 — `stats.state` + `stats.calculate()` | -| Shows start time, elapsed, progress, ETA, CPU, mem, concurrency | Task 1 — `buildLines()` | -| Writes to `process.stderr` | Task 1 — `render()` uses `process.stderr.write` | -| TTY: in-place overwrite with ANSI codes | Task 1 — `render()` | -| Non-TTY: plain newline fallback | Task 1 — `render()` checks `isTTY` | -| Export from `@crawlee/core` | Task 2 | -| `monitor?: boolean` option on `BasicCrawlerOptions` | Task 4 Step 2 | -| Instantiated in `run()` after `_init()` | Task 4 Step 4 | -| Initial render on `start()` for short crawls | Task 1 — `start()` calls `render()` immediately (ISSUE-1) | -| `stop()` at very start of `finally`, before teardown logs | Task 4 Step 4 (ISSUE-3) | -| Periodic logger suppressed when monitor active | Task 4 Step 4 (ISSUE-2) | -| `total = 0` 
handled — shows `N/A%` not `NaN` | Task 1 — `buildLines()` (ISSUE-5) | -| Integration tests verify stderr output, not just stats | Task 5 — `vi.spyOn(process.stderr.write)` (ISSUE-4) | -| Unit tests for `Monitor` | Task 3 | - -All requirements covered. ✅ - -### Placeholder scan - -No TBD/TODO or vague instructions. All code steps contain complete implementations. ✅ - -### Type consistency - -- `Monitor` constructor signature defined in Task 1 and referenced in Task 4 — parameters match (`stats`, `autoscaledPool`, `options`, `totalRequests`). -- `buildLines()` defined in Task 1 and tested in Task 3 — name matches. -- `MonitorOptions.intervalSecs` defined in Task 1, used in Task 4 — consistent. -- `autoscaledPool.currentConcurrency`, `.desiredConcurrency`, `.maxConcurrency` — verified as public getters from codebase exploration. ✅ diff --git a/docs/superpowers/specs/2026-04-04-monitor-mode-design.md b/docs/superpowers/specs/2026-04-04-monitor-mode-design.md deleted file mode 100644 index d90c162df1ca..000000000000 --- a/docs/superpowers/specs/2026-04-04-monitor-mode-design.md +++ /dev/null @@ -1,155 +0,0 @@ -# Monitor Mode for BasicCrawler — Design Spec - -**Date:** 2026-04-04 -**Issue:** [#2680](https://github.com/apify/crawlee/issues/2680) -**Related PR:** [#2692](https://github.com/apify/crawlee/pull/2692) (reference only — implementation is fresh) - ---- - -## Problem - -When running a crawler locally, there is no real-time progress overview. Developers have to read scattered log lines to understand how fast the crawl is going, how much is left, and what the system load looks like. The `puppeteer-cluster` library had a monitor feature that was widely used and is missed after migrating to Crawlee. - ---- - -## Goal - -Add an opt-in `monitor` option to `BasicCrawler` that prints a compact, real-time status block to the terminal while crawling. It must not interfere with the existing logger output. 
- ---- - -## Architecture - -### New file: `packages/core/src/crawlers/monitor.ts` - -A standalone `Monitor` class. It receives a `Statistics` instance and an optional `AutoscaledPool` instance, then on a configurable interval renders a status block to `process.stderr`. - -Using `process.stderr` keeps it separate from the `@apify/log` output, which writes to `process.stdout` by default. This prevents the monitor from overwriting log lines. - -When `process.stderr.isTTY` is `true` (interactive terminal), the monitor uses ANSI escape codes (`\x1b[{N}A\x1b[2K`) to overwrite its own previous output in-place. When not a TTY (CI, piped output), it falls back to plain newline-delimited prints so the output stays readable in logs. - -**Class interface:** - -```ts -export interface MonitorOptions { - /** How often to refresh the monitor display. Default: 5 seconds. */ - intervalSecs?: number; -} - -export class Monitor { - constructor( - private readonly stats: Statistics, - private readonly autoscaledPool?: AutoscaledPool, - private readonly options: MonitorOptions = {}, - ) {} - - start(): void; // starts setInterval - stop(): void; // clears interval, erases last monitor block from terminal -} -``` - -**Rendered output format** (5 lines): - -``` -⏱ Start: 2024-01-01 10:00:00 | Running for 00:03:24 -📊 Progress: 145/500 (29.0%) | Failed: 3 (2.1%) | Speed: 42 req/min -⏳ ETA: ~00:08:27 -💻 CPU: 34% | Mem: 512 MB / 1.8 GB -🔀 Concurrency: 8/10 (desired: 10) -``` - -- **Total** is read from `requestManager.getTotalCount()` passed in via constructor (optional — shown as `?` when unknown) -- **Speed** is `requestsFinishedPerMinute` from `stats.calculate()` -- **ETA** is `(total - finished) / speed` in minutes, formatted as `HH:MM:SS`; shows `N/A` when total is unknown -- **CPU/Mem** is read from `autoscaledPool.systemStatus.getCurrentStatus()` when pool is available; shows `N/A` otherwise -- **Concurrency** reads `autoscaledPool.currentConcurrency` and 
`autoscaledPool.desiredConcurrency` - ---- - -### Changes to `packages/basic-crawler/src/internals/basic-crawler.ts` - -**1. Add option to `BasicCrawlerOptions`:** - -```ts -/** - * Enables monitor mode: a real-time status block printed to stderr during the crawl. - * Only active when stderr is a TTY or when output is plain (CI-friendly fallback). - * @default false - */ -monitor?: boolean; -``` - -This is a top-level option, not inside `experiments`. The feature is stable enough to warrant a direct option. - -**2. Store it on the crawler:** - -```ts -protected monitor: boolean; -// in constructor: -this.monitor = options.monitor ?? false; -``` - -**3. In `run()`**, alongside the existing `periodicLogger`: - -```ts -const monitorInstance = this.monitor - ? new Monitor(this.stats, this.autoscaledPool, { intervalSecs: 5 }) - : null; -monitorInstance?.start(); - -try { - await this.autoscaledPool!.run(); -} finally { - monitorInstance?.stop(); - // ... existing teardown -} -``` - -**4. Export `Monitor` from `packages/core/src/crawlers/index.ts` and `packages/core/src/index.ts`.** - ---- - -## Testing Strategy - -### Unit tests — `test/core/crawlers/monitor.test.ts` - -| Test | What it checks | -|---|---| -| Constructs without throwing | Basic instantiation | -| `start()` + `stop()` without error | Lifecycle works | -| Renders correct output with known stats | Format string correctness | -| Non-TTY mode prints plain lines (no ANSI) | CI-safe fallback | -| TTY mode uses ANSI overwrite codes | In-place refresh | -| ETA shows `N/A` when total is unknown | Edge case | -| Stop clears the interval | No memory leak | - -Use `vitest.useFakeTimers()` to control the interval without real waiting. -Mock `process.stderr` with a writable stub to capture output without printing to real terminal. 
- -### Integration tests — added to `test/core/crawlers/basic_crawler.test.ts` - -| Test | What it checks | -|---|---| -| Crawler with `monitor: true` completes successfully | No crash, correct final stats returned | -| Crawler with `monitor: false` behaves identically | Option is inert when disabled | - ---- - -## Non-goals - -- No interactive keyboard controls (pause/resume via keypress) — out of scope -- No color themes or custom format strings — keep it simple for v1 -- No new npm dependencies — implement with Node.js built-ins only - ---- - -## Files Changed - -| File | Change | -|---|---| -| `packages/core/src/crawlers/monitor.ts` | **New** — `Monitor` class | -| `packages/core/src/crawlers/index.ts` | Export `Monitor` | -| `packages/core/src/index.ts` | Re-export `Monitor` | -| `packages/basic-crawler/src/internals/basic-crawler.ts` | Add `monitor` option, instantiate `Monitor` in `run()` | -| `test/core/crawlers/monitor.test.ts` | **New** — unit tests | -| `test/core/crawlers/basic_crawler.test.ts` | Add integration tests | diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 1dcdd5d2fb00..f6c37edc1be5 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -40,6 +40,7 @@ import { KeyValueStore, mergeCookies, Monitor, + type MonitorOptions, NonRetryableError, purgeDefaultStorages, RequestListAdapter, @@ -420,6 +421,11 @@ export interface BasicCrawlerOptions(); private experiments: CrawlerExperiments; @@ -629,6 +636,7 @@ export class BasicCrawler { + // When monitor mode is active, it owns the display — skip the periodic log to avoid + // interleaving plain log lines with ANSI cursor-movement sequences. 
+ if (this.monitorEnabled) return; + const { mode: operationMode, failedDelta } = getOperationMode(); let message: string; @@ -1058,7 +1072,7 @@ export class BasicCrawler this.requestManager?.getTotalCount()) + ? new Monitor(this.stats, this.autoscaledPool, this.monitorOptions, () => this.requestManager?.getTotalCount()) : null; monitor?.start(); diff --git a/packages/core/src/crawlers/monitor.ts b/packages/core/src/crawlers/monitor.ts index e371c5267f4d..423c67f72e59 100644 --- a/packages/core/src/crawlers/monitor.ts +++ b/packages/core/src/crawlers/monitor.ts @@ -60,6 +60,7 @@ export class Monitor { start(): void { this.render(); // render immediately so short crawls always show output this.intervalId = setInterval(() => this.render(), this.intervalMs); + this.intervalId.unref(); // don't prevent process exit if the event loop would otherwise be empty } /** Stops the periodic display and clears the last rendered block from the terminal. */ @@ -108,7 +109,8 @@ export class Monitor { let etaStr = 'N/A'; if (total != null && total > 0 && speed > 0) { - const remaining = total - finished; + // Use Math.max to guard against negative remaining (e.g. when total is an approximate count) + const remaining = Math.max(0, total - finished); const etaMs = (remaining / speed) * 60 * 1000; etaStr = `~${formatDuration(etaMs)}`; } @@ -118,7 +120,10 @@ export class Monitor { const usedMem = totalMem - os.freemem(); const cpus = os.cpus(); const cpuLoad = os.loadavg()[0]; - const cpuPct = cpus.length > 0 ? Math.min(100, (cpuLoad / cpus.length) * 100).toFixed(0) : '?'; + // os.loadavg() always returns [0,0,0] on Windows — show N/A to avoid misleading output. + const cpuPct = process.platform === 'win32' + ? 'N/A' + : cpus.length > 0 ? Math.min(100, (cpuLoad / cpus.length) * 100).toFixed(0) : '?'; const concurrency = this.autoscaledPool ? 
`${this.autoscaledPool.currentConcurrency}/${this.autoscaledPool.maxConcurrency} (desired: ${this.autoscaledPool.desiredConcurrency})` diff --git a/test/core/crawlers/monitor.test.ts b/test/core/crawlers/monitor.test.ts index f3f1449bced8..324388360c09 100644 --- a/test/core/crawlers/monitor.test.ts +++ b/test/core/crawlers/monitor.test.ts @@ -152,4 +152,58 @@ describe('Monitor', () => { const combined = writes.join(''); expect(combined).toContain('\x1b[5A'); }); + + test('ETA is never negative when finished > total (approximate count)', () => { + const stats = new Statistics(); + // Simulate 11 finished but total = 10 (approximate) + for (let i = 0; i < 11; i++) { + stats.startJob(`r${i}`); + stats.finishJob(`r${i}`, 0); + } + + const monitor = new Monitor(stats, undefined, {}, () => 10); + const lines = monitor.buildLines(); + + // ETA should be ~00:00:00 (zero remaining), never a negative duration + expect(lines[2]).not.toMatch(/~-/); + expect(lines[2]).toContain('~00:00:00'); + }); + + test('monitorOptions.intervalSecs controls the refresh interval', () => { + const writes: string[] = []; + vi.spyOn(process.stderr, 'write').mockImplementation((chunk: any) => { + writes.push(String(chunk)); + return true; + }); + Object.defineProperty(process.stderr, 'isTTY', { value: false, configurable: true }); + + const stats = new Statistics(); + const monitor = new Monitor(stats, undefined, { intervalSecs: 3 }); + + monitor.start(); + // At 2 s: only the immediate render from start() — interval has not fired yet + vi.advanceTimersByTime(2000); + const writeCountBefore = writes.length; + + // At 3 s: interval fires once + vi.advanceTimersByTime(1000); + monitor.stop(); + + // After 3 s total, the interval should have fired exactly once, adding 5 more lines + expect(writes.length).toBe(writeCountBefore + 5); + }); + + test('CPU line shows N/A on Windows', () => { + const originalPlatform = process.platform; + Object.defineProperty(process, 'platform', { value: 'win32', 
configurable: true }); + vi.spyOn(os, 'loadavg').mockReturnValue([0, 0, 0]); + + const stats = new Statistics(); + const monitor = new Monitor(stats); + const lines = monitor.buildLines(); + + Object.defineProperty(process, 'platform', { value: originalPlatform, configurable: true }); + + expect(lines[3]).toContain('CPU: N/A'); + }); });