Skip to content

Commit e652c9e

Browse files
committed
feat: upgraded terminal ui
1 parent f109edb commit e652c9e

3 files changed

Lines changed: 79 additions & 23 deletions

File tree

apps/scraper/commoncrawl/warc.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export interface FetchOptions {
1818
maxRetries?: number;
1919
maxBackoffMs?: number;
2020
rateLimiter?: RateLimiter;
21+
onError?: (status: number, url: string, message: string) => void;
2122
}
2223

2324
class HttpError extends Error {
@@ -37,7 +38,7 @@ export async function fetchWarcRecord(
3738
record: CdxRecord,
3839
options: FetchOptions = {},
3940
): Promise<WarcResult> {
40-
const { timeoutMs = 45000, maxRetries = 10, maxBackoffMs = 60000, rateLimiter } = options;
41+
const { timeoutMs = 45000, maxRetries = 10, maxBackoffMs = 60000, rateLimiter, onError } = options;
4142

4243
const offset = parseInt(record.offset, 10);
4344
const length = parseInt(record.length, 10);
@@ -71,6 +72,8 @@ export async function fetchWarcRecord(
7172
// Handle rate limiting errors with retry
7273
if (response.status === 503 || response.status === 429) {
7374
rateLimiter?.reportError(response.status);
75+
const statusText = response.status === 429 ? "Too Many Requests" : "Service Unavailable";
76+
onError?.(response.status, record.url, `${response.status} ${statusText} - backing off`);
7477
if (attempt < maxRetries) {
7578
const baseDelay = Math.min(2 ** attempt * 1000, maxBackoffMs);
7679
const jitter = Math.random() * 0.3 * baseDelay; // 0-30% jitter
@@ -85,6 +88,7 @@ export async function fetchWarcRecord(
8588
}
8689

8790
if (!response.ok && response.status !== 206) {
91+
onError?.(response.status, record.url, `${response.status} ${response.statusText}`);
8892
throw new HttpError(
8993
response.status,
9094
`WARC fetch failed: ${response.status} ${response.statusText}`,

apps/scraper/scraper.ts

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ import {
1212
blank,
1313
clearLines,
1414
formatDuration,
15+
formatProgress,
1516
header,
1617
keyValue,
17-
progressBar,
18+
logError,
1819
section,
1920
writeMultiLineProgress,
2021
} from "./ui";
@@ -28,10 +29,11 @@ interface ProcessContext {
2829
stats: { saved: number; skipped: number; failed: number };
2930
rateLimiter: RateLimiter;
3031
force?: boolean;
32+
onError?: (status: number, url: string, message: string) => void;
3133
}
3234

3335
async function processRecord(record: CdxRecord, ctx: ProcessContext) {
34-
const { db, storage, config, crawlId, stats, rateLimiter, force } = ctx;
36+
const { db, storage, config, crawlId, stats, rateLimiter, force, onError } = ctx;
3537

3638
// Check if already processed (skip if --force)
3739
if (!force) {
@@ -48,6 +50,7 @@ async function processRecord(record: CdxRecord, ctx: ProcessContext) {
4850
result = await fetchWarcRecord(record, {
4951
timeoutMs: config.crawl.timeoutMs,
5052
rateLimiter,
53+
onError,
5154
});
5255
} catch (err) {
5356
stats.failed++;
@@ -169,40 +172,44 @@ export async function scrape(
169172
// Track throughput
170173
let lastThroughputUpdate = Date.now();
171174
let docsAtLastUpdate = 0;
175+
let currentDocsPerSec = 0;
172176

173177
// Track line count for clearing
174-
let prevLineCount = 1;
178+
let prevLineCount = 2;
179+
180+
// Error logging for verbose mode
181+
const onError = verbose
182+
? (_status: number, url: string, message: string) => {
183+
// Clear progress lines, log error, then redraw progress
184+
clearLines(prevLineCount);
185+
logError(`${message} - ${url}`);
186+
prevLineCount = 0; // Reset so next update draws fresh
187+
}
188+
: undefined;
175189

176190
// Progress update function
177191
const updateProgress = () => {
178-
const lines: string[] = [];
179-
180192
// Calculate docs/sec
181193
const now = Date.now();
182194
const elapsed = (now - lastThroughputUpdate) / 1000;
183-
let docsPerSec = 0;
184195
if (elapsed >= 1) {
185-
docsPerSec = (stats.saved - docsAtLastUpdate) / elapsed;
196+
currentDocsPerSec = (stats.saved - docsAtLastUpdate) / elapsed;
186197
lastThroughputUpdate = now;
187198
docsAtLastUpdate = stats.saved;
188199
}
189200

190201
const { errorCount } = rateLimiter.getStats();
191202

192-
const extras: string[] = [];
193-
if (docsPerSec > 0) extras.push(`${docsPerSec.toFixed(1)}/s`);
194-
if (stats.skipped > 0) extras.push(`${stats.skipped} dup`);
195-
if (stats.failed > 0) extras.push(`${stats.failed} fail`);
196-
if (errorCount > 0) extras.push(`${errorCount} retried`);
197-
const extrasText = extras.length > 0 ? ` (${extras.join(" · ")})` : "";
198-
199-
if (batchSize === Infinity) {
200-
lines.push(` WARC: ${stats.saved} saved${extrasText}`);
201-
} else {
202-
const savedDisplay = Math.min(stats.saved, batchSize);
203-
const warcBar = progressBar(savedDisplay, batchSize);
204-
lines.push(` WARC: ${warcBar} ${savedDisplay}/${batchSize} saved${extrasText}`);
205-
}
203+
const lines = formatProgress({
204+
saved: Math.min(stats.saved, batchSize),
205+
total: batchSize,
206+
docsPerSec: currentDocsPerSec,
207+
currentRps: rateLimiter.getCurrentRps(),
208+
skipped: stats.skipped,
209+
failed: stats.failed,
210+
retried: errorCount,
211+
elapsedMs: Date.now() - startTime,
212+
});
206213

207214
prevLineCount = writeMultiLineProgress(lines, prevLineCount);
208215
};
@@ -229,8 +236,9 @@ export async function scrape(
229236
config,
230237
crawlId,
231238
stats,
232-
rateLimiter: rateLimiter,
239+
rateLimiter,
233240
force,
241+
onError,
234242
});
235243
updateProgress();
236244
}).finally(() => tasks.delete(task));

apps/scraper/ui.ts

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,47 @@ export function formatDuration(ms: number): string {
5252
}
5353
return `${seconds}s`;
5454
}
55+
56+
/**
 * Snapshot of counters that formatProgress turns into display lines.
 */
export interface ProgressStats {
57+
// Count displayed as the "saved" figure and used as the progress numerator.
saved: number;
58+
// Target count; formatProgress treats Infinity as "no known total".
total: number;
59+
// Throughput value, rendered with one decimal place followed by "/s".
docsPerSec: number;
60+
// Value rendered verbatim in front of "RPS" on the metrics line.
currentRps: number;
61+
// Shown as "<n> dup" only when greater than zero.
skipped: number;
62+
// Shown as "<n> fail" only when greater than zero.
failed: number;
63+
// Shown as "<n> retried" only when greater than zero.
retried: number;
64+
// Elapsed time in milliseconds; handed to formatDuration for display.
elapsedMs: number;
65+
}
66+
67+
/**
 * Builds the text lines of the progress display from a stats snapshot.
 *
 * @param stats - Counters and timings to render.
 * @returns Exactly two strings: the progress line, then the metrics line
 *   whose entries are joined with " · ".
 */
export function formatProgress(stats: ProgressStats): string[] {
68+
const { saved, total, docsPerSec, currentRps, skipped, failed, retried, elapsedMs } = stats;
69+
70+
const lines: string[] = [];
71+
72+
// Line 1: Progress bar with count and percentage
73+
// With an unbounded total there is no ratio to draw, so a fixed rule and the
// raw saved count are shown instead.
if (total === Infinity) {
74+
lines.push(`━━━━━━━━━━━━━━━━━━━━ ${saved} saved`);
75+
} else {
76+
// progressBar is defined outside this view; presumably it renders a bar
// for saved out of total.
const bar = progressBar(saved, total);
77+
// Fall back to "0.0" when total is not positive to avoid dividing by zero.
const pct = total > 0 ? ((saved / total) * 100).toFixed(1) : "0.0";
78+
lines.push(`${bar} ${saved}/${total} (${pct}%)`);
79+
}
80+
81+
// Line 2: Metrics
82+
const metrics: string[] = [];
83+
// Throughput is always shown; the dup/fail/retried entries only appear when
// their counter is non-zero.
metrics.push(`${docsPerSec.toFixed(1)}/s @ ${currentRps} RPS`);
84+
if (skipped > 0) metrics.push(`${skipped} dup`);
85+
if (failed > 0) metrics.push(`${failed} fail`);
86+
if (retried > 0) metrics.push(`${retried} retried`);
87+
// Elapsed time is always the last entry on the metrics line.
metrics.push(formatDuration(elapsedMs));
88+
89+
lines.push(metrics.join(" · "));
90+
91+
return lines;
92+
}
93+
94+
/**
 * Prints a message through console.log, prefixed with the current time in
 * square brackets.
 *
 * @param message - Text to print after the timestamp.
 */
export function logError(message: string) {
95+
const now = new Date();
96+
// en-US locale with hour12 disabled requests a 24-hour clock time string.
const time = now.toLocaleTimeString("en-US", { hour12: false });
97+
console.log(`[${time}] ${message}`);
98+
}

0 commit comments

Comments
 (0)