Skip to content

Commit b753f04

Browse files
committed
feat: skip previously failed documents when scraping and extracting
1 parent 04fc3b1 commit b753f04

3 files changed

Lines changed: 15 additions & 4 deletions

File tree

packages/extractor/processor.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ export async function processDirectory(
5656
console.log(` Output: ${outputPrefix}/{hash}.txt`);
5757
}
5858

59-
const EXTRACTION_TIMEOUT_MS = 120_000; // 2 minutes per document
59+
const EXTRACTION_TIMEOUT_MS = 30_000; // 30 seconds per document
6060

6161
async function extractWithPython(
6262
doc: DocumentRecord,

packages/scraper/scraper.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,10 @@ export async function scrape(options: ScrapeOptions) {
164164
// Initialize database
165165
const db = await createDb(config.database.url);
166166

167-
// Pre-load successful URLs for fast duplicate checking
167+
// Pre-load processed URLs for fast duplicate checking (includes both uploaded and failed)
168168
const uploadedUrls = force ? new Set<string>() : await db.getUploadedUrls();
169+
const failedUrls = force ? new Set<string>() : await db.getFailedUrls();
170+
const processedUrls = new Set([...uploadedUrls, ...failedUrls]);
169171

170172
// Aggregate stats across all crawls
171173
const totalStats = { saved: 0, skipped: 0, failed: 0 };
@@ -247,7 +249,7 @@ export async function scrape(options: ScrapeOptions) {
247249

248250
const task = downloadLimit(async () => {
249251
// Check duplicates BEFORE rate limiting (instant skip)
250-
if (!force && uploadedUrls.has(record.url)) {
252+
if (!force && processedUrls.has(record.url)) {
251253
stats.skipped++;
252254
updateProgress();
253255
return;

packages/shared/db.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ export interface DbClient {
6464
getDocument(id: string): Promise<DocumentRecord | null>;
6565
getDocumentByUrl(url: string): Promise<DocumentRecord | null>;
6666
getUploadedUrls(): Promise<Set<string>>;
67+
getFailedUrls(): Promise<Set<string>>;
6768
getPendingDocuments(limit: number): Promise<DocumentRecord[]>;
6869
getDocumentsByStatus(status: DocumentStatus, limit?: number): Promise<DocumentRecord[]>;
6970
getStats(): Promise<{ status: string; count: number }[]>;
@@ -157,6 +158,13 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
157158
return new Set(rows.map((r) => r.source_url));
158159
},
159160

161+
async getFailedUrls() {
162+
const rows = await sql<{ source_url: string }[]>`
163+
SELECT source_url FROM documents WHERE status = 'failed'
164+
`;
165+
return new Set(rows.map((r) => r.source_url));
166+
},
167+
160168
async getPendingDocuments(limit: number) {
161169
return sql<DocumentRecord[]>`
162170
SELECT * FROM documents WHERE status = 'pending' LIMIT ${limit}
@@ -207,11 +215,12 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
207215
},
208216

209217
async getUnextractedDocuments(limit: number) {
210-
// Get docs that haven't been extracted yet (includes failed ones for retry)
218+
// Get uploaded docs that haven't been extracted yet; docs with a non-NULL extraction_error are excluded, so they are not returned for retry
211219
return sql<DocumentRecord[]>`
212220
SELECT * FROM documents
213221
WHERE status = 'uploaded'
214222
AND extracted_at IS NULL
223+
AND extraction_error IS NULL
215224
ORDER BY uploaded_at ASC
216225
LIMIT ${limit}
217226
`;

0 commit comments

Comments
 (0)