@@ -64,6 +64,7 @@ export interface DbClient {
6464 getDocument ( id : string ) : Promise < DocumentRecord | null > ;
6565 getDocumentByUrl ( url : string ) : Promise < DocumentRecord | null > ;
6666 getUploadedUrls ( ) : Promise < Set < string > > ;
67+ getFailedUrls ( ) : Promise < Set < string > > ;
6768 getPendingDocuments ( limit : number ) : Promise < DocumentRecord [ ] > ;
6869 getDocumentsByStatus ( status : DocumentStatus , limit ?: number ) : Promise < DocumentRecord [ ] > ;
6970 getStats ( ) : Promise < { status : string ; count : number } [ ] > ;
@@ -157,6 +158,13 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
157158 return new Set ( rows . map ( ( r ) => r . source_url ) ) ;
158159 } ,
159160
/**
 * Collects the source URLs of every document whose status is 'failed'.
 *
 * @returns A set of `source_url` values taken from the matching rows;
 *   rows sharing the same URL collapse into a single entry.
 */
async getFailedUrls(): Promise<Set<string>> {
  const failedRows = await sql<{ source_url: string }[]>`
    SELECT source_url FROM documents WHERE status = 'failed'
  `;
  return new Set(failedRows.map((row) => row.source_url));
},
167+
160168 async getPendingDocuments ( limit : number ) {
161169 return sql < DocumentRecord [ ] > `
162170 SELECT * FROM documents WHERE status = 'pending' LIMIT ${ limit }
@@ -207,11 +215,12 @@ export async function createDb(databaseUrl: string): Promise<DbClient> {
207215 } ,
208216
209217 async getUnextractedDocuments ( limit : number ) {
210- // Get docs that haven't been extracted yet (includes failed ones for retry )
218+ // Get docs that haven't been extracted yet (excludes previously failed ones)
211219 return sql < DocumentRecord [ ] > `
212220 SELECT * FROM documents
213221 WHERE status = 'uploaded'
214222 AND extracted_at IS NULL
223+ AND extraction_error IS NULL
215224 ORDER BY uploaded_at ASC
216225 LIMIT ${ limit }
217226 ` ;
0 commit comments