Skip to content

Commit f96bf43

Browse files
committed
feat: mimic scraper file structure
1 parent 92b0a18 commit f96bf43

5 files changed

Lines changed: 96 additions & 137 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Dependencies
22
node_modules/
3+
uv.lock
34

45
# Build output
56
dist/

apps/cli/commands/extract.ts

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,11 @@ import { createLocalStorage, createR2Storage } from "@docx-corpus/shared";
99
interface ParsedFlags {
1010
batchSize?: number;
1111
workers?: number;
12-
resume: boolean;
1312
verbose: boolean;
1413
}
1514

1615
function parseFlags(args: string[]): ParsedFlags {
1716
const flags: ParsedFlags = {
18-
resume: false,
1917
verbose: false,
2018
};
2119

@@ -34,10 +32,6 @@ function parseFlags(args: string[]): ParsedFlags {
3432
flags.workers = parseInt(next || "", 10);
3533
i++;
3634
break;
37-
case "--resume":
38-
case "-r":
39-
flags.resume = true;
40-
break;
4135
case "--verbose":
4236
case "-v":
4337
flags.verbose = true;
@@ -58,10 +52,11 @@ Storage is auto-selected based on environment:
5852
- With R2 credentials: reads from r2://documents/, writes to r2://extracted/
5953
- Without R2 credentials: reads from ./corpus/documents/, writes to ./corpus/extracted/
6054
55+
Already-extracted files are automatically skipped (tracked in index.jsonl).
56+
6157
Options
6258
--batch-size, -b <n> Number of files per batch (default: from EXTRACT_BATCH_SIZE or 100)
6359
--workers, -w <n> Number of parallel workers (default: from EXTRACT_WORKERS or 4)
64-
--resume, -r Resume from last checkpoint
6560
--verbose, -v Show detailed progress
6661
--help, -h Show this help
6762
@@ -77,8 +72,8 @@ Environment Variables
7772
EXTRACT_WORKERS Worker count (default: 4)
7873
7974
Examples
80-
corpus extract # Use defaults from env
81-
corpus extract --resume -v # Resume with verbose output
75+
corpus extract # Extract up to batch size
76+
corpus extract -v # With verbose output
8277
corpus extract -b 50 -w 8 # Custom batch/workers
8378
`;
8479

@@ -108,7 +103,6 @@ export async function runExtract(args: string[]) {
108103
outputPrefix: envConfig.extract.outputPrefix,
109104
batchSize: flags.batchSize ?? envConfig.extract.batchSize,
110105
workers: flags.workers ?? envConfig.extract.workers,
111-
resume: flags.resume,
112106
};
113107

114108
console.log("Text Extractor");
@@ -120,7 +114,6 @@ export async function runExtract(args: string[]) {
120114
console.log(`Output: ${config.outputPrefix}/`);
121115
console.log(`Workers: ${config.workers}`);
122116
console.log(`Batch: ${config.batchSize}`);
123-
if (config.resume) console.log("Resume: enabled");
124117
console.log("");
125118

126119
try {

packages/extractor/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
export { processDirectory } from "./processor";
2-
export type { ExtractConfig, ExtractedDocument, ExtractionProgress } from "./types";
2+
export type { ExtractConfig, ExtractedDocument } from "./types";
33
export { loadExtractorConfig, hasCloudflareCredentials, type ExtractorConfig } from "./config";

packages/extractor/processor.ts

Lines changed: 90 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,88 @@
11
import { mkdir, rm } from "node:fs/promises";
22
import { join, basename, dirname } from "node:path";
33
import { tmpdir } from "node:os";
4-
import type { ExtractConfig, ExtractedDocument, ExtractionProgress } from "./types";
4+
import type { ExtractConfig, ExtractedDocument } from "./types";
55

66
const PYTHON_DIR = join(dirname(import.meta.path), "python");
77
const PYTHON_PATH = join(PYTHON_DIR, ".venv", "bin", "python");
88
const SCRIPT_PATH = join(PYTHON_DIR, "extract.py");
99

10-
const PROGRESS_FILE = "progress.json";
10+
const INDEX_FILE = "index.jsonl";
1111
const ERRORS_FILE = "errors.jsonl";
12-
const OUTPUT_FILE = "documents.jsonl";
1312

1413
export async function processDirectory(
1514
config: ExtractConfig,
1615
verbose: boolean = false
1716
): Promise<void> {
18-
const { storage, inputPrefix, outputPrefix } = config;
17+
const { storage, inputPrefix, outputPrefix, batchSize } = config;
1918

20-
// List all .docx files in input prefix
19+
// Load index to get already-processed IDs
20+
const processedIds = await loadProcessedIds(storage, outputPrefix);
21+
22+
if (processedIds.size > 0) {
23+
console.log(`Already extracted: ${processedIds.size} documents`);
24+
}
25+
26+
// List .docx files, filtering out already-processed ones
2127
const files: string[] = [];
28+
29+
console.log(`Scanning ${inputPrefix}/...`);
2230
for await (const key of storage.list(inputPrefix)) {
2331
if (key.toLowerCase().endsWith(".docx") && !basename(key).startsWith("~$")) {
24-
files.push(key);
32+
const id = extractIdFromKey(key);
33+
if (!processedIds.has(id)) {
34+
files.push(key);
35+
if (files.length >= batchSize) {
36+
console.log(`Listed ${files.length} files to process (batch limit)`);
37+
break;
38+
}
39+
}
2540
}
2641
}
2742
files.sort();
2843

2944
if (files.length === 0) {
30-
console.log(`No DOCX files found in ${inputPrefix}`);
45+
console.log(`No new DOCX files to process in ${inputPrefix}`);
3146
return;
3247
}
3348

34-
console.log(`Found ${files.length} DOCX files`);
35-
36-
// Load progress
37-
const progress = await loadProgress(storage, outputPrefix);
38-
const startIndex = config.resume ? progress.processedFiles : 0;
39-
40-
if (config.resume && startIndex > 0) {
41-
console.log(`Resuming from file ${startIndex + 1}`);
42-
}
43-
44-
progress.totalFiles = files.length;
45-
progress.startedAt = progress.startedAt || new Date().toISOString();
49+
console.log(`Found ${files.length} DOCX files to extract`);
4650

4751
// Create temp directory for processing
4852
const tempDir = join(tmpdir(), `docx-extract-${Date.now()}`);
4953
await mkdir(tempDir, { recursive: true });
5054

51-
// Collect output lines in memory, write at end
52-
const outputLines: string[] = [];
53-
54-
// Load existing output if resuming
55-
if (config.resume && startIndex > 0) {
56-
const existingOutput = await storage.read(`${outputPrefix}/${OUTPUT_FILE}`);
57-
if (existingOutput) {
58-
const text = new TextDecoder().decode(existingOutput);
59-
outputLines.push(...text.trim().split("\n").filter(Boolean));
60-
}
61-
}
62-
63-
const batches = chunkArray(files.slice(startIndex), config.batchSize);
64-
let totalProcessed = startIndex;
55+
let successCount = 0;
56+
let errorCount = 0;
6557

6658
try {
67-
for (const batch of batches) {
68-
const results = await processBatch(batch, storage, tempDir, config.workers, verbose);
69-
70-
for (const result of results) {
71-
if (result.success && result.document) {
72-
outputLines.push(JSON.stringify(result.document));
73-
progress.successCount++;
74-
} else {
75-
progress.errorCount++;
76-
await appendError(storage, outputPrefix, result.error || "Unknown error", result.sourceKey);
77-
}
59+
const results = await processBatch(files, storage, tempDir, config.workers, verbose);
60+
61+
for (const result of results) {
62+
if (result.success && result.document) {
63+
// Write text file
64+
await storage.write(
65+
`${outputPrefix}/${result.document.id}.txt`,
66+
result.document.text
67+
);
68+
// Append to index (metadata)
69+
await appendToIndex(storage, outputPrefix, result.document);
70+
successCount++;
71+
} else {
72+
errorCount++;
73+
await appendError(storage, outputPrefix, result.error || "Unknown error", result.sourceKey);
7874
}
79-
80-
totalProcessed += batch.length;
81-
progress.processedFiles = totalProcessed;
82-
progress.lastProcessedKey = batch[batch.length - 1];
83-
progress.updatedAt = new Date().toISOString();
84-
85-
// Save progress and output
86-
await saveProgress(storage, outputPrefix, progress);
87-
await storage.write(
88-
`${outputPrefix}/${OUTPUT_FILE}`,
89-
outputLines.join("\n") + "\n"
90-
);
91-
92-
const percent = ((totalProcessed / files.length) * 100).toFixed(1);
93-
console.log(
94-
`Progress: ${totalProcessed}/${files.length} (${percent}%) - ` +
95-
`Success: ${progress.successCount}, Errors: ${progress.errorCount}`
96-
);
9775
}
76+
77+
console.log(`Processed: ${successCount} success, ${errorCount} errors`);
9878
} finally {
9979
// Cleanup temp directory
10080
await rm(tempDir, { recursive: true, force: true });
10181
}
10282

10383
console.log("\nExtraction complete!");
104-
console.log(` Total: ${files.length}`);
105-
console.log(` Success: ${progress.successCount}`);
106-
console.log(` Errors: ${progress.errorCount}`);
107-
console.log(` Output: ${outputPrefix}/${OUTPUT_FILE}`);
84+
console.log(` Output: ${outputPrefix}/{hash}.txt`);
85+
console.log(` Index: ${outputPrefix}/${INDEX_FILE}`);
10886
}
10987

11088
interface ProcessResult {
@@ -133,7 +111,7 @@ async function extractWithPython(
133111
}
134112

135113
const result = JSON.parse(stdout);
136-
const id = generateId(sourceKey);
114+
const id = extractIdFromKey(sourceKey);
137115

138116
return {
139117
id,
@@ -169,7 +147,7 @@ async function processBatch(
169147
throw new Error(`File not found: ${sourceKey}`);
170148
}
171149

172-
const tempFile = join(tempDir, `${generateId(sourceKey)}.docx`);
150+
const tempFile = join(tempDir, `${extractIdFromKey(sourceKey)}.docx`);
173151
await Bun.write(tempFile, content);
174152

175153
// Extract using Python
@@ -201,44 +179,10 @@ async function processBatch(
201179
return results;
202180
}
203181

204-
function generateId(key: string): string {
205-
const hasher = new Bun.CryptoHasher("sha256");
206-
hasher.update(key);
207-
return hasher.digest("hex").slice(0, 16);
208-
}
209-
210-
async function loadProgress(
211-
storage: ExtractConfig["storage"],
212-
outputPrefix: string
213-
): Promise<ExtractionProgress> {
214-
try {
215-
const content = await storage.read(`${outputPrefix}/${PROGRESS_FILE}`);
216-
if (content) {
217-
return JSON.parse(new TextDecoder().decode(content));
218-
}
219-
} catch {
220-
// Ignore errors, return fresh progress
221-
}
222-
223-
return {
224-
totalFiles: 0,
225-
processedFiles: 0,
226-
successCount: 0,
227-
errorCount: 0,
228-
startedAt: new Date().toISOString(),
229-
updatedAt: new Date().toISOString(),
230-
};
231-
}
232-
233-
async function saveProgress(
234-
storage: ExtractConfig["storage"],
235-
outputPrefix: string,
236-
progress: ExtractionProgress
237-
): Promise<void> {
238-
await storage.write(
239-
`${outputPrefix}/${PROGRESS_FILE}`,
240-
JSON.stringify(progress, null, 2)
241-
);
182+
function extractIdFromKey(key: string): string {
183+
// Extract hash from filename: "documents/abc123.docx" → "abc123"
184+
const filename = basename(key);
185+
return filename.replace(/\.docx$/i, "");
242186
}
243187

244188
async function appendError(
@@ -256,10 +200,45 @@ async function appendError(
256200
await storage.write(errorsKey, existingText + line);
257201
}
258202

259-
function chunkArray<T>(array: T[], size: number): T[][] {
260-
const chunks: T[][] = [];
261-
for (let i = 0; i < array.length; i += size) {
262-
chunks.push(array.slice(i, i + size));
203+
async function loadProcessedIds(
204+
storage: ExtractConfig["storage"],
205+
outputPrefix: string
206+
): Promise<Set<string>> {
207+
const ids = new Set<string>();
208+
try {
209+
const content = await storage.read(`${outputPrefix}/${INDEX_FILE}`);
210+
if (content) {
211+
const text = new TextDecoder().decode(content);
212+
for (const line of text.split("\n")) {
213+
if (line.trim()) {
214+
const entry = JSON.parse(line);
215+
ids.add(entry.id);
216+
}
217+
}
218+
}
219+
} catch {
220+
// Index doesn't exist yet, return empty set
263221
}
264-
return chunks;
222+
return ids;
223+
}
224+
225+
async function appendToIndex(
226+
storage: ExtractConfig["storage"],
227+
outputPrefix: string,
228+
doc: ExtractedDocument
229+
): Promise<void> {
230+
const indexKey = `${outputPrefix}/${INDEX_FILE}`;
231+
const entry = {
232+
id: doc.id,
233+
extractedAt: doc.extractedAt,
234+
wordCount: doc.wordCount,
235+
charCount: doc.charCount,
236+
tableCount: doc.tableCount,
237+
imageCount: doc.imageCount,
238+
};
239+
240+
// Read existing index and append
241+
const existing = await storage.read(indexKey);
242+
const existingText = existing ? new TextDecoder().decode(existing) : "";
243+
await storage.write(indexKey, existingText + JSON.stringify(entry) + "\n");
265244
}

packages/extractor/types.ts

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,4 @@ export interface ExtractConfig {
2323
outputPrefix: string;
2424
batchSize: number;
2525
workers: number;
26-
resume: boolean;
27-
}
28-
29-
/**
30-
* Progress tracking for resumable extraction
31-
*/
32-
export interface ExtractionProgress {
33-
totalFiles: number;
34-
processedFiles: number;
35-
successCount: number;
36-
errorCount: number;
37-
lastProcessedKey?: string;
38-
startedAt: string;
39-
updatedAt: string;
4026
}

0 commit comments

Comments
 (0)