Skip to content

Commit 0cd27b0

Browse files
bundolee and claude committed
refactor(hybrid): move OCR settings from CLI to server startup
Move OCR configuration (--ocr-lang, --force-ocr) from Java CLI runtime options to Python hybrid server startup options. This improves performance by creating a single DocumentConverter instance at server startup instead of per-request converter creation with different language settings.

Changes:
- Add --ocr-lang and --force-ocr options to hybrid_server.py
- Deprecate --hybrid-ocr CLI option (prints warning, no-op)
- Remove forceOcr from DoclingFastServerClient and HybridConfig
- Remove OCR-related constants and methods from Config.java
- Update tests to reflect deprecated/removed functionality
- Regenerate Python/Node.js wrappers via npm run sync

Closes #161

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent: d0b024e; commit: 0cd27b0

15 files changed

Lines changed: 62 additions & 186 deletions

File tree

content/docs/_generated/node-convert-options.mdx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ description: Options for the Node.js convert function
2828
| `pages` | `string` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
2929
| `hybrid` | `string` | `"off"` | Hybrid backend for AI processing. Values: off (default), docling (docling-fast is deprecated alias) |
3030
| `hybridMode` | `string` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
31-
| `hybridOcr` | `string` | `"auto"` | Hybrid OCR mode for Docling backend. Values: auto (default, OCR only where needed), force (force full-page OCR) |
3231
| `hybridUrl` | `string` | - | Hybrid backend server URL (overrides default) |
3332
| `hybridTimeout` | `string` | `"30000"` | Hybrid backend request timeout in milliseconds. Default: 30000 |
3433
| `hybridFallback` | `boolean` | `true` | Fallback to Java processing on hybrid backend error. Default: true |

content/docs/_generated/python-convert-options.mdx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ description: Options for the Python convert function
2929
| `pages` | `str` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3030
| `hybrid` | `str` | `"off"` | Hybrid backend for AI processing. Values: off (default), docling (docling-fast is deprecated alias) |
3131
| `hybrid_mode` | `str` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
32-
| `hybrid_ocr` | `str` | `"auto"` | Hybrid OCR mode for Docling backend. Values: auto (default, OCR only where needed), force (force full-page OCR) |
3332
| `hybrid_url` | `str` | - | Hybrid backend server URL (overrides default) |
3433
| `hybrid_timeout` | `str` | `"30000"` | Hybrid backend request timeout in milliseconds. Default: 30000 |
3534
| `hybrid_fallback` | `bool` | `True` | Fallback to Java processing on hybrid backend error. Default: true |

content/docs/cli-options-reference.mdx

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ This page documents all available CLI options for opendataloader-pdf.
3232
| `--pages` | - | `string` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3333
| `--hybrid` | - | `string` | `"off"` | Hybrid backend for AI processing. Values: off (default), docling (docling-fast is deprecated alias) |
3434
| `--hybrid-mode` | - | `string` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
35-
| `--hybrid-ocr` | - | `string` | `"auto"` | Hybrid OCR mode for Docling backend. Values: auto (default, OCR only where needed), force (force full-page OCR) |
3635
| `--hybrid-url` | - | `string` | - | Hybrid backend server URL (overrides default) |
3736
| `--hybrid-timeout` | - | `string` | `"30000"` | Hybrid backend request timeout in milliseconds. Default: 30000 |
3837
| `--hybrid-fallback` | - | `boolean` | `true` | Fallback to Java processing on hybrid backend error. Default: true |

java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,9 @@ public class CLIOptions {
100100
private static final String HYBRID_MODE_LONG_OPTION = "hybrid-mode";
101101
private static final String HYBRID_MODE_DESC = "Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)";
102102

103+
// Deprecated: OCR settings are now configured on the hybrid server
103104
private static final String HYBRID_OCR_LONG_OPTION = "hybrid-ocr";
104-
private static final String HYBRID_OCR_DESC = "Hybrid OCR mode for Docling backend. Values: auto (default, OCR only where needed), force (force full-page OCR)";
105+
private static final String HYBRID_OCR_DESC = "[Deprecated] OCR settings are now configured on the hybrid server (--ocr-lang, --force-ocr)";
105106

106107
private static final String HYBRID_URL_LONG_OPTION = "hybrid-url";
107108
private static final String HYBRID_URL_DESC = "Hybrid backend server URL (overrides default)";
@@ -151,13 +152,13 @@ public class CLIOptions {
151152
new OptionDefinition(PAGES_LONG_OPTION, null, "string", null, PAGES_DESC, true),
152153
new OptionDefinition(HYBRID_LONG_OPTION, null, "string", "off", HYBRID_DESC, true),
153154
new OptionDefinition(HYBRID_MODE_LONG_OPTION, null, "string", "auto", HYBRID_MODE_DESC, true),
154-
new OptionDefinition(HYBRID_OCR_LONG_OPTION, null, "string", "auto", HYBRID_OCR_DESC, true),
155155
new OptionDefinition(HYBRID_URL_LONG_OPTION, null, "string", null, HYBRID_URL_DESC, true),
156156
new OptionDefinition(HYBRID_TIMEOUT_LONG_OPTION, null, "string", "30000", HYBRID_TIMEOUT_DESC, true),
157157
new OptionDefinition(HYBRID_FALLBACK_LONG_OPTION, null, "boolean", true, HYBRID_FALLBACK_DESC, true),
158158
new OptionDefinition(EXPORT_OPTIONS_LONG_OPTION, null, "boolean", null, null, false),
159159

160160
// Legacy options (not exported, for backward compatibility)
161+
new OptionDefinition(HYBRID_OCR_LONG_OPTION, null, "string", null, HYBRID_OCR_DESC, false),
161162
new OptionDefinition(PDF_REPORT_LONG_OPTION, null, "boolean", null, null, false),
162163
new OptionDefinition(MARKDOWN_REPORT_LONG_OPTION, null, "boolean", null, null, false),
163164
new OptionDefinition(HTML_REPORT_LONG_OPTION, null, "boolean", null, null, false),
@@ -437,19 +438,9 @@ private static void applyHybridOptions(Config config, CommandLine commandLine) {
437438
config.getHybridConfig().setMode(mode);
438439
}
439440
if (commandLine.hasOption(HYBRID_OCR_LONG_OPTION)) {
440-
String ocrValue = commandLine.getOptionValue(HYBRID_OCR_LONG_OPTION);
441-
if (ocrValue == null || ocrValue.trim().isEmpty()) {
442-
throw new IllegalArgumentException(
443-
String.format("Option --hybrid-ocr requires a value. Supported values: %s",
444-
Config.getHybridOcrOptions(", ")));
445-
}
446-
String ocr = ocrValue.trim().toLowerCase(Locale.ROOT);
447-
if (!Config.isValidHybridOcr(ocr)) {
448-
throw new IllegalArgumentException(
449-
String.format("Unsupported hybrid OCR mode '%s'. Supported values: %s",
450-
ocr, Config.getHybridOcrOptions(", ")));
451-
}
452-
config.getHybridConfig().setOcrMode(ocr);
441+
// Deprecated: OCR settings are now configured on the hybrid server
442+
System.err.println("Warning: --hybrid-ocr is deprecated. "
443+
+ "Configure OCR settings on the hybrid server instead (--ocr-lang, --force-ocr).");
453444
}
454445
if (commandLine.hasOption(HYBRID_URL_LONG_OPTION)) {
455446
String url = commandLine.getOptionValue(HYBRID_URL_LONG_OPTION);

java/opendataloader-pdf-cli/src/test/java/org/opendataloader/pdf/cli/CLIOptionsTest.java

Lines changed: 6 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,7 @@ void testDefineOptions_containsHybridModeOption() {
367367

368368
@Test
369369
void testDefineOptions_containsHybridOcrOption() {
370+
// --hybrid-ocr is deprecated but still accepted for backward compatibility
370371
assertTrue(options.hasOption("hybrid-ocr"));
371372
}
372373

@@ -403,46 +404,24 @@ void testCreateConfig_withInvalidHybridMode() throws ParseException {
403404
}
404405

405406
@Test
406-
void testCreateConfig_withHybridOcrAuto() throws ParseException {
407-
String[] args = {"--hybrid", "docling", "--hybrid-ocr", "auto", testPdf.getAbsolutePath()};
408-
CommandLine cmd = parser.parse(options, args);
409-
410-
Config config = CLIOptions.createConfigFromCommandLine(cmd);
411-
412-
assertEquals("auto", config.getHybridConfig().getOcrMode());
413-
assertFalse(config.getHybridConfig().isForceOcr());
414-
}
415-
416-
@Test
417-
void testCreateConfig_withHybridOcrForce() throws ParseException {
407+
void testCreateConfig_withDeprecatedHybridOcr() throws ParseException {
408+
// --hybrid-ocr is deprecated; it should print a warning but not throw
418409
String[] args = {"--hybrid", "docling", "--hybrid-ocr", "force", testPdf.getAbsolutePath()};
419410
CommandLine cmd = parser.parse(options, args);
420411

412+
// Should not throw, just prints deprecation warning
421413
Config config = CLIOptions.createConfigFromCommandLine(cmd);
422-
423-
assertEquals("force", config.getHybridConfig().getOcrMode());
424-
assertTrue(config.getHybridConfig().isForceOcr());
425-
}
426-
427-
@Test
428-
void testCreateConfig_withInvalidHybridOcr() throws ParseException {
429-
String[] args = {"--hybrid-ocr", "invalid", testPdf.getAbsolutePath()};
430-
CommandLine cmd = parser.parse(options, args);
431-
432-
assertThrows(IllegalArgumentException.class, () -> {
433-
CLIOptions.createConfigFromCommandLine(cmd);
434-
});
414+
assertNotNull(config);
435415
}
436416

437417
@Test
438-
void testCreateConfig_defaultHybridModeAndOcr() throws ParseException {
418+
void testCreateConfig_defaultHybridMode() throws ParseException {
439419
String[] args = {"--hybrid", "docling", testPdf.getAbsolutePath()};
440420
CommandLine cmd = parser.parse(options, args);
441421

442422
Config config = CLIOptions.createConfigFromCommandLine(cmd);
443423

444424
assertEquals("auto", config.getHybridConfig().getMode());
445-
assertEquals("auto", config.getHybridConfig().getOcrMode());
446425
}
447426

448427
@Test

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/api/Config.java

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,6 @@ public class Config {
4646
public static final String HYBRID_MODE_FULL = "full";
4747
private static Set<String> hybridModeOptions = new HashSet<>();
4848

49-
/** Hybrid OCR mode: auto (OCR only where needed, Docling internal logic). */
50-
public static final String HYBRID_OCR_AUTO = "auto";
51-
/** Hybrid OCR mode: force (force full-page OCR on all pages). */
52-
public static final String HYBRID_OCR_FORCE = "force";
53-
private static Set<String> hybridOcrOptions = new HashSet<>();
5449
/** Placeholder string for page number in separators. */
5550
public static final String PAGE_NUMBER_STRING = "%page-number%";
5651
private String password;
@@ -115,8 +110,6 @@ public class Config {
115110
// hancom, azure, google added when implemented
116111
hybridModeOptions.add(HYBRID_MODE_AUTO);
117112
hybridModeOptions.add(HYBRID_MODE_FULL);
118-
hybridOcrOptions.add(HYBRID_OCR_AUTO);
119-
hybridOcrOptions.add(HYBRID_OCR_FORCE);
120113
}
121114

122115
/**
@@ -825,24 +818,4 @@ public static boolean isValidHybridMode(String mode) {
825818
return mode != null && hybridModeOptions.contains(mode.toLowerCase(Locale.ROOT));
826819
}
827820

828-
/**
829-
* Gets the list of supported hybrid OCR options.
830-
*
831-
* @param delimiter The delimiter to use between options.
832-
* @return The string with hybrid OCR modes separated by the delimiter.
833-
*/
834-
public static String getHybridOcrOptions(CharSequence delimiter) {
835-
return String.join(delimiter, hybridOcrOptions);
836-
}
837-
838-
/**
839-
* Checks if the given hybrid OCR mode is valid.
840-
*
841-
* @param ocr The hybrid OCR mode to check.
842-
* @return true if the OCR mode is valid, false otherwise.
843-
*/
844-
public static boolean isValidHybridOcr(String ocr) {
845-
return ocr != null && hybridOcrOptions.contains(ocr.toLowerCase(Locale.ROOT));
846-
}
847-
848821
}

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/DoclingFastServerClient.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ public class DoclingFastServerClient implements HybridClient {
5353
private final String baseUrl;
5454
private final OkHttpClient httpClient;
5555
private final ObjectMapper objectMapper;
56-
private final boolean forceOcr;
5756

5857
/**
5958
* Creates a new DoclingFastServerClient with the specified configuration.
@@ -63,7 +62,6 @@ public class DoclingFastServerClient implements HybridClient {
6362
public DoclingFastServerClient(HybridConfig config) {
6463
this.baseUrl = config.getEffectiveUrl("docling-fast");
6564
this.objectMapper = new ObjectMapper();
66-
this.forceOcr = config.isForceOcr();
6765
this.httpClient = new OkHttpClient.Builder()
6866
.connectTimeout(config.getTimeoutMs(), TimeUnit.MILLISECONDS)
6967
.readTimeout(config.getTimeoutMs(), TimeUnit.MILLISECONDS)
@@ -82,7 +80,6 @@ public DoclingFastServerClient(HybridConfig config) {
8280
this.baseUrl = baseUrl;
8381
this.httpClient = httpClient;
8482
this.objectMapper = objectMapper;
85-
this.forceOcr = false;
8683
}
8784

8885
@Override
@@ -131,11 +128,6 @@ private Request buildConvertRequest(HybridRequest request) {
131128
bodyBuilder.addFormDataPart("page_ranges", minPage + "-" + maxPage);
132129
}
133130

134-
// Add force_ocr parameter if enabled
135-
if (forceOcr) {
136-
bodyBuilder.addFormDataPart("force_ocr", "true");
137-
}
138-
139131
return new Request.Builder()
140132
.url(baseUrl + CONVERT_ENDPOINT)
141133
.post(bodyBuilder.build())

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/hybrid/HybridConfig.java

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,7 @@ public class HybridConfig {
3636
/** Hybrid triage mode: full (skip triage, send all pages to backend). */
3737
public static final String MODE_FULL = "full";
3838

39-
/** Hybrid OCR mode: auto (OCR only where needed). */
40-
public static final String OCR_AUTO = "auto";
41-
/** Hybrid OCR mode: force (force full-page OCR on all pages). */
42-
public static final String OCR_FORCE = "force";
43-
4439
private String mode = MODE_AUTO;
45-
private String ocrMode = OCR_AUTO;
4640

4741
/**
4842
* Default constructor initializing the configuration with default values.
@@ -189,31 +183,4 @@ public void setMode(String mode) {
189183
public boolean isFullMode() {
190184
return MODE_FULL.equals(mode);
191185
}
192-
193-
/**
194-
* Gets the hybrid OCR mode.
195-
*
196-
* @return The OCR mode (auto or force).
197-
*/
198-
public String getOcrMode() {
199-
return ocrMode;
200-
}
201-
202-
/**
203-
* Sets the hybrid OCR mode.
204-
*
205-
* @param ocrMode The OCR mode (auto or force).
206-
*/
207-
public void setOcrMode(String ocrMode) {
208-
this.ocrMode = ocrMode;
209-
}
210-
211-
/**
212-
* Checks if force OCR mode is enabled.
213-
*
214-
* @return true if OCR mode is force, false otherwise.
215-
*/
216-
public boolean isForceOcr() {
217-
return OCR_FORCE.equals(ocrMode);
218-
}
219186
}

java/opendataloader-pdf-core/src/test/java/org/opendataloader/pdf/processors/HybridDocumentProcessorTest.java

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -324,23 +324,6 @@ public void testHybridConfigModeFullMode() {
324324
Assertions.assertTrue(config.isFullMode());
325325
}
326326

327-
@Test
328-
public void testHybridConfigOcrDefaults() {
329-
HybridConfig config = new HybridConfig();
330-
331-
Assertions.assertEquals(HybridConfig.OCR_AUTO, config.getOcrMode());
332-
Assertions.assertFalse(config.isForceOcr());
333-
}
334-
335-
@Test
336-
public void testHybridConfigOcrForce() {
337-
HybridConfig config = new HybridConfig();
338-
config.setOcrMode(HybridConfig.OCR_FORCE);
339-
340-
Assertions.assertEquals(HybridConfig.OCR_FORCE, config.getOcrMode());
341-
Assertions.assertTrue(config.isForceOcr());
342-
}
343-
344327
@Test
345328
public void testDoclingBackendEnabled() {
346329
Config config = new Config();

node/opendataloader-pdf/src/cli-options.generated.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ export function registerCliOptions(program: Command): void {
2626
program.option('--pages <value>', 'Pages to extract (e.g., "1,3,5-7"). Default: all pages');
2727
program.option('--hybrid <value>', 'Hybrid backend for AI processing. Values: off (default), docling (docling-fast is deprecated alias)');
2828
program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');
29-
program.option('--hybrid-ocr <value>', 'Hybrid OCR mode for Docling backend. Values: auto (default, OCR only where needed), force (force full-page OCR)');
3029
program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');
3130
program.option('--hybrid-timeout <value>', 'Hybrid backend request timeout in milliseconds. Default: 30000');
3231
program.option('--hybrid-fallback', 'Fallback to Java processing on hybrid backend error. Default: true');

0 commit comments

Comments
 (0)