Skip to content

Commit 71c5762

Browse files
author
Dhanush Varma
committed
fix: resolve TESSDATA_PREFIX path correctly for all Tesseract versions
Two bugs in init_ocr() in ocr.c: 1. The Tesseract 4/5 branch always blindly appended '/tessdata' to the path returned by probe_tessdata_location(). If TESSDATA_PREFIX was already set to a path ending in 'tessdata/', this caused a double-append, e.g. '/usr/share/tessdata/tessdata'. 2. The legacy Tesseract <4 branch passed tessdata_path raw to TessBaseAPIInit4 without appending 'tessdata' at all, causing Tesseract to look for eng.traineddata directly in e.g. '/usr/share/' instead of '/usr/share/tessdata/'. Fix: normalize the path once before both branches. Detect whether the returned path already ends with 'tessdata' or 'tessdata/', handle Windows backslash separators, and use the resolved path in both Tesseract version branches. Add mprint diagnostic for the resolved path. Fixes #1492
1 parent 395f9b3 commit 71c5762

1 file changed

Lines changed: 29 additions & 3 deletions

File tree

src/lib_ccx/ocr.c

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -261,20 +261,46 @@ void *init_ocr(int lang_index)
261261
}
262262

263263
ctx->api = TessBaseAPICreate();
264+
265+
/* Build the correct tessdata path for TessBaseAPIInit4.
266+
* probe_tessdata_location() returns a base dir e.g. "/opt/homebrew/share/".
267+
* TessBaseAPIInit4 expects "<base>/tessdata" as the data path.
268+
* If TESSDATA_PREFIX already points at the tessdata dir itself,
269+
* avoid appending "tessdata" a second time. */
270+
char tess_path[1024];
271+
size_t tp_len = strlen(tessdata_path);
272+
int already_has_tessdata = (tp_len >= 8 &&
273+
(strcmp(tessdata_path + tp_len - 8, "tessdata/") == 0 ||
274+
strcmp(tessdata_path + tp_len - 8, "tessdata") == 0));
275+
if (already_has_tessdata)
276+
{
277+
snprintf(tess_path, sizeof(tess_path), "%s", tessdata_path);
278+
}
279+
else
280+
{
281+
snprintf(tess_path, sizeof(tess_path), "%s%stessdata",
282+
tessdata_path,
283+
(tessdata_path[tp_len - 1] == '/' || tessdata_path[tp_len - 1] == '\\') ? "" : "/");
284+
}
285+
286+
mprint("CCExtractor: using tessdata path: %s\n", tess_path);
287+
264288
if (!strncmp("4.", TessVersion(), 2) || !strncmp("5.", TessVersion(), 2))
265289
{
266-
char tess_path[1024];
267-
snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata");
268290
if (ccx_options.ocr_oem < 0)
291+
{
269292
ccx_options.ocr_oem = 1;
293+
}
270294
ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
271295
&pars_values, 1, false);
272296
}
273297
else
274298
{
275299
if (ccx_options.ocr_oem < 0)
300+
{
276301
ccx_options.ocr_oem = 0;
277-
ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
302+
}
303+
ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
278304
&pars_values, 1, false);
279305
}
280306

0 commit comments

Comments
 (0)