Skip to content

Commit 73e28a0

Browse files
Update documentation
1 parent fb28a31 commit 73e28a0

12 files changed

Lines changed: 44 additions & 41 deletions

File tree

content/docs/_generated/node-convert-options.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ description: Options for the Node.js convert function
2828
| `imageDir` | `string` | - | Directory for extracted images |
2929
| `pages` | `string` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3030
| `includeHeaderFooter` | `boolean` | `false` | Include page headers and footers in output |
31-
| `detectStrikethrough` | `boolean` | `false` | Detect strikethrough text and wrap with ~~ in Markdown output (experimental) |
31+
| `detectStrikethrough` | `boolean` | `false` | Detect strikethrough text and wrap it with `~~` in Markdown output or a `<del></del>` tag in HTML output (experimental) |
3232
| `hybrid` | `string` | `"off"` | Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast |
3333
| `hybridMode` | `string` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
3434
| `hybridUrl` | `string` | - | Hybrid backend server URL (overrides default) |

content/docs/_generated/python-convert-options.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ description: Options for the Python convert function
2929
| `image_dir` | `str` | - | Directory for extracted images |
3030
| `pages` | `str` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3131
| `include_header_footer` | `bool` | `False` | Include page headers and footers in output |
32-
| `detect_strikethrough` | `bool` | `False` | Detect strikethrough text and wrap with ~~ in Markdown output (experimental) |
32+
| `detect_strikethrough` | `bool` | `False` | Detect strikethrough text and wrap it with `~~` in Markdown output or a `<del></del>` tag in HTML output (experimental) |
3333
| `hybrid` | `str` | `"off"` | Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast |
3434
| `hybrid_mode` | `str` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
3535
| `hybrid_url` | `str` | - | Hybrid backend server URL (overrides default) |

content/docs/cli-options-reference.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ This page documents all available CLI options for opendataloader-pdf.
3232
| `--image-dir` | - | `string` | - | Directory for extracted images |
3333
| `--pages` | - | `string` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3434
| `--include-header-footer` | - | `boolean` | `false` | Include page headers and footers in output |
35-
| `--detect-strikethrough` | - | `boolean` | `false` | Detect strikethrough text and wrap with ~~ in Markdown output (experimental) |
35+
| `--detect-strikethrough` | - | `boolean` | `false` | Detect strikethrough text and wrap it with `~~` in Markdown output or a `<del></del>` tag in HTML output (experimental) |
3636
| `--hybrid` | - | `string` | `"off"` | Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast |
3737
| `--hybrid-mode` | - | `string` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
3838
| `--hybrid-url` | - | `string` | - | Hybrid backend server URL (overrides default) |

java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public class CLIOptions {
112112

113113
// ===== Detect Strikethrough =====
114114
private static final String DETECT_STRIKETHROUGH_LONG_OPTION = "detect-strikethrough";
115-
private static final String DETECT_STRIKETHROUGH_DESC = "Detect strikethrough text and wrap with ~~ in Markdown output (experimental)";
115+
private static final String DETECT_STRIKETHROUGH_DESC = "Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)";
116116

117117
// ===== Hybrid Mode =====
118118
private static final String HYBRID_LONG_OPTION = "hybrid";

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGenerator.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ public class HtmlGenerator implements Closeable {
7676
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
7777
/** Whether to include page headers and footers in output. */
7878
protected boolean includeHeaderFooter = false;
79+
/** Opening tag for strikethrough text. */
80+
protected static final String strikethroughTextHtmlOpeningTag = "<del>";
81+
/** Closing tag for strikethrough text. */
82+
protected static final String strikethroughTextHtmlClosingTag = "</del>";;
7983

8084
/**
8185
* Creates a new HtmlGenerator for the specified PDF file.
@@ -470,6 +474,18 @@ protected String escapeHtmlAttribute(String value) {
470474
.replace("\r", "");
471475
}
472476

477+
public static void getTextFromLineForHTML(TextLine line, StringBuilder stringBuilder) {
478+
for (TextChunk chunk : line.getTextChunks()) {
479+
if (chunk.getIsStrikethroughText()) {
480+
stringBuilder.append(strikethroughTextHtmlOpeningTag);
481+
}
482+
stringBuilder.append(chunk.getValue());
483+
if (chunk.getIsStrikethroughText()) {
484+
stringBuilder.append(strikethroughTextHtmlClosingTag);
485+
}
486+
}
487+
}
488+
473489
@Override
474490
public void close() throws IOException {
475491
if (htmlWriter != null) {

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ public class MarkdownGenerator implements Closeable {
5656
protected boolean embedImages = false;
5757
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
5858
protected boolean includeHeaderFooter = false;
59+
protected static final String strikethroughTextMD = "~~";
5960

6061
MarkdownGenerator(File inputPdf, Config config) throws IOException {
6162
String cutPdfFileName = inputPdf.getName();
@@ -366,6 +367,18 @@ protected String getCorrectMarkdownString(String value) {
366367
return null;
367368
}
368369

370+
public static void getTextFromLineForMarkdown(TextLine line, StringBuilder stringBuilder) {
371+
for (TextChunk chunk : line.getTextChunks()) {
372+
if (chunk.getIsStrikethroughText()) {
373+
stringBuilder.append(strikethroughTextMD);
374+
}
375+
stringBuilder.append(chunk.getValue());
376+
if (chunk.getIsStrikethroughText()) {
377+
stringBuilder.append(strikethroughTextMD);
378+
}
379+
}
380+
}
381+
369382
@Override
370383
public void close() throws IOException {
371384
if (markdownWriter != null) {
Lines changed: 6 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,16 @@
11
package org.opendataloader.pdf.utils;
22

3+
import org.opendataloader.pdf.html.HtmlGenerator;
4+
import org.opendataloader.pdf.markdown.MarkdownGenerator;
35
import org.verapdf.wcag.algorithms.entities.SemanticTextNode;
46
import org.verapdf.wcag.algorithms.entities.content.TextBlock;
5-
import org.verapdf.wcag.algorithms.entities.content.TextChunk;
67
import org.verapdf.wcag.algorithms.entities.content.TextColumn;
78
import org.verapdf.wcag.algorithms.entities.content.TextLine;
89
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.TextChunkUtils;
910

1011
import java.util.List;
1112

1213
public class GeneratorUtils {
13-
protected static final String strikethroughTextMD = "~~";
14-
protected static final String strikethroughTextHtmlOpeningTag = "<del>";
15-
protected static final String strikethroughTextHtmlClosingTag = "</del>";;
1614

1715
public static String getTextFromTextNode(SemanticTextNode textNode, OutputType outputType) {
1816
StringBuilder stringBuilder = new StringBuilder();
@@ -34,46 +32,22 @@ public static String getTextFromLines(List<TextLine> textLines, OutputType outpu
3432
TextLine line = textLines.get(i);
3533
switch (outputType) {
3634
case MD:
37-
getTextFromLineForMarkdown(line, stringBuilder);
35+
MarkdownGenerator.getTextFromLineForMarkdown(line, stringBuilder);
3836
break;
3937
case HTML:
40-
getTextFromLineForHTML(line, stringBuilder);
38+
HtmlGenerator.getTextFromLineForHTML(line, stringBuilder);
4139
break;
4240
}
4341
TextChunkUtils.formatLineEnd(stringBuilder);
4442
}
4543
switch (outputType) {
4644
case MD:
47-
getTextFromLineForMarkdown(textLines.get(textLines.size() - 1), stringBuilder);
45+
MarkdownGenerator.getTextFromLineForMarkdown(textLines.get(textLines.size() - 1), stringBuilder);
4846
break;
4947
case HTML:
50-
getTextFromLineForHTML(textLines.get(textLines.size() - 1), stringBuilder);
48+
HtmlGenerator.getTextFromLineForHTML(textLines.get(textLines.size() - 1), stringBuilder);
5149
break;
5250
}
5351
return stringBuilder.toString();
5452
}
55-
56-
public static void getTextFromLineForMarkdown(TextLine line, StringBuilder stringBuilder) {
57-
for (TextChunk chunk : line.getTextChunks()) {
58-
if (chunk.getIsStrikethroughText()) {
59-
stringBuilder.append(strikethroughTextMD);
60-
}
61-
stringBuilder.append(chunk.getValue());
62-
if (chunk.getIsStrikethroughText()) {
63-
stringBuilder.append(strikethroughTextMD);
64-
}
65-
}
66-
}
67-
68-
public static void getTextFromLineForHTML(TextLine line, StringBuilder stringBuilder) {
69-
for (TextChunk chunk : line.getTextChunks()) {
70-
if (chunk.getIsStrikethroughText()) {
71-
stringBuilder.append(strikethroughTextHtmlOpeningTag);
72-
}
73-
stringBuilder.append(chunk.getValue());
74-
if (chunk.getIsStrikethroughText()) {
75-
stringBuilder.append(strikethroughTextHtmlClosingTag);
76-
}
77-
}
78-
}
7953
}

node/opendataloader-pdf/src/cli-options.generated.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export function registerCliOptions(program: Command): void {
2626
program.option('--image-dir <value>', 'Directory for extracted images');
2727
program.option('--pages <value>', 'Pages to extract (e.g., "1,3,5-7"). Default: all pages');
2828
program.option('--include-header-footer', 'Include page headers and footers in output');
29-
program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output (experimental)');
29+
program.option('--detect-strikethrough', 'Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)');
3030
program.option('--hybrid <value>', 'Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast');
3131
program.option('--hybrid-mode <value>', 'Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend)');
3232
program.option('--hybrid-url <value>', 'Hybrid backend server URL (overrides default)');

node/opendataloader-pdf/src/convert-options.generated.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ export interface ConvertOptions {
4343
pages?: string;
4444
/** Include page headers and footers in output */
4545
includeHeaderFooter?: boolean;
46-
/** Detect strikethrough text and wrap with ~~ in Markdown output (experimental) */
46+
/** Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental) */
4747
detectStrikethrough?: boolean;
4848
/** Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast */
4949
hybrid?: string;

options.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@
158158
"type": "boolean",
159159
"required": false,
160160
"default": false,
161-
"description": "Detect strikethrough text and wrap with ~~ in Markdown output (experimental)"
161+
"description": "Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)"
162162
},
163163
{
164164
"name": "hybrid",

0 commit comments

Comments
 (0)