Skip to content

Commit 071011e

Browse files
Add strikethrough text to HTML generator (#379)
1 parent 45912a5 commit 071011e

15 files changed

Lines changed: 128 additions & 61 deletions

File tree

content/docs/_generated/node-convert-options.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ description: Options for the Node.js convert function
2828
| `imageDir` | `string` | - | Directory for extracted images |
2929
| `pages` | `string` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3030
| `includeHeaderFooter` | `boolean` | `false` | Include page headers and footers in output |
31-
| `detectStrikethrough` | `boolean` | `false` | Detect strikethrough text and wrap with ~~ in Markdown output (experimental) |
31+
| `detectStrikethrough` | `boolean` | `false` | Detect strikethrough text and wrap with ~~ in Markdown output or `<del></del>` tags in HTML output (experimental) |
3232
| `hybrid` | `string` | `"off"` | Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast |
3333
| `hybridMode` | `string` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
3434
| `hybridUrl` | `string` | - | Hybrid backend server URL (overrides default) |

content/docs/_generated/python-convert-options.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ description: Options for the Python convert function
2929
| `image_dir` | `str` | - | Directory for extracted images |
3030
| `pages` | `str` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3131
| `include_header_footer` | `bool` | `False` | Include page headers and footers in output |
32-
| `detect_strikethrough` | `bool` | `False` | Detect strikethrough text and wrap with ~~ in Markdown output (experimental) |
32+
| `detect_strikethrough` | `bool` | `False` | Detect strikethrough text and wrap with ~~ in Markdown output or `<del></del>` tags in HTML output (experimental) |
3333
| `hybrid` | `str` | `"off"` | Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast |
3434
| `hybrid_mode` | `str` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
3535
| `hybrid_url` | `str` | - | Hybrid backend server URL (overrides default) |

content/docs/cli-options-reference.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ This page documents all available CLI options for opendataloader-pdf.
3232
| `--image-dir` | - | `string` | - | Directory for extracted images |
3333
| `--pages` | - | `string` | - | Pages to extract (e.g., "1,3,5-7"). Default: all pages |
3434
| `--include-header-footer` | - | `boolean` | `false` | Include page headers and footers in output |
35-
| `--detect-strikethrough` | - | `boolean` | `false` | Detect strikethrough text and wrap with ~~ in Markdown output (experimental) |
35+
| `--detect-strikethrough` | - | `boolean` | `false` | Detect strikethrough text and wrap with ~~ in Markdown output or `<del></del>` tags in HTML output (experimental) |
3636
| `--hybrid` | - | `string` | `"off"` | Hybrid backend (requires a running server). Quick start: pip install "opendataloader-pdf[hybrid]" && opendataloader-pdf-hybrid --port 5002. For remote servers use --hybrid-url. Values: off (default), docling-fast |
3737
| `--hybrid-mode` | - | `string` | `"auto"` | Hybrid triage mode. Values: auto (default, dynamic triage), full (skip triage, all pages to backend) |
3838
| `--hybrid-url` | - | `string` | - | Hybrid backend server URL (overrides default) |

java/opendataloader-pdf-cli/src/main/java/org/opendataloader/pdf/cli/CLIOptions.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ public class CLIOptions {
112112

113113
// ===== Detect Strikethrough =====
114114
private static final String DETECT_STRIKETHROUGH_LONG_OPTION = "detect-strikethrough";
115-
private static final String DETECT_STRIKETHROUGH_DESC = "Detect strikethrough text and wrap with ~~ in Markdown output (experimental)";
115+
private static final String DETECT_STRIKETHROUGH_DESC = "Detect strikethrough text and wrap with ~~ in Markdown output or <del></del> tag in HTML output (experimental)";
116116

117117
// ===== Hybrid Mode =====
118118
private static final String HYBRID_LONG_OPTION = "hybrid";

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGenerator.java

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,15 @@
2121
import org.opendataloader.pdf.entities.SemanticPicture;
2222
import org.opendataloader.pdf.markdown.MarkdownSyntax;
2323
import org.opendataloader.pdf.utils.Base64ImageUtils;
24+
import org.opendataloader.pdf.utils.GeneratorUtils;
2425
import org.opendataloader.pdf.utils.ImagesUtils;
26+
import org.opendataloader.pdf.utils.OutputType;
2527
import org.verapdf.wcag.algorithms.entities.IObject;
2628
import org.verapdf.wcag.algorithms.entities.SemanticHeaderOrFooter;
2729
import org.verapdf.wcag.algorithms.entities.SemanticHeading;
2830
import org.verapdf.wcag.algorithms.entities.SemanticParagraph;
2931
import org.verapdf.wcag.algorithms.entities.SemanticTextNode;
30-
import org.verapdf.wcag.algorithms.entities.content.ImageChunk;
32+
import org.verapdf.wcag.algorithms.entities.content.*;
3133
import org.verapdf.wcag.algorithms.entities.lists.ListItem;
3234
import org.verapdf.wcag.algorithms.entities.lists.PDFList;
3335
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorder;
@@ -74,6 +76,10 @@ public class HtmlGenerator implements Closeable {
7476
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
7577
/** Whether to include page headers and footers in output. */
7678
protected boolean includeHeaderFooter = false;
79+
/** Opening tag for strikethrough text*/
80+
protected static final String strikethroughTextHtmlOpeningTag = "<del>";
81+
/** Closing tag for strikethrough text*/
82+
protected static final String strikethroughTextHtmlClosingTag = "</del>";;
7783

7884
/**
7985
* Creates a new HtmlGenerator for the specified PDF file.
@@ -286,7 +292,8 @@ protected void writeList(PDFList list) throws IOException {
286292
htmlWriter.write(HtmlSyntax.HTML_LIST_ITEM_TAG);
287293

288294
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_TAG);
289-
htmlWriter.write(getCorrectString(item.toString()));
295+
String value = GeneratorUtils.getTextFromLines(item.getLines(), OutputType.HTML);
296+
htmlWriter.write(getCorrectString(value));
290297
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_CLOSE_TAG);
291298

292299
for (IObject object : item.getContents()) {
@@ -307,7 +314,7 @@ protected void writeList(PDFList list) throws IOException {
307314
*/
308315
protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOException {
309316
htmlWriter.write(HtmlSyntax.HTML_FIGURE_CAPTION_TAG);
310-
htmlWriter.write(getCorrectString(textNode.getValue()));
317+
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(textNode, OutputType.HTML)));
311318
htmlWriter.write(HtmlSyntax.HTML_FIGURE_CAPTION_CLOSE_TAG);
312319
htmlWriter.write(HtmlSyntax.HTML_LINE_BREAK);
313320
}
@@ -362,13 +369,13 @@ protected void writeTable(TableBorder table) throws IOException {
362369
* @throws IOException if unable to write to the output
363370
*/
364371
protected void writeParagraph(SemanticParagraph paragraph) throws IOException {
365-
String paragraphValue = paragraph.getValue();
366372
double paragraphIndent = paragraph.getColumns().get(0).getBlocks().get(0).getFirstLineIndent();
367373

368374
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_TAG);
369375
if (paragraphIndent > 0) {
370376
htmlWriter.write(HtmlSyntax.HTML_INDENT);
371377
}
378+
String paragraphValue = GeneratorUtils.getTextFromTextNode(paragraph, OutputType.HTML);
372379

373380
if (isInsideTable() && StaticContainers.isKeepLineBreaks()) {
374381
paragraphValue = paragraphValue.replace(HtmlSyntax.HTML_LINE_BREAK, HtmlSyntax.HTML_LINE_BREAK_TAG);
@@ -388,7 +395,7 @@ protected void writeParagraph(SemanticParagraph paragraph) throws IOException {
388395
protected void writeHeading(SemanticHeading heading) throws IOException {
389396
int headingLevel = Math.min(6, Math.max(1, heading.getHeadingLevel()));
390397
htmlWriter.write("<h" + headingLevel + ">");
391-
htmlWriter.write(getCorrectString(heading.getValue()));
398+
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(heading, OutputType.HTML)));
392399
htmlWriter.write("</h" + headingLevel + ">");
393400
htmlWriter.write(HtmlSyntax.HTML_LINE_BREAK);
394401
}
@@ -467,6 +474,18 @@ protected String escapeHtmlAttribute(String value) {
467474
.replace("\r", "");
468475
}
469476

477+
public static void getTextFromLineForHTML(TextLine line, StringBuilder stringBuilder) {
478+
for (TextChunk chunk : line.getTextChunks()) {
479+
if (chunk.getIsStrikethroughText()) {
480+
stringBuilder.append(strikethroughTextHtmlOpeningTag);
481+
}
482+
stringBuilder.append(chunk.getValue());
483+
if (chunk.getIsStrikethroughText()) {
484+
stringBuilder.append(strikethroughTextHtmlClosingTag);
485+
}
486+
}
487+
}
488+
470489
@Override
471490
public void close() throws IOException {
472491
if (htmlWriter != null) {

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
import org.opendataloader.pdf.entities.SemanticFormula;
2121
import org.opendataloader.pdf.entities.SemanticPicture;
2222
import org.opendataloader.pdf.utils.Base64ImageUtils;
23+
import org.opendataloader.pdf.utils.GeneratorUtils;
2324
import org.opendataloader.pdf.utils.ImagesUtils;
25+
import org.opendataloader.pdf.utils.OutputType;
2426
import org.verapdf.wcag.algorithms.entities.IObject;
2527
import org.verapdf.wcag.algorithms.entities.SemanticHeaderOrFooter;
2628
import org.verapdf.wcag.algorithms.entities.SemanticHeading;
@@ -54,6 +56,7 @@ public class MarkdownGenerator implements Closeable {
5456
protected boolean embedImages = false;
5557
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
5658
protected boolean includeHeaderFooter = false;
59+
/** Delimiter written before and after strikethrough text in Markdown output. */
protected static final String strikethroughTextMD = "~~";
5760

5861
MarkdownGenerator(File inputPdf, Config config) throws IOException {
5962
String cutPdfFileName = inputPdf.getName();
@@ -234,7 +237,7 @@ protected void writeList(PDFList list) throws IOException {
234237
markdownWriter.write(MarkdownSyntax.LIST_ITEM);
235238
markdownWriter.write(MarkdownSyntax.SPACE);
236239
}
237-
markdownWriter.write(getCorrectMarkdownString(item.toString()));
240+
markdownWriter.write(getCorrectMarkdownString(GeneratorUtils.getTextFromLines(item.getLines(), OutputType.MD)));
238241
writeLineBreak();
239242

240243
List<IObject> itemContents = item.getContents();
@@ -246,7 +249,7 @@ protected void writeList(PDFList list) throws IOException {
246249
}
247250

248251
protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOException {
249-
String value = textNode.getValue();
252+
String value = GeneratorUtils.getTextFromTextNode(textNode, OutputType.MD);
250253
if (StaticContainers.isKeepLineBreaks()) {
251254
if (textNode instanceof SemanticHeading) {
252255
value = value.replace(MarkdownSyntax.LINE_BREAK, MarkdownSyntax.SPACE);
@@ -261,6 +264,8 @@ protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOExcepti
261264
markdownWriter.write(getCorrectMarkdownString(value));
262265
}
263266

267+
268+
264269
protected void writeTable(TableBorder table) throws IOException {
265270
enterTable();
266271
for (int rowNumber = 0; rowNumber < table.getNumberOfRows(); rowNumber++) {
@@ -362,6 +367,18 @@ protected String getCorrectMarkdownString(String value) {
362367
return null;
363368
}
364369

370+
public static void getTextFromLineForMarkdown(TextLine line, StringBuilder stringBuilder) {
371+
for (TextChunk chunk : line.getTextChunks()) {
372+
if (chunk.getIsStrikethroughText()) {
373+
stringBuilder.append(strikethroughTextMD);
374+
}
375+
stringBuilder.append(chunk.getValue());
376+
if (chunk.getIsStrikethroughText()) {
377+
stringBuilder.append(strikethroughTextMD);
378+
}
379+
}
380+
}
381+
365382
@Override
366383
public void close() throws IOException {
367384
if (markdownWriter != null) {

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/processors/StrikethroughProcessor.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727

2828
/**
2929
* Detects strikethrough text by finding horizontal lines that pass through
30-
* the vertical center of text chunks. Marks affected TextChunks by wrapping
31-
* their values with ~~ markdown strikethrough syntax.
30+
* the vertical center of text chunks. Marks affected TextChunks by setting
31+
* their isStrikethroughText field to true.
3232
*
3333
* Filters to avoid false positives:
3434
* 1. Table border membership (via TableBordersCollection)
@@ -52,8 +52,8 @@ public class StrikethroughProcessor {
5252
private static final double MAX_STROKE_TO_TEXT_HEIGHT_RATIO = 1.3;
5353

5454
/**
55-
* Detects strikethrough lines among page contents and wraps affected
56-
* TextChunk values with ~~ markdown syntax.
55+
* Detects strikethrough lines among page contents and sets affected
56+
* TextChunk isStrikethroughText field to true.
5757
*
5858
* @param pageContents the list of content objects for a page
5959
* @return the page contents (modified in place)
@@ -95,8 +95,6 @@ public static List<IObject> processStrikethroughs(List<IObject> pageContents) {
9595
if (!matchingChunks.isEmpty() && matchingChunks.size() <= MAX_TEXT_CHUNKS_PER_LINE) {
9696
for (TextChunk chunk : matchingChunks) {
9797
if (!chunk.getIsStrikethroughText()) {
98-
String value = chunk.getValue();
99-
chunk.setValue("~~" + value + "~~");
10098
chunk.setIsStrikethroughText();
10199
}
102100
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.opendataloader.pdf.utils;
2+
3+
import org.opendataloader.pdf.html.HtmlGenerator;
4+
import org.opendataloader.pdf.markdown.MarkdownGenerator;
5+
import org.verapdf.wcag.algorithms.entities.SemanticTextNode;
6+
import org.verapdf.wcag.algorithms.entities.content.TextBlock;
7+
import org.verapdf.wcag.algorithms.entities.content.TextColumn;
8+
import org.verapdf.wcag.algorithms.entities.content.TextLine;
9+
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.TextChunkUtils;
10+
11+
import java.util.List;
12+
13+
public class GeneratorUtils {
14+
15+
public static String getTextFromTextNode(SemanticTextNode textNode, OutputType outputType) {
16+
StringBuilder stringBuilder = new StringBuilder();
17+
for (TextColumn column : textNode.getColumns()) {
18+
List<TextBlock> blocks = column.getBlocks();
19+
for (int i = 0; i < blocks.size() - 1; i++) {
20+
TextBlock block = blocks.get(i);
21+
stringBuilder.append(getTextFromLines(block.getLines(), outputType));
22+
TextChunkUtils.formatLineEnd(stringBuilder);
23+
}
24+
stringBuilder.append(getTextFromLines(blocks.get(blocks.size() - 1).getLines(), outputType));
25+
}
26+
return stringBuilder.toString();
27+
}
28+
29+
public static String getTextFromLines(List<TextLine> textLines, OutputType outputType) {
30+
StringBuilder stringBuilder = new StringBuilder();
31+
for (int i = 0; i < textLines.size() - 1; i++) {
32+
TextLine line = textLines.get(i);
33+
switch (outputType) {
34+
case MD:
35+
MarkdownGenerator.getTextFromLineForMarkdown(line, stringBuilder);
36+
break;
37+
case HTML:
38+
HtmlGenerator.getTextFromLineForHTML(line, stringBuilder);
39+
break;
40+
}
41+
TextChunkUtils.formatLineEnd(stringBuilder);
42+
}
43+
switch (outputType) {
44+
case MD:
45+
MarkdownGenerator.getTextFromLineForMarkdown(textLines.get(textLines.size() - 1), stringBuilder);
46+
break;
47+
case HTML:
48+
HtmlGenerator.getTextFromLineForHTML(textLines.get(textLines.size() - 1), stringBuilder);
49+
break;
50+
}
51+
return stringBuilder.toString();
52+
}
53+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package org.opendataloader.pdf.utils;
2+
3+
/**
 * Output formats used to select format-specific text rendering.
 * {@code GeneratorUtils} currently dispatches on {@link #MD} and {@link #HTML} only.
 */
public enum OutputType {
    /** Plain-text output (presumably; no renderer dispatches on it yet). */
    TXT,
    /** Markdown output, rendered through {@code MarkdownGenerator}. */
    MD,
    /** HTML output, rendered through {@code HtmlGenerator}. */
    HTML,
    /** JSON output (presumably; no renderer dispatches on it yet). */
    JSON,
    /** PDF output (presumably; no renderer dispatches on it yet). */
    PDF
}

0 commit comments

Comments
 (0)