Skip to content

Commit fb28a31

Browse files
Update GeneratorUtils
1 parent 6c387c0 commit fb28a31

4 files changed

Lines changed: 60 additions & 18 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGenerator.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.opendataloader.pdf.utils.Base64ImageUtils;
2424
import org.opendataloader.pdf.utils.GeneratorUtils;
2525
import org.opendataloader.pdf.utils.ImagesUtils;
26+
import org.opendataloader.pdf.utils.OutputType;
2627
import org.verapdf.wcag.algorithms.entities.IObject;
2728
import org.verapdf.wcag.algorithms.entities.SemanticHeaderOrFooter;
2829
import org.verapdf.wcag.algorithms.entities.SemanticHeading;
@@ -75,8 +76,6 @@ public class HtmlGenerator implements Closeable {
7576
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
7677
/** Whether to include page headers and footers in output. */
7778
protected boolean includeHeaderFooter = false;
78-
protected static final String strikethroughTextOpeningTag = "<del>";
79-
protected static final String strikethroughTextClosingTag = "</del>";
8079

8180
/**
8281
* Creates a new HtmlGenerator for the specified PDF file.
@@ -289,7 +288,7 @@ protected void writeList(PDFList list) throws IOException {
289288
htmlWriter.write(HtmlSyntax.HTML_LIST_ITEM_TAG);
290289

291290
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_TAG);
292-
String value = GeneratorUtils.getTextFromLines(item.getLines(), strikethroughTextOpeningTag, strikethroughTextClosingTag);
291+
String value = GeneratorUtils.getTextFromLines(item.getLines(), OutputType.HTML);
293292
htmlWriter.write(getCorrectString(value));
294293
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_CLOSE_TAG);
295294

@@ -311,7 +310,7 @@ protected void writeList(PDFList list) throws IOException {
311310
*/
312311
protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOException {
313312
htmlWriter.write(HtmlSyntax.HTML_FIGURE_CAPTION_TAG);
314-
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(textNode, strikethroughTextOpeningTag, strikethroughTextClosingTag)));
313+
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(textNode, OutputType.HTML)));
315314
htmlWriter.write(HtmlSyntax.HTML_FIGURE_CAPTION_CLOSE_TAG);
316315
htmlWriter.write(HtmlSyntax.HTML_LINE_BREAK);
317316
}
@@ -372,7 +371,7 @@ protected void writeParagraph(SemanticParagraph paragraph) throws IOException {
372371
if (paragraphIndent > 0) {
373372
htmlWriter.write(HtmlSyntax.HTML_INDENT);
374373
}
375-
String paragraphValue = GeneratorUtils.getTextFromTextNode(paragraph, strikethroughTextOpeningTag, strikethroughTextClosingTag);
374+
String paragraphValue = GeneratorUtils.getTextFromTextNode(paragraph, OutputType.HTML);
376375

377376
if (isInsideTable() && StaticContainers.isKeepLineBreaks()) {
378377
paragraphValue = paragraphValue.replace(HtmlSyntax.HTML_LINE_BREAK, HtmlSyntax.HTML_LINE_BREAK_TAG);
@@ -392,7 +391,7 @@ protected void writeParagraph(SemanticParagraph paragraph) throws IOException {
392391
protected void writeHeading(SemanticHeading heading) throws IOException {
393392
int headingLevel = Math.min(6, Math.max(1, heading.getHeadingLevel()));
394393
htmlWriter.write("<h" + headingLevel + ">");
395-
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(heading, strikethroughTextOpeningTag, strikethroughTextClosingTag)));
394+
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(heading, OutputType.HTML)));
396395
htmlWriter.write("</h" + headingLevel + ">");
397396
htmlWriter.write(HtmlSyntax.HTML_LINE_BREAK);
398397
}

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.opendataloader.pdf.utils.Base64ImageUtils;
2323
import org.opendataloader.pdf.utils.GeneratorUtils;
2424
import org.opendataloader.pdf.utils.ImagesUtils;
25+
import org.opendataloader.pdf.utils.OutputType;
2526
import org.verapdf.wcag.algorithms.entities.IObject;
2627
import org.verapdf.wcag.algorithms.entities.SemanticHeaderOrFooter;
2728
import org.verapdf.wcag.algorithms.entities.SemanticHeading;
@@ -55,7 +56,6 @@ public class MarkdownGenerator implements Closeable {
5556
protected boolean embedImages = false;
5657
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
5758
protected boolean includeHeaderFooter = false;
58-
protected static final String strikethroughTextSyntax = "~~";
5959

6060
MarkdownGenerator(File inputPdf, Config config) throws IOException {
6161
String cutPdfFileName = inputPdf.getName();
@@ -236,7 +236,7 @@ protected void writeList(PDFList list) throws IOException {
236236
markdownWriter.write(MarkdownSyntax.LIST_ITEM);
237237
markdownWriter.write(MarkdownSyntax.SPACE);
238238
}
239-
markdownWriter.write(getCorrectMarkdownString(GeneratorUtils.getTextFromLines(item.getLines(), strikethroughTextSyntax, strikethroughTextSyntax)));
239+
markdownWriter.write(getCorrectMarkdownString(GeneratorUtils.getTextFromLines(item.getLines(), OutputType.MD)));
240240
writeLineBreak();
241241

242242
List<IObject> itemContents = item.getContents();
@@ -248,7 +248,7 @@ protected void writeList(PDFList list) throws IOException {
248248
}
249249

250250
protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOException {
251-
String value = GeneratorUtils.getTextFromTextNode(textNode, strikethroughTextSyntax, strikethroughTextSyntax);
251+
String value = GeneratorUtils.getTextFromTextNode(textNode, OutputType.MD);
252252
if (StaticContainers.isKeepLineBreaks()) {
253253
if (textNode instanceof SemanticHeading) {
254254
value = value.replace(MarkdownSyntax.LINE_BREAK, MarkdownSyntax.SPACE);

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/GeneratorUtils.java

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,35 +10,69 @@
1010
import java.util.List;
1111

1212
public class GeneratorUtils {
13-
public static String getTextFromTextNode(SemanticTextNode textNode, String strikethroughTextOpening, String strikethroughTextClosing) {
13+
protected static final String strikethroughTextMD = "~~";
14+
protected static final String strikethroughTextHtmlOpeningTag = "<del>";
15+
protected static final String strikethroughTextHtmlClosingTag = "</del>";;
16+
17+
public static String getTextFromTextNode(SemanticTextNode textNode, OutputType outputType) {
1418
StringBuilder stringBuilder = new StringBuilder();
1519
for (TextColumn column : textNode.getColumns()) {
16-
for (TextBlock block : column.getBlocks()) {
17-
stringBuilder.append(getTextFromLines(block.getLines(), strikethroughTextOpening, strikethroughTextClosing));
20+
List<TextBlock> blocks = column.getBlocks();
21+
for (int i = 0; i < blocks.size() - 1; i++) {
22+
TextBlock block = blocks.get(i);
23+
stringBuilder.append(getTextFromLines(block.getLines(), outputType));
24+
TextChunkUtils.formatLineEnd(stringBuilder);
1825
}
26+
stringBuilder.append(getTextFromLines(blocks.get(blocks.size() - 1).getLines(), outputType));
1927
}
2028
return stringBuilder.toString();
2129
}
2230

23-
public static String getTextFromLines(List<TextLine> textLines, String strikethroughTextOpening, String strikethroughTextClosing) {
31+
public static String getTextFromLines(List<TextLine> textLines, OutputType outputType) {
2432
StringBuilder stringBuilder = new StringBuilder();
2533
for (int i = 0; i < textLines.size() - 1; i++) {
2634
TextLine line = textLines.get(i);
27-
getTextFromLine(line, stringBuilder, strikethroughTextOpening, strikethroughTextClosing);
35+
switch (outputType) {
36+
case MD:
37+
getTextFromLineForMarkdown(line, stringBuilder);
38+
break;
39+
case HTML:
40+
getTextFromLineForHTML(line, stringBuilder);
41+
break;
42+
}
2843
TextChunkUtils.formatLineEnd(stringBuilder);
2944
}
30-
getTextFromLine(textLines.get(textLines.size() - 1), stringBuilder, strikethroughTextOpening, strikethroughTextClosing);
45+
switch (outputType) {
46+
case MD:
47+
getTextFromLineForMarkdown(textLines.get(textLines.size() - 1), stringBuilder);
48+
break;
49+
case HTML:
50+
getTextFromLineForHTML(textLines.get(textLines.size() - 1), stringBuilder);
51+
break;
52+
}
3153
return stringBuilder.toString();
3254
}
3355

34-
public static void getTextFromLine(TextLine line, StringBuilder stringBuilder, String strikethroughTextOpening, String strikethroughTextClosing) {
56+
public static void getTextFromLineForMarkdown(TextLine line, StringBuilder stringBuilder) {
57+
for (TextChunk chunk : line.getTextChunks()) {
58+
if (chunk.getIsStrikethroughText()) {
59+
stringBuilder.append(strikethroughTextMD);
60+
}
61+
stringBuilder.append(chunk.getValue());
62+
if (chunk.getIsStrikethroughText()) {
63+
stringBuilder.append(strikethroughTextMD);
64+
}
65+
}
66+
}
67+
68+
public static void getTextFromLineForHTML(TextLine line, StringBuilder stringBuilder) {
3569
for (TextChunk chunk : line.getTextChunks()) {
3670
if (chunk.getIsStrikethroughText()) {
37-
stringBuilder.append(strikethroughTextOpening);
71+
stringBuilder.append(strikethroughTextHtmlOpeningTag);
3872
}
3973
stringBuilder.append(chunk.getValue());
4074
if (chunk.getIsStrikethroughText()) {
41-
stringBuilder.append(strikethroughTextClosing);
75+
stringBuilder.append(strikethroughTextHtmlClosingTag);
4276
}
4377
}
4478
}
java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/utils/OutputType.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package org.opendataloader.pdf.utils;
2+
3+
/**
 * Output formats that text-generation helpers can be asked to render for.
 *
 * <p>{@code GeneratorUtils} selects its per-line renderer from this value; the HTML and
 * Markdown generators pass {@link #HTML} and {@link #MD} respectively.
 * NOTE(review): the remaining constants presumably name the other supported output
 * formats — confirm against their callers.
 */
public enum OutputType {
    TXT, MD, HTML, JSON, PDF
}

0 commit comments

Comments
 (0)