Skip to content

Commit 6c387c0

Browse files
Add GeneratorUtils class
1 parent 79372ff commit 6c387c0

3 files changed

Lines changed: 57 additions & 64 deletions

File tree

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/html/HtmlGenerator.java

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.opendataloader.pdf.entities.SemanticPicture;
2222
import org.opendataloader.pdf.markdown.MarkdownSyntax;
2323
import org.opendataloader.pdf.utils.Base64ImageUtils;
24+
import org.opendataloader.pdf.utils.GeneratorUtils;
2425
import org.opendataloader.pdf.utils.ImagesUtils;
2526
import org.verapdf.wcag.algorithms.entities.IObject;
2627
import org.verapdf.wcag.algorithms.entities.SemanticHeaderOrFooter;
@@ -34,7 +35,6 @@
3435
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderCell;
3536
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderRow;
3637
import org.verapdf.wcag.algorithms.semanticalgorithms.containers.StaticContainers;
37-
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.TextChunkUtils;
3838

3939
import java.io.Closeable;
4040
import java.io.File;
@@ -75,6 +75,8 @@ public class HtmlGenerator implements Closeable {
7575
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
7676
/** Whether to include page headers and footers in output. */
7777
protected boolean includeHeaderFooter = false;
78+
protected static final String strikethroughTextOpeningTag = "<del>";
79+
protected static final String strikethroughTextClosingTag = "</del>";
7880

7981
/**
8082
* Creates a new HtmlGenerator for the specified PDF file.
@@ -287,9 +289,8 @@ protected void writeList(PDFList list) throws IOException {
287289
htmlWriter.write(HtmlSyntax.HTML_LIST_ITEM_TAG);
288290

289291
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_TAG);
290-
StringBuilder stringBuilder = new StringBuilder();
291-
getTextFromLines(item.getLines(), stringBuilder);
292-
htmlWriter.write(getCorrectString(stringBuilder.toString()));
292+
String value = GeneratorUtils.getTextFromLines(item.getLines(), strikethroughTextOpeningTag, strikethroughTextClosingTag);
293+
htmlWriter.write(getCorrectString(value));
293294
htmlWriter.write(HtmlSyntax.HTML_PARAGRAPH_CLOSE_TAG);
294295

295296
for (IObject object : item.getContents()) {
@@ -310,40 +311,11 @@ protected void writeList(PDFList list) throws IOException {
310311
*/
311312
protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOException {
312313
htmlWriter.write(HtmlSyntax.HTML_FIGURE_CAPTION_TAG);
313-
htmlWriter.write(getCorrectString(getTextFromColumns(textNode)));
314+
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(textNode, strikethroughTextOpeningTag, strikethroughTextClosingTag)));
314315
htmlWriter.write(HtmlSyntax.HTML_FIGURE_CAPTION_CLOSE_TAG);
315316
htmlWriter.write(HtmlSyntax.HTML_LINE_BREAK);
316317
}
317318

318-
protected void getTextFromLines(List<TextLine> textLines, StringBuilder stringBuilder) {
319-
for (int i = 0; i < textLines.size() - 1; i++) {
320-
TextLine line = textLines.get(i);
321-
getTextFromLine(line, stringBuilder);
322-
TextChunkUtils.formatLineEnd(stringBuilder);
323-
}
324-
getTextFromLine(textLines.get(textLines.size() - 1), stringBuilder);
325-
}
326-
327-
protected void getTextFromLine(TextLine line, StringBuilder stringBuilder) {
328-
for (TextChunk chunk : line.getTextChunks()) {
329-
if (chunk.getIsStrikethroughText()) {
330-
stringBuilder.append("<del>").append(chunk.getValue()).append("</del>");
331-
} else {
332-
stringBuilder.append(chunk.getValue());
333-
}
334-
}
335-
}
336-
337-
protected String getTextFromColumns(SemanticTextNode node) {
338-
StringBuilder stringBuilder = new StringBuilder();
339-
for (TextColumn column : node.getColumns()) {
340-
for (TextBlock block : column.getBlocks()) {
341-
getTextFromLines(block.getLines(), stringBuilder);
342-
}
343-
}
344-
return stringBuilder.toString();
345-
}
346-
347319
/**
348320
* Writes a table element to the HTML output.
349321
*
@@ -400,7 +372,7 @@ protected void writeParagraph(SemanticParagraph paragraph) throws IOException {
400372
if (paragraphIndent > 0) {
401373
htmlWriter.write(HtmlSyntax.HTML_INDENT);
402374
}
403-
String paragraphValue = getTextFromColumns(paragraph);
375+
String paragraphValue = GeneratorUtils.getTextFromTextNode(paragraph, strikethroughTextOpeningTag, strikethroughTextClosingTag);
404376

405377
if (isInsideTable() && StaticContainers.isKeepLineBreaks()) {
406378
paragraphValue = paragraphValue.replace(HtmlSyntax.HTML_LINE_BREAK, HtmlSyntax.HTML_LINE_BREAK_TAG);
@@ -420,7 +392,7 @@ protected void writeParagraph(SemanticParagraph paragraph) throws IOException {
420392
protected void writeHeading(SemanticHeading heading) throws IOException {
421393
int headingLevel = Math.min(6, Math.max(1, heading.getHeadingLevel()));
422394
htmlWriter.write("<h" + headingLevel + ">");
423-
htmlWriter.write(getCorrectString(getTextFromColumns(heading)));
395+
htmlWriter.write(getCorrectString(GeneratorUtils.getTextFromTextNode(heading, strikethroughTextOpeningTag, strikethroughTextClosingTag)));
424396
htmlWriter.write("</h" + headingLevel + ">");
425397
htmlWriter.write(HtmlSyntax.HTML_LINE_BREAK);
426398
}

java/opendataloader-pdf-core/src/main/java/org/opendataloader/pdf/markdown/MarkdownGenerator.java

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.opendataloader.pdf.entities.SemanticFormula;
2121
import org.opendataloader.pdf.entities.SemanticPicture;
2222
import org.opendataloader.pdf.utils.Base64ImageUtils;
23+
import org.opendataloader.pdf.utils.GeneratorUtils;
2324
import org.opendataloader.pdf.utils.ImagesUtils;
2425
import org.verapdf.wcag.algorithms.entities.IObject;
2526
import org.verapdf.wcag.algorithms.entities.SemanticHeaderOrFooter;
@@ -33,7 +34,6 @@
3334
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderCell;
3435
import org.verapdf.wcag.algorithms.entities.tables.tableBorders.TableBorderRow;
3536
import org.verapdf.wcag.algorithms.semanticalgorithms.containers.StaticContainers;
36-
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.TextChunkUtils;
3737

3838
import java.io.Closeable;
3939
import java.io.File;
@@ -55,6 +55,7 @@ public class MarkdownGenerator implements Closeable {
5555
protected boolean embedImages = false;
5656
protected String imageFormat = Config.IMAGE_FORMAT_PNG;
5757
protected boolean includeHeaderFooter = false;
58+
protected static final String strikethroughTextSyntax = "~~";
5859

5960
MarkdownGenerator(File inputPdf, Config config) throws IOException {
6061
String cutPdfFileName = inputPdf.getName();
@@ -235,9 +236,7 @@ protected void writeList(PDFList list) throws IOException {
235236
markdownWriter.write(MarkdownSyntax.LIST_ITEM);
236237
markdownWriter.write(MarkdownSyntax.SPACE);
237238
}
238-
StringBuilder stringBuilder = new StringBuilder();
239-
getTextFromLines(item.getLines(), stringBuilder);
240-
markdownWriter.write(getCorrectMarkdownString(stringBuilder.toString()));
239+
markdownWriter.write(getCorrectMarkdownString(GeneratorUtils.getTextFromLines(item.getLines(), strikethroughTextSyntax, strikethroughTextSyntax)));
241240
writeLineBreak();
242241

243242
List<IObject> itemContents = item.getContents();
@@ -249,13 +248,7 @@ protected void writeList(PDFList list) throws IOException {
249248
}
250249

251250
protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOException {
252-
StringBuilder stringBuilder = new StringBuilder();
253-
for (TextColumn column : textNode.getColumns()) {
254-
for (TextBlock block : column.getBlocks()) {
255-
getTextFromLines(block.getLines(), stringBuilder);
256-
}
257-
}
258-
String value = stringBuilder.toString();
251+
String value = GeneratorUtils.getTextFromTextNode(textNode, strikethroughTextSyntax, strikethroughTextSyntax);
259252
if (StaticContainers.isKeepLineBreaks()) {
260253
if (textNode instanceof SemanticHeading) {
261254
value = value.replace(MarkdownSyntax.LINE_BREAK, MarkdownSyntax.SPACE);
@@ -270,24 +263,7 @@ protected void writeSemanticTextNode(SemanticTextNode textNode) throws IOExcepti
270263
markdownWriter.write(getCorrectMarkdownString(value));
271264
}
272265

273-
protected void getTextFromLines(List<TextLine> textLines, StringBuilder stringBuilder) {
274-
for (int i = 0; i < textLines.size() - 1; i++) {
275-
TextLine line = textLines.get(i);
276-
getTextFromLine(line, stringBuilder);
277-
TextChunkUtils.formatLineEnd(stringBuilder);
278-
}
279-
getTextFromLine(textLines.get(textLines.size() - 1), stringBuilder);
280-
}
281266

282-
protected void getTextFromLine(TextLine line, StringBuilder stringBuilder) {
283-
for (TextChunk chunk : line.getTextChunks()) {
284-
if (chunk.getIsStrikethroughText()) {
285-
stringBuilder.append("~~").append(chunk.getValue()).append("~~");
286-
} else {
287-
stringBuilder.append(chunk.getValue());
288-
}
289-
}
290-
}
291267

292268
protected void writeTable(TableBorder table) throws IOException {
293269
enterTable();
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package org.opendataloader.pdf.utils;
2+
3+
import org.verapdf.wcag.algorithms.entities.SemanticTextNode;
4+
import org.verapdf.wcag.algorithms.entities.content.TextBlock;
5+
import org.verapdf.wcag.algorithms.entities.content.TextChunk;
6+
import org.verapdf.wcag.algorithms.entities.content.TextColumn;
7+
import org.verapdf.wcag.algorithms.entities.content.TextLine;
8+
import org.verapdf.wcag.algorithms.semanticalgorithms.utils.TextChunkUtils;
9+
10+
import java.util.List;
11+
12+
public class GeneratorUtils {
13+
public static String getTextFromTextNode(SemanticTextNode textNode, String strikethroughTextOpening, String strikethroughTextClosing) {
14+
StringBuilder stringBuilder = new StringBuilder();
15+
for (TextColumn column : textNode.getColumns()) {
16+
for (TextBlock block : column.getBlocks()) {
17+
stringBuilder.append(getTextFromLines(block.getLines(), strikethroughTextOpening, strikethroughTextClosing));
18+
}
19+
}
20+
return stringBuilder.toString();
21+
}
22+
23+
public static String getTextFromLines(List<TextLine> textLines, String strikethroughTextOpening, String strikethroughTextClosing) {
24+
StringBuilder stringBuilder = new StringBuilder();
25+
for (int i = 0; i < textLines.size() - 1; i++) {
26+
TextLine line = textLines.get(i);
27+
getTextFromLine(line, stringBuilder, strikethroughTextOpening, strikethroughTextClosing);
28+
TextChunkUtils.formatLineEnd(stringBuilder);
29+
}
30+
getTextFromLine(textLines.get(textLines.size() - 1), stringBuilder, strikethroughTextOpening, strikethroughTextClosing);
31+
return stringBuilder.toString();
32+
}
33+
34+
public static void getTextFromLine(TextLine line, StringBuilder stringBuilder, String strikethroughTextOpening, String strikethroughTextClosing) {
35+
for (TextChunk chunk : line.getTextChunks()) {
36+
if (chunk.getIsStrikethroughText()) {
37+
stringBuilder.append(strikethroughTextOpening);
38+
}
39+
stringBuilder.append(chunk.getValue());
40+
if (chunk.getIsStrikethroughText()) {
41+
stringBuilder.append(strikethroughTextClosing);
42+
}
43+
}
44+
}
45+
}

0 commit comments

Comments
 (0)