Skip to content

Commit c0bc6e2

Browse files
committed
Ready for PR
1 parent f348278 commit c0bc6e2

7 files changed

Lines changed: 263 additions & 31 deletions

File tree

contrib/format-pdf/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,25 @@ provided schema.
1010
The PDF reader reads tables from PDF files on each page. If your PDF file has tables that span multiple pages, you can set the `combinePages` parameter to `true` and Drill
1111
will merge all the tables in the PDF file. You can also do this at query time with the `table()` function.
1212

13+
## Configuration
14+
To configure the PDF reader, add the information below to the `formats` section of a file-based storage plugin.
15+
16+
```json
17+
"pdf": {
18+
"type": "pdf",
19+
"extensions": [
20+
"pdf"
21+
],
22+
"extractHeaders": true,
23+
"combinePages": false
24+
}
25+
```
26+
The available options are:
27+
* `extractHeaders`: Extracts the first row of any tables as the header row. If set to `false`, Drill will name the columns `field_0`, `field_1`, and so on.
28+
* `combinePages`: Merges multipage tables together.
29+
* `defaultTableIndex`: Allows you to query different tables within the PDF file. Index starts at `0`.
30+
31+
1332
## Accessing Document Metadata Fields
1433
PDF files have a considerable amount of metadata which can be useful for analysis. Drill will extract the following fields from every PDF file. Note that these fields are not
1534
projected in star queries and must be selected explicitly. The document's creator populates these fields and some or all may be empty. With the exception of `_page_count
@@ -35,3 +54,14 @@ _keywords, _creator, _producer, _creation_date,
3554
_modification_date, _trapped
3655
FROM dfs.`pdf/20.pdf`
3756
```
57+
The query below demonstrates how to define a schema at query time:
58+
59+
```sql
60+
SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true,
61+
schema => 'inline=(`Last Name` VARCHAR, `First Name Address` VARCHAR,
62+
`field_0` VARCHAR, `City` VARCHAR, `State` VARCHAR, `Zip` VARCHAR,
63+
`field_1` VARCHAR, `Occupation Employer` VARCHAR,
64+
`Date` VARCHAR, `field_2` DATE properties {`drill.format` = `M/d/yyyy`},
65+
`Amount` DOUBLE)'))
66+
LIMIT 5
67+
```

contrib/format-pdf/pom.xml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
limitations under the License.
1919
2020
-->
21-
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
21+
<project xmlns="http://maven.apache.org/POM/4.0.0"
22+
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
23+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2224
<modelVersion>4.0.0</modelVersion>
2325

2426
<parent>
@@ -39,7 +41,7 @@
3941
<dependency>
4042
<groupId>technology.tabula</groupId>
4143
<artifactId>tabula</artifactId>
42-
<version>1.0.4</version>
44+
<version>1.0.5</version>
4345
<exclusions>
4446
<exclusion>
4547
<artifactId>slf4j-simple</artifactId>

contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java

Lines changed: 115 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,15 @@
4545
import technology.tabula.Table;
4646

4747
import java.io.InputStream;
48+
import java.text.ParseException;
49+
import java.text.SimpleDateFormat;
4850
import java.time.Instant;
51+
import java.time.LocalDate;
52+
import java.time.LocalTime;
53+
import java.time.format.DateTimeFormatter;
4954
import java.util.ArrayList;
5055
import java.util.Calendar;
56+
import java.util.Date;
5157
import java.util.List;
5258

5359
public class PdfBatchReader implements ManagedReader<FileScanFramework.FileSchemaNegotiator> {
@@ -68,7 +74,9 @@ public class PdfBatchReader implements ManagedReader<FileScanFramework.FileSchem
6874
private List<String> columnHeaders;
6975
private int currentRowIndex;
7076
private Table currentTable;
71-
77+
private int currentTableIndex;
78+
private int startingTableIndex;
79+
private FileScanFramework.FileSchemaNegotiator negotiator;
7280

7381
// Document Metadata Fields
7482
private int pageCount;
@@ -86,7 +94,7 @@ public class PdfBatchReader implements ManagedReader<FileScanFramework.FileSchem
8694
private int tableCount;
8795
private int rows;
8896
private int metadataIndex;
89-
private int currentTableIndex;
97+
9098

9199
// Tables
92100
private List<Table> tables;
@@ -103,20 +111,23 @@ static class PdfReaderConfig {
103111
public PdfBatchReader(PdfReaderConfig readerConfig, int maxRecords) {
104112
this.maxRecords = maxRecords;
105113
this.unregisteredColumnCount = 0;
106-
this.currentTableIndex = 0;
107114
this.writers = new ArrayList<>();
108115
this.config = readerConfig;
116+
this.startingTableIndex = readerConfig.plugin.getConfig().getDefaultTableIndex() < 0 ? 0 : readerConfig.plugin.getConfig().getDefaultTableIndex();
117+
this.currentTableIndex = this.startingTableIndex;
118+
this.columnHeaders = new ArrayList<>();
109119
}
110120

111121
@Override
112122
public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
113123
System.setProperty("java.awt.headless", "true");
124+
this.negotiator = negotiator;
114125

115126
split = negotiator.split();
116127
errorContext = negotiator.parentErrorContext();
117128
builder = new SchemaBuilder();
118129

119-
openFile(negotiator);
130+
openFile();
120131

121132
// Get the tables
122133
tables = Utils.extractTablesFromPDF(document);
@@ -125,13 +136,13 @@ public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
125136

126137
// Support provided schema
127138
TupleMetadata schema = null;
128-
if (negotiator.hasProvidedSchema()) {
129-
schema = negotiator.providedSchema();
130-
negotiator.tableSchema(schema, false);
139+
if (this.negotiator.hasProvidedSchema()) {
140+
schema = this.negotiator.providedSchema();
141+
this.negotiator.tableSchema(schema, false);
131142
} else {
132-
negotiator.tableSchema(buildSchema(), false);
143+
this.negotiator.tableSchema(buildSchema(), false);
133144
}
134-
ResultSetLoader loader = negotiator.build();
145+
ResultSetLoader loader = this.negotiator.build();
135146
rowWriter = loader.writer();
136147

137148
if (negotiator.hasProvidedSchema()) {
@@ -143,7 +154,7 @@ public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
143154

144155
// Prepare for reading
145156
currentRowIndex = 1; // Skip the first line if there are headers
146-
currentTable = tables.get(0);
157+
currentTable = tables.get(startingTableIndex);
147158

148159
return true;
149160
}
@@ -159,7 +170,6 @@ public boolean next() {
159170
} else if (currentRowIndex >= currentTable.getRows().size() &&
160171
currentTableIndex < tables.size() &&
161172
config.plugin.getConfig().getCombinePages()) {
162-
logger.debug("Merging table {} with current table.", currentTableIndex);
163173
currentRowIndex = 0;
164174
currentTable = tables.get(currentTableIndex++);
165175
} else if (currentRowIndex >= currentTable.getRows().size()) {
@@ -174,6 +184,10 @@ public boolean next() {
174184
}
175185

176186
private void processRow(List<RectangularTextContainer> row) {
187+
if (row == null || row.size() == 0) {
188+
return;
189+
}
190+
177191
String value;
178192
rowWriter.start();
179193
for (int i = 0; i < row.size(); i++) {
@@ -204,9 +218,8 @@ public void close() {
204218

205219
/**
206220
* This method opens the PDF file, and finds the tables
207-
* @param negotiator The Drill file negotiator object that represents the file system
208221
*/
209-
private void openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
222+
private void openFile() {
210223
try {
211224
fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
212225
document = PDDocument.load(fsStream);
@@ -321,7 +334,7 @@ private void addUnknownColumnToSchemaAndCreateWriter (TupleWriter rowWriter, Str
321334
}
322335

323336
private TupleMetadata buildSchema() {
324-
Table table = tables.get(0);
337+
Table table = tables.get(startingTableIndex);
325338
columns = table.getColCount();
326339
rows = table.getRowCount();
327340

@@ -376,9 +389,18 @@ private void buildWriterListFromProvidedSchema(TupleMetadata schema) {
376389
case FLOAT8:
377390
writers.add(new DoublePdfColumnWriter(counter, fieldName, rowWriter));
378391
break;
392+
case DATE:
393+
writers.add(new DatePdfColumnWriter(counter, fieldName, rowWriter, negotiator));
394+
break;
395+
case TIME:
396+
writers.add(new TimePdfColumnWriter(counter, fieldName, rowWriter, negotiator));
397+
break;
398+
case TIMESTAMP:
399+
writers.add(new TimestampPdfColumnWriter(counter, fieldName, rowWriter, negotiator));
400+
break;
379401
default:
380402
throw UserException.unsupportedError()
381-
.message("PDF Reader with Provided Schema only supports String, and Numeric Types")
403+
.message("PDF Reader with provided schema does not support " + type.name() + " data type.")
382404
.addContext(errorContext)
383405
.build(logger);
384406
}
@@ -446,4 +468,82 @@ public void load(RectangularTextContainer<?> cell) {
446468
writer.setString(cell.getText());
447469
}
448470
}
471+
472+
public static class DatePdfColumnWriter extends PdfColumnWriter {
473+
private String dateFormat;
474+
475+
DatePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
476+
super(columnIndex, columnName, rowWriter.scalar(columnName));
477+
478+
ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
479+
if (metadata != null) {
480+
this.dateFormat = metadata.property("drill.format");
481+
}
482+
}
483+
484+
@Override
485+
public void load(RectangularTextContainer<?> cell) {
486+
LocalDate localDate;
487+
if (Strings.isNullOrEmpty(this.dateFormat)) {
488+
localDate = LocalDate.parse(cell.getText());
489+
} else {
490+
localDate = LocalDate.parse(cell.getText(), DateTimeFormatter.ofPattern(dateFormat));
491+
}
492+
writer.setDate(localDate);
493+
}
494+
}
495+
496+
public static class TimePdfColumnWriter extends PdfColumnWriter {
497+
private String dateFormat;
498+
499+
TimePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
500+
super(columnIndex, columnName, rowWriter.scalar(columnName));
501+
502+
ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
503+
if (metadata != null) {
504+
this.dateFormat = metadata.property("drill.format");
505+
}
506+
}
507+
508+
@Override
509+
public void load(RectangularTextContainer<?> cell) {
510+
LocalTime localTime;
511+
if (Strings.isNullOrEmpty(this.dateFormat)) {
512+
localTime = LocalTime.parse(cell.getText());
513+
} else {
514+
localTime = LocalTime.parse(cell.getText(), DateTimeFormatter.ofPattern(dateFormat));
515+
}
516+
writer.setTime(localTime);
517+
}
518+
}
519+
520+
public static class TimestampPdfColumnWriter extends PdfColumnWriter {
521+
private String dateFormat;
522+
523+
TimestampPdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
524+
super(columnIndex, columnName, rowWriter.scalar(columnName));
525+
526+
ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
527+
if (metadata != null) {
528+
this.dateFormat = metadata.property("drill.format");
529+
}
530+
}
531+
532+
@Override
533+
public void load(RectangularTextContainer<?> cell) {
534+
Instant timestamp = null;
535+
if (Strings.isNullOrEmpty(this.dateFormat)) {
536+
timestamp = Instant.parse(cell.getText());
537+
} else {
538+
try {
539+
SimpleDateFormat simpleDateFormat = new SimpleDateFormat(dateFormat);
540+
Date parsedDate = simpleDateFormat.parse(cell.getText());
541+
timestamp = Instant.ofEpochMilli(parsedDate.getTime());
542+
} catch (ParseException e) {
543+
logger.error("Error parsing timestamp: " + e.getMessage());
544+
}
545+
}
546+
writer.setTimestamp(timestamp);
547+
}
548+
}
449549
}

contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatConfig.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* See the License for the specific language governing permissions and
1616
* limitations under the License.
1717
*/
18+
1819
package org.apache.drill.exec.store.pdf;
1920

2021
import com.fasterxml.jackson.annotation.JsonCreator;
@@ -80,6 +81,9 @@ public String getExtractionAlgorithm() {
8081
return extractionAlgorithm;
8182
}
8283

84+
@JsonProperty("defaultTableIndex")
85+
public int getDefaultTableIndex() { return defaultTableIndex; }
86+
8387
@Override
8488
public int hashCode() {
8589
return Objects.hash(extensions, extractHeaders, extractionAlgorithm, combinePages, defaultTableIndex);

contrib/format-pdf/src/main/resources/bootstrap-format-plugins.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
"pdf"
1010
],
1111
"extractHeaders": true,
12-
"combinePages": true
12+
"combinePages": false
1313
}
1414
}
1515
},
@@ -22,7 +22,7 @@
2222
"pdf"
2323
],
2424
"extractHeaders": true,
25-
"combinePages": true
25+
"combinePages": false
2626
}
2727
}
2828
},
@@ -35,7 +35,7 @@
3535
"pdf"
3636
],
3737
"extractHeaders": true,
38-
"combinePages": true
38+
"combinePages": false
3939
}
4040
}
4141
}

0 commit comments

Comments
 (0)