Ready for PR

cgivre · cgivre · commit c0bc6e27a2e5 · 2021-12-09T19:33:51.000-05:00
diff --git a/contrib/format-pdf/README.md b/contrib/format-pdf/README.md
@@ -10,6 +10,25 @@ provided schema.
 The PDF reader reads tables from PDF files on each page.  If your PDF file has tables that span multiple pages, you can set the `combinePages` parameter to `true` and Drill 
 will merge all the tables in the PDF file.  You can also do this at query time with the `table()` function.
 
+## Configuration
+To configure the PDF reader, add the information below to the `formats` section of a file-based storage plugin.
+
+```json
+"pdf": {
+  "type": "pdf",
+  "extensions": [
+    "pdf"
+  ],
+  "extractHeaders": true,
+  "combinePages": false
+}
+```
+The available options are:
+* `extractHeaders`: Extracts the first row of any tables as the header row.  If set to false, Drill will assign column names of `field_0`, `field_1` to each column.
+* `combinePages`: Merges multipage tables together.
+* `defaultTableIndex`:  Allows you to query different tables within the PDF file. Index starts at `0`. 
+
+
 ## Accessing Document Metadata Fields
 PDF files have a considerable amount of metadata which can be useful for analysis.  Drill will extract the following fields from every PDF file.  Note that these fields are not
  projected in star queries and must be selected explicitly.  The document's creator populates these fields and some or all may be empty. With the exception of `_page_count
@@ -35,3 +54,14 @@ _keywords, _creator, _producer, _creation_date,
 _modification_date, _trapped 
 FROM dfs.`pdf/20.pdf`
 ```
+The query below demonstrates how to define a schema at query time:
+
+```sql
+SELECT * FROM table(cp.`pdf/schools.pdf` (type => 'pdf', combinePages => true, 
+schema => 'inline=(`Last Name` VARCHAR, `First Name Address` VARCHAR, 
+`field_0` VARCHAR, `City` VARCHAR, `State` VARCHAR, `Zip` VARCHAR, 
+`field_1` VARCHAR, `Occupation Employer` VARCHAR, 
+`Date` VARCHAR, `field_2` DATE properties {`drill.format` = `M/d/yyyy`}, 
+`Amount` DOUBLE)')) 
+LIMIT 5
+```
diff --git a/contrib/format-pdf/pom.xml b/contrib/format-pdf/pom.xml
@@ -18,7 +18,9 @@
     limitations under the License.
 
 -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
 
   <parent>
@@ -39,7 +41,7 @@
     <dependency>
       <groupId>technology.tabula</groupId>
       <artifactId>tabula</artifactId>
-      <version>1.0.4</version>
+      <version>1.0.5</version>
       <exclusions>
         <exclusion>
           <artifactId>slf4j-simple</artifactId>
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
@@ -45,9 +45,15 @@
 import technology.tabula.Table;
 
 import java.io.InputStream;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.time.Instant;
+import java.time.LocalDate;
+import java.time.LocalTime;
+import java.time.format.DateTimeFormatter;
 import java.util.ArrayList;
 import java.util.Calendar;
+import java.util.Date;
 import java.util.List;
 
 public class PdfBatchReader implements ManagedReader<FileScanFramework.FileSchemaNegotiator> {
@@ -68,7 +74,9 @@ public class PdfBatchReader implements ManagedReader<FileScanFramework.FileSchem
   private List<String> columnHeaders;
   private int currentRowIndex;
   private Table currentTable;
-
+  private int currentTableIndex;
+  private int startingTableIndex;
+  private FileScanFramework.FileSchemaNegotiator negotiator;
 
   // Document Metadata Fields
   private int pageCount;
@@ -86,7 +94,7 @@ public class PdfBatchReader implements ManagedReader<FileScanFramework.FileSchem
   private int tableCount;
   private int rows;
   private int metadataIndex;
-  private int currentTableIndex;
+
 
   // Tables
   private List<Table> tables;
@@ -103,20 +111,23 @@ static class PdfReaderConfig {
   public PdfBatchReader(PdfReaderConfig readerConfig, int maxRecords) {
     this.maxRecords = maxRecords;
     this.unregisteredColumnCount = 0;
-    this.currentTableIndex = 0;
     this.writers = new ArrayList<>();
     this.config = readerConfig;
+    this.startingTableIndex = readerConfig.plugin.getConfig().getDefaultTableIndex() < 0 ? 0 : readerConfig.plugin.getConfig().getDefaultTableIndex();
+    this.currentTableIndex = this.startingTableIndex;
+    this.columnHeaders = new ArrayList<>();
   }
 
   @Override
   public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
     System.setProperty("java.awt.headless", "true");
+    this.negotiator = negotiator;
 
     split = negotiator.split();
     errorContext = negotiator.parentErrorContext();
     builder = new SchemaBuilder();
 
-    openFile(negotiator);
+    openFile();
 
     // Get the tables
     tables = Utils.extractTablesFromPDF(document);
@@ -125,13 +136,13 @@ public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
 
     // Support provided schema
     TupleMetadata schema = null;
-    if (negotiator.hasProvidedSchema()) {
-      schema = negotiator.providedSchema();
-      negotiator.tableSchema(schema, false);
+    if (this.negotiator.hasProvidedSchema()) {
+      schema = this.negotiator.providedSchema();
+      this.negotiator.tableSchema(schema, false);
     } else {
-      negotiator.tableSchema(buildSchema(), false);
+      this.negotiator.tableSchema(buildSchema(), false);
     }
-    ResultSetLoader loader = negotiator.build();
+    ResultSetLoader loader = this.negotiator.build();
     rowWriter = loader.writer();
 
     if (negotiator.hasProvidedSchema()) {
@@ -143,7 +154,7 @@ public boolean open(FileScanFramework.FileSchemaNegotiator negotiator) {
 
     // Prepare for reading
     currentRowIndex = 1;  // Skip the first line if there are headers
-    currentTable = tables.get(0);
+    currentTable = tables.get(startingTableIndex);
 
     return true;
   }
@@ -159,7 +170,6 @@ public boolean next() {
       } else if (currentRowIndex >= currentTable.getRows().size() &&
                   currentTableIndex < tables.size() &&
                   config.plugin.getConfig().getCombinePages()) {
-        logger.debug("Merging table {} with current table.", currentTableIndex);
         currentRowIndex = 0;
         currentTable = tables.get(currentTableIndex++);
       } else if (currentRowIndex >= currentTable.getRows().size()) {
@@ -174,6 +184,10 @@ public boolean next() {
   }
 
   private void processRow(List<RectangularTextContainer> row) {
+    if (row == null || row.size() == 0) {
+      return;
+    }
+
     String value;
     rowWriter.start();
     for (int i = 0; i < row.size(); i++) {
@@ -204,9 +218,8 @@ public void close() {
 
   /**
    * This method opens the PDF file, and finds the tables
-   * @param negotiator The Drill file negotiator object that represents the file system
    */
-  private void openFile(FileScanFramework.FileSchemaNegotiator negotiator) {
+  private void openFile() {
     try {
       fsStream = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
       document = PDDocument.load(fsStream);
@@ -321,7 +334,7 @@ private void addUnknownColumnToSchemaAndCreateWriter (TupleWriter rowWriter, Str
   }
 
   private TupleMetadata buildSchema() {
-    Table table = tables.get(0);
+    Table table = tables.get(startingTableIndex);
     columns = table.getColCount();
     rows = table.getRowCount();
 
@@ -376,9 +389,18 @@ private void buildWriterListFromProvidedSchema(TupleMetadata schema) {
         case FLOAT8:
           writers.add(new DoublePdfColumnWriter(counter, fieldName, rowWriter));
           break;
+        case DATE:
+          writers.add(new DatePdfColumnWriter(counter, fieldName, rowWriter, negotiator));
+          break;
+        case TIME:
+          writers.add(new TimePdfColumnWriter(counter, fieldName, rowWriter, negotiator));
+          break;
+        case TIMESTAMP:
+          writers.add(new TimestampPdfColumnWriter(counter, fieldName, rowWriter, negotiator));
+          break;
         default:
           throw UserException.unsupportedError()
-            .message("PDF Reader with Provided Schema only supports String, and Numeric Types")
+            .message("PDF Reader with provided schema does not support " + type.name() + " data type.")
             .addContext(errorContext)
             .build(logger);
       }
@@ -446,4 +468,82 @@ public void load(RectangularTextContainer<?> cell) {
       writer.setString(cell.getText());
     }
   }
+
+  /**
+   * Column writer for DATE columns declared in a provided schema.
+   * <p>
+   * If the column carries a {@code drill.format} property, that pattern is compiled
+   * once at construction time and used to parse every cell. Otherwise cells are parsed
+   * with {@link LocalDate#parse(CharSequence)}.
+   */
+  public static class DatePdfColumnWriter extends PdfColumnWriter {
+    // Null when the provided schema supplies no drill.format property for this column.
+    private final DateTimeFormatter formatter;
+
+    DatePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
+      super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+      ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
+      String dateFormat = metadata == null ? null : metadata.property("drill.format");
+      // Compile the pattern once here rather than once per cell in load().
+      // An invalid pattern therefore fails when the writer is created, not on the first row.
+      this.formatter = Strings.isNullOrEmpty(dateFormat) ? null : DateTimeFormatter.ofPattern(dateFormat);
+    }
+
+    /**
+     * Parses the cell text as a date and writes it to the column.
+     *
+     * @param cell the table cell whose text is parsed
+     */
+    @Override
+    public void load(RectangularTextContainer<?> cell) {
+      String text = cell.getText();
+      LocalDate localDate = formatter == null
+        ? LocalDate.parse(text)
+        : LocalDate.parse(text, formatter);
+      writer.setDate(localDate);
+    }
+  }
+
+  /**
+   * Column writer for TIME columns declared in a provided schema.
+   * <p>
+   * If the column carries a {@code drill.format} property, that pattern is compiled
+   * once at construction time and used to parse every cell. Otherwise cells are parsed
+   * with {@link LocalTime#parse(CharSequence)}.
+   */
+  public static class TimePdfColumnWriter extends PdfColumnWriter {
+    // Null when the provided schema supplies no drill.format property for this column.
+    private final DateTimeFormatter formatter;
+
+    TimePdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
+      super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+      ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
+      String timeFormat = metadata == null ? null : metadata.property("drill.format");
+      // Compile the pattern once here rather than once per cell in load().
+      // An invalid pattern therefore fails when the writer is created, not on the first row.
+      this.formatter = Strings.isNullOrEmpty(timeFormat) ? null : DateTimeFormatter.ofPattern(timeFormat);
+    }
+
+    /**
+     * Parses the cell text as a time and writes it to the column.
+     *
+     * @param cell the table cell whose text is parsed
+     */
+    @Override
+    public void load(RectangularTextContainer<?> cell) {
+      String text = cell.getText();
+      LocalTime localTime = formatter == null
+        ? LocalTime.parse(text)
+        : LocalTime.parse(text, formatter);
+      writer.setTime(localTime);
+    }
+  }
+
+  public static class TimestampPdfColumnWriter extends PdfColumnWriter {
+    private String dateFormat;
+
+    TimestampPdfColumnWriter (int columnIndex, String columnName, RowSetLoader rowWriter, FileScanFramework.FileSchemaNegotiator negotiator) {
+      super(columnIndex, columnName, rowWriter.scalar(columnName));
+
+      ColumnMetadata metadata = negotiator.providedSchema().metadata(columnName);
+      if (metadata != null) {
+        this.dateFormat = metadata.property("drill.format");
+      }
+    }
+
+    @Override
+    public void load(RectangularTextContainer<?> cell) {
+      Instant timestamp = null;
+      if (Strings.isNullOrEmpty(this.dateFormat)) {
+        timestamp = Instant.parse(cell.getText());
+      } else {
+        try {
+          SimpleDateFormat simpleDateFormat = new SimpleDateFormat(dateFormat);
+          Date parsedDate = simpleDateFormat.parse(cell.getText());
+          timestamp = Instant.ofEpochMilli(parsedDate.getTime());
+        } catch (ParseException e) {
+          logger.error("Error parsing timestamp: " + e.getMessage());
+        }
+      }
+      writer.setTimestamp(timestamp);
+    }
+  }
 }
diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatConfig.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfFormatConfig.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.drill.exec.store.pdf;
 
 import com.fasterxml.jackson.annotation.JsonCreator;
@@ -80,6 +81,9 @@ public String getExtractionAlgorithm() {
     return extractionAlgorithm;
   }
 
+  @JsonProperty("defaultTableIndex")
+  public int getDefaultTableIndex() { return defaultTableIndex; }
+
   @Override
   public int hashCode() {
     return Objects.hash(extensions, extractHeaders, extractionAlgorithm, combinePages, defaultTableIndex);
diff --git a/contrib/format-pdf/src/main/resources/bootstrap-format-plugins.json b/contrib/format-pdf/src/main/resources/bootstrap-format-plugins.json
@@ -9,7 +9,7 @@
             "pdf"
           ],
           "extractHeaders": true,
-          "combinePages": true
+          "combinePages": false
         }
       }
     },
@@ -22,7 +22,7 @@
             "pdf"
           ],
           "extractHeaders": true,
-          "combinePages": true
+          "combinePages": false
         }
       }
     },
@@ -35,7 +35,7 @@
             "pdf"
           ],
           "extractHeaders": true,
-          "combinePages": true
+          "combinePages": false
         }
       }
     }
diff --git a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
diff --git a/pom.xml b/pom.xml

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@`
`9`	`9`	`"pdf"`
`10`	`10`	`],`
`11`	`11`	`"extractHeaders": true,`
`12`		`- "combinePages": true`
	`12`	`+ "combinePages": false`
`13`	`13`	`}`
`14`	`14`	`}`
`15`	`15`	`},`
`@@ -22,7 +22,7 @@`
`22`	`22`	`"pdf"`
`23`	`23`	`],`
`24`	`24`	`"extractHeaders": true,`
`25`		`- "combinePages": true`
	`25`	`+ "combinePages": false`
`26`	`26`	`}`
`27`	`27`	`}`
`28`	`28`	`},`
`@@ -35,7 +35,7 @@`
`35`	`35`	`"pdf"`
`36`	`36`	`],`
`37`	`37`	`"extractHeaders": true,`
`38`		`- "combinePages": true`
	`38`	`+ "combinePages": false`
`39`	`39`	`}`
`40`	`40`	`}`
`41`	`41`	`}`