19 | 19 | package org.apache.iceberg.spark.source; |
20 | 20 |
|
21 | 21 | import java.util.Map; |
22 | | -import java.util.Set; |
23 | 22 | import org.apache.iceberg.FileFormat; |
24 | | -import org.apache.iceberg.MetadataColumns; |
25 | 23 | import org.apache.iceberg.ScanTask; |
26 | 24 | import org.apache.iceberg.ScanTaskGroup; |
27 | 25 | import org.apache.iceberg.Schema; |
28 | 26 | import org.apache.iceberg.Table; |
29 | 27 | import org.apache.iceberg.expressions.Expression; |
30 | 28 | import org.apache.iceberg.io.CloseableIterable; |
31 | 29 | import org.apache.iceberg.io.InputFile; |
| 30 | +import org.apache.iceberg.io.datafile.DataFileServiceRegistry; |
| 31 | +import org.apache.iceberg.io.datafile.DeleteFilter; |
| 32 | +import org.apache.iceberg.io.datafile.ReaderBuilder; |
32 | 33 | import org.apache.iceberg.orc.ORC; |
33 | 34 | import org.apache.iceberg.parquet.Parquet; |
34 | | -import org.apache.iceberg.relocated.com.google.common.collect.Sets; |
35 | 35 | import org.apache.iceberg.spark.OrcBatchReadConf; |
36 | 36 | import org.apache.iceberg.spark.ParquetBatchReadConf; |
37 | 37 | import org.apache.iceberg.spark.ParquetReaderType; |
38 | 38 | import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; |
39 | 39 | import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; |
40 | | -import org.apache.iceberg.types.TypeUtil; |
| 40 | +import org.apache.spark.sql.catalyst.InternalRow; |
41 | 41 | import org.apache.spark.sql.vectorized.ColumnarBatch; |
42 | 42 |
|
43 | 43 | abstract class BaseBatchReader<T extends ScanTask> extends BaseReader<ColumnarBatch, T> { |
@@ -65,76 +65,105 @@ protected CloseableIterable<ColumnarBatch> newBatchIterable( |
65 | 65 | Expression residual, |
66 | 66 | Map<Integer, ?> idToConstant, |
67 | 67 | SparkDeleteFilter deleteFilter) { |
68 | | - switch (format) { |
69 | | - case PARQUET: |
70 | | - return newParquetIterable(inputFile, start, length, residual, idToConstant, deleteFilter); |
| 68 | + ReaderBuilder readerBuilder = |
| 69 | + DataFileServiceRegistry.read( |
| 70 | + format, |
| 71 | + ColumnarBatch.class.getName(), |
| 72 | + parquetConf != null ? parquetConf.readerType().name() : null, |
| 73 | + inputFile, |
| 74 | + expectedSchema(), |
| 75 | + idToConstant, |
| 76 | + deleteFilter) |
| 77 | + .split(start, length) |
| 78 | + .filter(residual) |
| 79 | + .caseSensitive(caseSensitive()) |
| 80 | + // Spark eagerly consumes the batches. So the underlying memory |
| 81 | + // allocated could be reused without worrying about subsequent |
| 82 | + // reads clobbering over each other. This improves read performance |
| 83 | + // as every batch read doesn't have to pay the cost of allocating |
| 84 | + // memory. |
| 85 | + .reuseContainers() |
| 86 | + .withNameMapping(nameMapping()); |
| 87 | + if (parquetConf != null) { |
| 88 | + readerBuilder = readerBuilder.recordsPerBatch(parquetConf.batchSize()); |
| 89 | + } else if (orcConf != null) { |
| 90 | + readerBuilder = readerBuilder.recordsPerBatch(orcConf.batchSize()); |
| 91 | + } |
71 | 92 |
|
72 | | - case ORC: |
73 | | - return newOrcIterable(inputFile, start, length, residual, idToConstant); |
| 93 | + return readerBuilder.build(); |
| 94 | + } |
74 | 95 |
|
75 | | - default: |
76 | | - throw new UnsupportedOperationException( |
77 | | - "Format: " + format + " not supported for batched reads"); |
| 96 | + public static class IcebergParquetReaderService implements DataFileServiceRegistry.ReaderService { |
| 97 | + @Override |
| 98 | + public DataFileServiceRegistry.Key key() { |
| 99 | + return new DataFileServiceRegistry.Key( |
| 100 | + FileFormat.PARQUET, ColumnarBatch.class.getName(), ParquetReaderType.ICEBERG.name()); |
| 101 | + } |
| 102 | + |
| 103 | + @Override |
| 104 | + public ReaderBuilder builder( |
| 105 | + InputFile inputFile, |
| 106 | + Schema readSchema, |
| 107 | + Map<Integer, ?> idToConstant, |
| 108 | + DeleteFilter<?> deleteFilter) { |
| 109 | + // get required schema if there are deletes |
| 110 | + Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : readSchema; |
| 111 | + return Parquet.read(inputFile) |
| 112 | + .project(requiredSchema) |
| 113 | + .createBatchedReaderFunc( |
| 114 | + fileSchema -> |
| 115 | + VectorizedSparkParquetReaders.buildReader( |
| 116 | + requiredSchema, |
| 117 | + fileSchema, |
| 118 | + idToConstant, |
| 119 | + (DeleteFilter<InternalRow>) deleteFilter)); |
78 | 120 | } |
79 | 121 | } |
80 | 122 |
|
81 | | - private CloseableIterable<ColumnarBatch> newParquetIterable( |
82 | | - InputFile inputFile, |
83 | | - long start, |
84 | | - long length, |
85 | | - Expression residual, |
86 | | - Map<Integer, ?> idToConstant, |
87 | | - SparkDeleteFilter deleteFilter) { |
88 | | - // get required schema if there are deletes |
89 | | - Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : expectedSchema(); |
| 123 | + public static class CometParquetReaderService implements DataFileServiceRegistry.ReaderService { |
| 124 | + @Override |
| 125 | + public DataFileServiceRegistry.Key key() { |
| 126 | + return new DataFileServiceRegistry.Key( |
| 127 | + FileFormat.PARQUET, ColumnarBatch.class.getName(), ParquetReaderType.COMET.name()); |
| 128 | + } |
90 | 129 |
|
91 | | - return Parquet.read(inputFile) |
92 | | - .project(requiredSchema) |
93 | | - .split(start, length) |
94 | | - .createBatchedReaderFunc( |
95 | | - fileSchema -> { |
96 | | - if (parquetConf.readerType() == ParquetReaderType.COMET) { |
97 | | - return VectorizedSparkParquetReaders.buildCometReader( |
98 | | - requiredSchema, fileSchema, idToConstant, deleteFilter); |
99 | | - } else { |
100 | | - return VectorizedSparkParquetReaders.buildReader( |
101 | | - requiredSchema, fileSchema, idToConstant, deleteFilter); |
102 | | - } |
103 | | - }) |
104 | | - .recordsPerBatch(parquetConf.batchSize()) |
105 | | - .filter(residual) |
106 | | - .caseSensitive(caseSensitive()) |
107 | | - // Spark eagerly consumes the batches. So the underlying memory allocated could be reused |
108 | | - // without worrying about subsequent reads clobbering over each other. This improves |
109 | | - // read performance as every batch read doesn't have to pay the cost of allocating memory. |
110 | | - .reuseContainers() |
111 | | - .withNameMapping(nameMapping()) |
112 | | - .build(); |
| 130 | + @Override |
| 131 | + public ReaderBuilder builder( |
| 132 | + InputFile inputFile, |
| 133 | + Schema readSchema, |
| 134 | + Map<Integer, ?> idToConstant, |
| 135 | + DeleteFilter<?> deleteFilter) { |
| 136 | + // get required schema if there are deletes |
| 137 | + Schema requiredSchema = deleteFilter != null ? deleteFilter.requiredSchema() : readSchema; |
| 138 | + return Parquet.read(inputFile) |
| 139 | + .project(requiredSchema) |
| 140 | + .createBatchedReaderFunc( |
| 141 | + fileSchema -> |
| 142 | + VectorizedSparkParquetReaders.buildCometReader( |
| 143 | + requiredSchema, |
| 144 | + fileSchema, |
| 145 | + idToConstant, |
| 146 | + (DeleteFilter<InternalRow>) deleteFilter)); |
| 147 | + } |
113 | 148 | } |
114 | 149 |
|
115 | | - private CloseableIterable<ColumnarBatch> newOrcIterable( |
116 | | - InputFile inputFile, |
117 | | - long start, |
118 | | - long length, |
119 | | - Expression residual, |
120 | | - Map<Integer, ?> idToConstant) { |
121 | | - Set<Integer> constantFieldIds = idToConstant.keySet(); |
122 | | - Set<Integer> metadataFieldIds = MetadataColumns.metadataFieldIds(); |
123 | | - Sets.SetView<Integer> constantAndMetadataFieldIds = |
124 | | - Sets.union(constantFieldIds, metadataFieldIds); |
125 | | - Schema schemaWithoutConstantAndMetadataFields = |
126 | | - TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds); |
| 150 | + public static class ORCReaderService implements DataFileServiceRegistry.ReaderService { |
| 151 | + @Override |
| 152 | + public DataFileServiceRegistry.Key key() { |
| 153 | + return new DataFileServiceRegistry.Key(FileFormat.ORC, ColumnarBatch.class.getName()); |
| 154 | + } |
127 | 155 |
|
128 | | - return ORC.read(inputFile) |
129 | | - .project(schemaWithoutConstantAndMetadataFields) |
130 | | - .split(start, length) |
131 | | - .createBatchedReaderFunc( |
132 | | - fileSchema -> |
133 | | - VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, idToConstant)) |
134 | | - .recordsPerBatch(orcConf.batchSize()) |
135 | | - .filter(residual) |
136 | | - .caseSensitive(caseSensitive()) |
137 | | - .withNameMapping(nameMapping()) |
138 | | - .build(); |
| 156 | + @Override |
| 157 | + public ReaderBuilder builder( |
| 158 | + InputFile inputFile, |
| 159 | + Schema readSchema, |
| 160 | + Map<Integer, ?> idToConstant, |
| 161 | + DeleteFilter<?> deleteFilter) { |
| 162 | + return ORC.read(inputFile) |
| 163 | + .project(ORC.schemaWithoutConstantAndMetadataFields(readSchema, idToConstant)) |
| 164 | + .createBatchedReaderFunc( |
| 165 | + fileSchema -> |
| 166 | + VectorizedSparkOrcReaders.buildReader(readSchema, fileSchema, idToConstant)); |
| 167 | + } |
139 | 168 | } |
140 | 169 | } |
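
For orientation, below is a minimal sketch of how a caller resolves a vectorized Parquet read through the registry introduced above. The lookup key (file format, returned class name, reader type) and the fluent ReaderBuilder calls mirror this diff; the class RegistryReadSketch, the helper readBatches, and its parameters are illustrative assumptions, and the mechanism by which the ReaderService implementations get registered with DataFileServiceRegistry is not shown in this change.

import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Schema;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.datafile.DataFileServiceRegistry;
import org.apache.iceberg.io.datafile.DeleteFilter;
import org.apache.iceberg.io.datafile.ReaderBuilder;
import org.apache.iceberg.spark.ParquetReaderType;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.vectorized.ColumnarBatch;

// Illustrative sketch only; not part of this change.
class RegistryReadSketch {

  // Resolves a vectorized Parquet read the same way newBatchIterable does above:
  // the (format, return type, reader type) key routes to IcebergParquetReaderService,
  // whose builder() is then configured through the fluent ReaderBuilder API.
  static CloseableIterable<ColumnarBatch> readBatches(
      InputFile inputFile,
      long start,
      long length,
      Schema expectedSchema,
      Expression residual,
      Map<Integer, ?> idToConstant,
      DeleteFilter<InternalRow> deleteFilter,
      int batchSize) {
    ReaderBuilder readerBuilder =
        DataFileServiceRegistry.read(
                FileFormat.PARQUET,               // file format of the scan task
                ColumnarBatch.class.getName(),    // requested return type
                ParquetReaderType.ICEBERG.name(), // selects the Iceberg Parquet service
                inputFile,
                expectedSchema,
                idToConstant,
                deleteFilter)
            .split(start, length)
            .filter(residual)
            .reuseContainers()
            .recordsPerBatch(batchSize);
    return readerBuilder.build();
  }
}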