33
44package dev .vortex .spark .read ;
55
6+ import dev .vortex .api .DataSource ;
7+ import dev .vortex .api .Session ;
8+ import dev .vortex .jni .NativeFiles ;
9+ import dev .vortex .spark .VortexSparkSession ;
10+ import java .util .HashMap ;
611import java .util .List ;
712import java .util .Map ;
13+ import java .util .OptionalLong ;
14+ import java .util .stream .Collectors ;
15+ import java .util .stream .Stream ;
816import org .apache .spark .sql .connector .catalog .CatalogV2Util ;
917import org .apache .spark .sql .connector .catalog .Column ;
18+ import org .apache .spark .sql .connector .expressions .NamedReference ;
1019import org .apache .spark .sql .connector .read .Batch ;
1120import org .apache .spark .sql .connector .read .Scan ;
21+ import org .apache .spark .sql .connector .read .Statistics ;
22+ import org .apache .spark .sql .connector .read .SupportsReportStatistics ;
23+ import org .apache .spark .sql .connector .read .colstats .ColumnStatistics ;
24+ import org .apache .spark .sql .internal .SQLConf ;
1225import org .apache .spark .sql .types .StructType ;
1326
14- /** Spark V2 {@link Scan} over a table of Vortex files. */
15- public final class VortexScan implements Scan {
27+ /**
28+ * Spark V2 {@link Scan} over a table of Vortex files.
29+ *
30+ * <p>Implements {@link SupportsReportStatistics} to surface both the row count Vortex records in each file footer and a
31+ * Spark scan-size estimate. The byte estimate starts from the on-storage file sizes collected by
32+ * {@code MultiFileDataSource}, then follows Spark's file scan convention by applying the SQL file-compression factor
33+ * and scaling by the pushed read schema's default size relative to the full table schema's default size. When the
34+ * listing did not return a size for one or more files the file-byte total is extrapolated before Spark scaling is
35+ * applied.
36+ */
37+ public final class VortexScan implements Scan , SupportsReportStatistics {
1638
1739 private final List <String > paths ;
40+ private final List <Column > tableColumns ;
1841 private final List <Column > readColumns ;
1942 private final Map <String , String > formatOptions ;
2043
44+ private volatile Statistics cachedStatistics ;
45+
2146 /**
2247 * Creates a new VortexScan for the specified file paths and columns. The caller is responsible for passing
2348 * immutable collections; the constructor does not copy.
@@ -26,7 +51,24 @@ public final class VortexScan implements Scan {
2651 * @param readColumns the list of columns to read from the files
2752 */
public VortexScan(List<String> paths, List<Column> readColumns, Map<String, String> formatOptions) {
    // No separate table schema is supplied here, so the read columns are also used as the table columns.
    this(paths, readColumns, readColumns, formatOptions);
}
56+
/**
 * Builds a scan over the given Vortex files, keeping the full table columns alongside the projected read columns.
 *
 * <p>None of the arguments are copied, so callers are expected to hand in immutable collections.
 *
 * @param paths the list of Vortex file paths to scan
 * @param tableColumns the full table columns before projection pushdown
 * @param readColumns the columns Spark requested after projection pushdown
 * @param formatOptions format options stored with the scan and passed on when statistics are computed
 */
public VortexScan(
        List<String> paths,
        List<Column> tableColumns,
        List<Column> readColumns,
        Map<String, String> formatOptions) {
    this.formatOptions = formatOptions;
    this.readColumns = readColumns;
    this.tableColumns = tableColumns;
    this.paths = paths;
}
@@ -72,4 +114,77 @@ public Batch toBatch() {
72114 public ColumnarSupportMode columnarSupportMode () {
73115 return ColumnarSupportMode .SUPPORTED ;
74116 }
117+
118+ /**
119+ * Returns statistics for this scan.
120+ *
121+ * <p>Opens the Vortex {@link DataSource} on first invocation and caches the result. The row count is taken from the
122+ * data source (sum of file-footer row counts; extrapolated from the first opened file when other files are
123+ * deferred). {@link Statistics#sizeInBytes()} is derived from the per-file sizes reported by the filesystem
124+ * listing, then adjusted by Spark's compression factor and the ratio between the pushed read schema and the full
125+ * table schema. When a listing did not return a size for some file the file-byte total is extrapolated. When no
126+ * file size is known at all the value is left empty so Spark falls back to its default heuristic.
127+ *
128+ * @return statistics with row-count and Spark scan-size estimates
129+ */
130+ @ Override
131+ public Statistics estimateStatistics () {
132+ Statistics local = cachedStatistics ;
133+ if (local != null ) {
134+ return local ;
135+ }
136+ synchronized (this ) {
137+ if (cachedStatistics == null ) {
138+ cachedStatistics = computeStatistics ();
139+ }
140+ return cachedStatistics ;
141+ }
142+ }
143+
144+ private Statistics computeStatistics () {
145+ Session session = VortexSparkSession .get (formatOptions );
146+ // Expand directory paths to concrete files the way VortexBatchExec does, so we use the
147+ // same per-path resolution end-to-end.
148+ List <String > resolvedPaths = paths .stream ()
149+ .flatMap (path -> path .endsWith (".vortex" )
150+ ? Stream .of (path )
151+ : NativeFiles .listFiles (session , path , formatOptions ).stream ())
152+ .collect (Collectors .toList ());
153+
154+ if (resolvedPaths .isEmpty ()) {
155+ return new VortexStatistics (OptionalLong .empty (), OptionalLong .empty ());
156+ }
157+
158+ DataSource source = DataSource .open (session , resolvedPaths , formatOptions );
159+ return new VortexStatistics (
160+ source .rowCount ().asOptional (),
161+ scaleSizeInBytes (source .byteSize ().asOptional ()));
162+ }
163+
164+ private OptionalLong scaleSizeInBytes (OptionalLong fileBytes ) {
165+ if (fileBytes .isEmpty ()) {
166+ return OptionalLong .empty ();
167+ }
168+
169+ StructType tableSchema = CatalogV2Util .v2ColumnsToStructType (tableColumns .toArray (new Column [0 ]));
170+ StructType readSchema = readSchema ();
171+ int tableDefaultSize = tableSchema .defaultSize ();
172+ if (tableDefaultSize <= 0 ) {
173+ return fileBytes ;
174+ }
175+
176+ double scaled = SQLConf .get ().fileCompressionFactor ()
177+ * fileBytes .getAsLong ()
178+ / tableDefaultSize
179+ * readSchema .defaultSize ();
180+ return OptionalLong .of ((long ) scaled );
181+ }
182+
183+ private record VortexStatistics (OptionalLong numRows , OptionalLong sizeInBytes ) implements Statistics {
184+
185+ @ Override
186+ public Map <NamedReference , ColumnStatistics > columnStats () {
187+ return new HashMap <>();
188+ }
189+ }
75190}
0 commit comments