Skip to content

Commit 33a736b

Browse files
committed
address feedback
1 parent 5b11c46 commit 33a736b

File tree

4 files changed

+77
-76
lines changed

4 files changed

+77
-76
lines changed

src/iceberg/manifest/manifest_entry.h

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -178,112 +178,112 @@ struct ICEBERG_EXPORT DataFile {
178178
/// present
179179
std::optional<int64_t> content_size_in_bytes;
180180

181-
inline static constexpr int32_t kContentFieldId = 134;
181+
static constexpr int32_t kContentFieldId = 134;
182182
inline static const SchemaField kContent = SchemaField::MakeOptional(
183183
kContentFieldId, "content", int32(),
184184
"Contents of the file: 0=data, 1=position deletes, 2=equality deletes");
185185

186-
inline static constexpr int32_t kFilePathFieldId = 100;
186+
static constexpr int32_t kFilePathFieldId = 100;
187187
inline static const SchemaField kFilePath = SchemaField::MakeRequired(
188188
kFilePathFieldId, "file_path", string(), "Location URI with FS scheme");
189189

190-
inline static constexpr int32_t kFileFormatFieldId = 101;
190+
static constexpr int32_t kFileFormatFieldId = 101;
191191
inline static const SchemaField kFileFormat =
192192
SchemaField::MakeRequired(kFileFormatFieldId, "file_format", string(),
193193
"File format name: avro, orc, or parquet");
194194

195-
inline static constexpr int32_t kPartitionFieldId = 102;
195+
static constexpr int32_t kPartitionFieldId = 102;
196196
inline static const std::string kPartitionField = "partition";
197197
inline static const std::string kPartitionDoc =
198198
"Partition data tuple, schema based on the partition spec";
199199

200-
inline static constexpr int32_t kRecordCountFieldId = 103;
200+
static constexpr int32_t kRecordCountFieldId = 103;
201201
inline static const SchemaField kRecordCount = SchemaField::MakeRequired(
202202
kRecordCountFieldId, "record_count", int64(), "Number of records in the file");
203203

204-
inline static constexpr int32_t kFileSizeFieldId = 104;
204+
static constexpr int32_t kFileSizeFieldId = 104;
205205
inline static const SchemaField kFileSize = SchemaField::MakeRequired(
206206
kFileSizeFieldId, "file_size_in_bytes", int64(), "Total file size in bytes");
207207

208-
inline static constexpr int32_t kColumnSizesFieldId = 108;
208+
static constexpr int32_t kColumnSizesFieldId = 108;
209209
inline static const SchemaField kColumnSizes = SchemaField::MakeOptional(
210210
kColumnSizesFieldId, "column_sizes",
211211
map(SchemaField::MakeRequired(117, std::string(MapType::kKeyName), int32()),
212212
SchemaField::MakeRequired(118, std::string(MapType::kValueName), int64())),
213213
"Map of column id to total size on disk");
214214

215-
inline static constexpr int32_t kValueCountsFieldId = 109;
215+
static constexpr int32_t kValueCountsFieldId = 109;
216216
inline static const SchemaField kValueCounts = SchemaField::MakeOptional(
217217
kValueCountsFieldId, "value_counts",
218218
map(SchemaField::MakeRequired(119, std::string(MapType::kKeyName), int32()),
219219
SchemaField::MakeRequired(120, std::string(MapType::kValueName), int64())),
220220
"Map of column id to total count, including null and NaN");
221221

222-
inline static constexpr int32_t kNullValueCountsFieldId = 110;
222+
static constexpr int32_t kNullValueCountsFieldId = 110;
223223
inline static const SchemaField kNullValueCounts = SchemaField::MakeOptional(
224224
kNullValueCountsFieldId, "null_value_counts",
225225
map(SchemaField::MakeRequired(121, std::string(MapType::kKeyName), int32()),
226226
SchemaField::MakeRequired(122, std::string(MapType::kValueName), int64())),
227227
"Map of column id to null value count");
228228

229-
inline static constexpr int32_t kNanValueCountsFieldId = 137;
229+
static constexpr int32_t kNanValueCountsFieldId = 137;
230230
inline static const SchemaField kNanValueCounts = SchemaField::MakeOptional(
231231
kNanValueCountsFieldId, "nan_value_counts",
232232
map(SchemaField::MakeRequired(138, std::string(MapType::kKeyName), int32()),
233233
SchemaField::MakeRequired(139, std::string(MapType::kValueName), int64())),
234234
"Map of column id to number of NaN values in the column");
235235

236-
inline static constexpr int32_t kLowerBoundsFieldId = 125;
236+
static constexpr int32_t kLowerBoundsFieldId = 125;
237237
inline static const SchemaField kLowerBounds = SchemaField::MakeOptional(
238238
kLowerBoundsFieldId, "lower_bounds",
239239
map(SchemaField::MakeRequired(126, std::string(MapType::kKeyName), int32()),
240240
SchemaField::MakeRequired(127, std::string(MapType::kValueName), binary())),
241241
"Map of column id to lower bound");
242242

243-
inline static constexpr int32_t kUpperBoundsFieldId = 128;
243+
static constexpr int32_t kUpperBoundsFieldId = 128;
244244
inline static const SchemaField kUpperBounds = SchemaField::MakeOptional(
245245
kUpperBoundsFieldId, "upper_bounds",
246246
map(SchemaField::MakeRequired(129, std::string(MapType::kKeyName), int32()),
247247
SchemaField::MakeRequired(130, std::string(MapType::kValueName), binary())),
248248
"Map of column id to upper bound");
249249

250-
inline static constexpr int32_t kKeyMetadataFieldId = 131;
250+
static constexpr int32_t kKeyMetadataFieldId = 131;
251251
inline static const SchemaField kKeyMetadata = SchemaField::MakeOptional(
252252
kKeyMetadataFieldId, "key_metadata", binary(), "Encryption key metadata blob");
253253

254-
inline static constexpr int32_t kSplitOffsetsFieldId = 132;
254+
static constexpr int32_t kSplitOffsetsFieldId = 132;
255255
inline static const SchemaField kSplitOffsets = SchemaField::MakeOptional(
256256
kSplitOffsetsFieldId, "split_offsets",
257257
list(SchemaField::MakeRequired(133, std::string(ListType::kElementName), int64())),
258258
"Splittable offsets");
259259

260-
inline static constexpr int32_t kEqualityIdsFieldId = 135;
260+
static constexpr int32_t kEqualityIdsFieldId = 135;
261261
inline static const SchemaField kEqualityIds = SchemaField::MakeOptional(
262262
kEqualityIdsFieldId, "equality_ids",
263263
list(SchemaField::MakeRequired(136, std::string(ListType::kElementName), int32())),
264264
"Equality comparison field IDs");
265265

266-
inline static constexpr int32_t kSortOrderIdFieldId = 140;
266+
static constexpr int32_t kSortOrderIdFieldId = 140;
267267
inline static const SchemaField kSortOrderId = SchemaField::MakeOptional(
268268
kSortOrderIdFieldId, "sort_order_id", int32(), "Sort order ID");
269269

270-
inline static constexpr int32_t kFirstRowIdFieldId = 142;
270+
static constexpr int32_t kFirstRowIdFieldId = 142;
271271
inline static const SchemaField kFirstRowId =
272272
SchemaField::MakeOptional(kFirstRowIdFieldId, "first_row_id", int64(),
273273
"Starting row ID to assign to new rows");
274274

275-
inline static constexpr int32_t kReferencedDataFileFieldId = 143;
275+
static constexpr int32_t kReferencedDataFileFieldId = 143;
276276
inline static const SchemaField kReferencedDataFile = SchemaField::MakeOptional(
277277
kReferencedDataFileFieldId, "referenced_data_file", string(),
278278
"Fully qualified location (URI with FS scheme) of a data file that all deletes "
279279
"reference");
280280

281-
inline static constexpr int32_t kContentOffsetFieldId = 144;
281+
static constexpr int32_t kContentOffsetFieldId = 144;
282282
inline static const SchemaField kContentOffset =
283283
SchemaField::MakeOptional(kContentOffsetFieldId, "content_offset", int64(),
284284
"The offset in the file where the content starts");
285285

286-
inline static constexpr int32_t kContentSizeFieldId = 145;
286+
static constexpr int32_t kContentSizeFieldId = 145;
287287
inline static const SchemaField kContentSize =
288288
SchemaField::MakeOptional(kContentSizeFieldId, "content_size_in_bytes", int64(),
289289
"The length of referenced content stored in the file");
@@ -318,22 +318,22 @@ struct ICEBERG_EXPORT ManifestEntry {
318318
/// File path, partition tuple, metrics, ...
319319
std::shared_ptr<DataFile> data_file;
320320

321-
inline static constexpr int32_t kStatusFieldId = 0;
321+
static constexpr int32_t kStatusFieldId = 0;
322322
inline static const SchemaField kStatus =
323323
SchemaField::MakeRequired(kStatusFieldId, "status", int32());
324324

325-
inline static constexpr int32_t kSnapshotIdFieldId = 1;
325+
static constexpr int32_t kSnapshotIdFieldId = 1;
326326
inline static const SchemaField kSnapshotId =
327327
SchemaField::MakeOptional(kSnapshotIdFieldId, "snapshot_id", int64());
328328

329329
inline static const int32_t kDataFileFieldId = 2;
330330
inline static const std::string kDataFileField = "data_file";
331331

332-
inline static constexpr int32_t kSequenceNumberFieldId = 3;
332+
static constexpr int32_t kSequenceNumberFieldId = 3;
333333
inline static const SchemaField kSequenceNumber =
334334
SchemaField::MakeOptional(kSequenceNumberFieldId, "sequence_number", int64());
335335

336-
inline static constexpr int32_t kFileSequenceNumberFieldId = 4;
336+
static constexpr int32_t kFileSequenceNumberFieldId = 4;
337337
inline static const SchemaField kFileSequenceNumber = SchemaField::MakeOptional(
338338
kFileSequenceNumberFieldId, "file_sequence_number", int64());
339339

src/iceberg/manifest/manifest_reader.cc

Lines changed: 52 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -305,33 +305,42 @@ Result<std::vector<ManifestFile>> ParseManifestList(ArrowSchema* schema,
305305

306306
Status ParseLiteral(ArrowArrayView* view_of_partition, int64_t row_idx,
307307
std::vector<ManifestEntry>& manifest_entries) {
308-
if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_BOOL) {
309-
auto value = ArrowArrayViewGetUIntUnsafe(view_of_partition, row_idx);
310-
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Boolean(value != 0));
311-
} else if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_INT32) {
312-
auto value = ArrowArrayViewGetIntUnsafe(view_of_partition, row_idx);
313-
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Int(value));
314-
} else if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_INT64) {
315-
auto value = ArrowArrayViewGetIntUnsafe(view_of_partition, row_idx);
316-
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Long(value));
317-
} else if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_FLOAT) {
318-
auto value = ArrowArrayViewGetDoubleUnsafe(view_of_partition, row_idx);
319-
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Float(value));
320-
} else if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_DOUBLE) {
321-
auto value = ArrowArrayViewGetDoubleUnsafe(view_of_partition, row_idx);
322-
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Double(value));
323-
} else if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_STRING) {
324-
auto value = ArrowArrayViewGetStringUnsafe(view_of_partition, row_idx);
325-
manifest_entries[row_idx].data_file->partition.AddValue(
326-
Literal::String(std::string(value.data, value.size_bytes)));
327-
} else if (view_of_partition->storage_type == ArrowType::NANOARROW_TYPE_BINARY) {
328-
auto buffer = ArrowArrayViewGetBytesUnsafe(view_of_partition, row_idx);
329-
manifest_entries[row_idx].data_file->partition.AddValue(
330-
Literal::Binary(std::vector<uint8_t>(buffer.data.as_char,
331-
buffer.data.as_char + buffer.size_bytes)));
332-
} else {
333-
return InvalidManifest("Unsupported field type: {} in data file partition.",
334-
static_cast<int32_t>(view_of_partition->storage_type));
308+
switch (view_of_partition->storage_type) {
309+
case ArrowType::NANOARROW_TYPE_BOOL: {
310+
auto value = ArrowArrayViewGetUIntUnsafe(view_of_partition, row_idx);
311+
manifest_entries[row_idx].data_file->partition.AddValue(
312+
Literal::Boolean(value != 0));
313+
} break;
314+
case ArrowType::NANOARROW_TYPE_INT32: {
315+
auto value = ArrowArrayViewGetIntUnsafe(view_of_partition, row_idx);
316+
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Int(value));
317+
} break;
318+
case ArrowType::NANOARROW_TYPE_INT64: {
319+
auto value = ArrowArrayViewGetIntUnsafe(view_of_partition, row_idx);
320+
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Long(value));
321+
} break;
322+
case ArrowType::NANOARROW_TYPE_FLOAT: {
323+
auto value = ArrowArrayViewGetDoubleUnsafe(view_of_partition, row_idx);
324+
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Float(value));
325+
} break;
326+
case ArrowType::NANOARROW_TYPE_DOUBLE: {
327+
auto value = ArrowArrayViewGetDoubleUnsafe(view_of_partition, row_idx);
328+
manifest_entries[row_idx].data_file->partition.AddValue(Literal::Double(value));
329+
} break;
330+
case ArrowType::NANOARROW_TYPE_STRING: {
331+
auto value = ArrowArrayViewGetStringUnsafe(view_of_partition, row_idx);
332+
manifest_entries[row_idx].data_file->partition.AddValue(
333+
Literal::String(std::string(value.data, value.size_bytes)));
334+
} break;
335+
case ArrowType::NANOARROW_TYPE_BINARY: {
336+
auto buffer = ArrowArrayViewGetBytesUnsafe(view_of_partition, row_idx);
337+
manifest_entries[row_idx].data_file->partition.AddValue(
338+
Literal::Binary(std::vector<uint8_t>(buffer.data.as_char,
339+
buffer.data.as_char + buffer.size_bytes)));
340+
} break;
341+
default:
342+
return InvalidManifest("Unsupported field type: {} in data file partition.",
343+
static_cast<int32_t>(view_of_partition->storage_type));
335344
}
336345
return {};
337346
}
@@ -547,9 +556,9 @@ Result<std::vector<ManifestEntry>> ParseManifestEntry(
547556
return manifest_entries;
548557
}
549558

550-
const std::unordered_set<std::string> kStatsColumns = {
551-
"value_counts", "null_value_counts", "nan_value_counts",
552-
"lower_bounds", "upper_bounds", "record_count"};
559+
const std::vector<std::string> kStatsColumns = {"value_counts", "null_value_counts",
560+
"nan_value_counts", "lower_bounds",
561+
"upper_bounds", "record_count"};
553562

554563
bool RequireStatsProjection(const std::shared_ptr<Expression>& row_filter,
555564
const std::vector<std::string>& columns) {
@@ -560,9 +569,7 @@ bool RequireStatsProjection(const std::shared_ptr<Expression>& row_filter,
560569
return false;
561570
}
562571
const std::unordered_set<std::string> selected(columns.cbegin(), columns.cend());
563-
if (std::ranges::all_of(
564-
ManifestReader::kAllColumns,
565-
[&selected](const std::string& col) { return selected.contains(col); })) {
572+
if (selected.contains(ManifestReader::kAllColumns)) {
566573
return false;
567574
}
568575
if (std::ranges::all_of(kStatsColumns, [&selected](const std::string& col) {
@@ -586,10 +593,7 @@ Result<std::shared_ptr<Schema>> ProjectSchema(std::shared_ptr<Schema> schema,
586593

587594
std::vector<std::string> ManifestReader::WithStatsColumns(
588595
const std::vector<std::string>& columns) {
589-
if (std::ranges::all_of(ManifestReader::kAllColumns,
590-
[&columns](const std::string& col) {
591-
return std::ranges::find(columns, col) != columns.cend();
592-
})) {
596+
if (std::ranges::contains(columns, ManifestReader::kAllColumns)) {
593597
return columns;
594598
} else {
595599
std::vector<std::string> updated_columns{columns};
@@ -641,11 +645,13 @@ ManifestReader& ManifestReaderImpl::CaseSensitive(bool case_sensitive) {
641645
}
642646

643647
bool ManifestReaderImpl::HasPartitionFilter() const {
644-
return part_filter_ && part_filter_->op() != Expression::Operation::kTrue;
648+
ICEBERG_DCHECK(part_filter_, "Partition filter is not set");
649+
return part_filter_->op() != Expression::Operation::kTrue;
645650
}
646651

647652
bool ManifestReaderImpl::HasRowFilter() const {
648-
return row_filter_ && row_filter_->op() != Expression::Operation::kTrue;
653+
ICEBERG_DCHECK(row_filter_, "Row filter is not set");
654+
return row_filter_->op() != Expression::Operation::kTrue;
649655
}
650656

651657
Result<Evaluator*> ManifestReaderImpl::GetEvaluator() {
@@ -818,17 +824,16 @@ Result<std::vector<ManifestFile>> ManifestListReaderImpl::Files() const {
818824
internal::ArrowSchemaGuard schema_guard(&arrow_schema);
819825
while (true) {
820826
ICEBERG_ASSIGN_OR_RAISE(auto result, reader_->Next());
821-
if (result.has_value()) {
822-
internal::ArrowArrayGuard array_guard(&result.value());
823-
ICEBERG_ASSIGN_OR_RAISE(
824-
auto parse_result, ParseManifestList(&arrow_schema, &result.value(), *schema_));
825-
manifest_files.insert(manifest_files.end(),
826-
std::make_move_iterator(parse_result.begin()),
827-
std::make_move_iterator(parse_result.end()));
828-
} else {
827+
if (!result.has_value()) {
829828
// eof
830829
break;
831830
}
831+
internal::ArrowArrayGuard array_guard(&result.value());
832+
ICEBERG_ASSIGN_OR_RAISE(auto parse_result,
833+
ParseManifestList(&arrow_schema, &result.value(), *schema_));
834+
manifest_files.insert(manifest_files.end(),
835+
std::make_move_iterator(parse_result.begin()),
836+
std::make_move_iterator(parse_result.end()));
832837
}
833838
return manifest_files;
834839
}

src/iceberg/manifest/manifest_reader.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ namespace iceberg {
3737
class ICEBERG_EXPORT ManifestReader {
3838
public:
3939
/// \brief Special value to select all columns from manifest files.
40-
inline static const std::vector<std::string> kAllColumns{"*"};
40+
inline static const std::string kAllColumns = "*";
4141

4242
virtual ~ManifestReader() = default;
4343

src/iceberg/manifest/manifest_reader_internal.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,6 @@ class ManifestReaderImpl : public ManifestReader {
8787
/// \brief Check if there's a non-trivial row filter.
8888
bool HasRowFilter() const;
8989

90-
/// \brief Add stats columns to the column list if needed.
91-
std::vector<std::string> WithStatsColumns(
92-
const std::vector<std::string>& columns) const;
93-
9490
/// \brief Get or create the partition evaluator.
9591
Result<Evaluator*> GetEvaluator();
9692

0 commit comments

Comments
 (0)