Skip to content

Commit 4d02577

Browse files
committed
minor fixes
1 parent 761c03c commit 4d02577

9 files changed

Lines changed: 127 additions & 41 deletions

File tree

src/iceberg/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ set(ICEBERG_SOURCES
108108
util/murmurhash3_internal.cc
109109
util/property_util.cc
110110
util/snapshot_util.cc
111+
util/string_util.cc
111112
util/temporal_util.cc
112113
util/timepoint.cc
113114
util/transform_util.cc

src/iceberg/expression/json_serde.cc

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* under the License.
1818
*/
1919

20+
#include <limits>
2021
#include <string>
2122
#include <vector>
2223

@@ -315,11 +316,18 @@ Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* type) {
315316
}
316317
return Literal::Boolean(json.get<bool>());
317318

318-
case TypeId::kInt:
319+
case TypeId::kInt: {
319320
if (!json.is_number_integer()) [[unlikely]] {
320321
return JsonParseError("Cannot parse {} as an int value", SafeDumpJson(json));
321322
}
322-
return Literal::Int(json.get<int32_t>());
323+
auto val = json.get<int64_t>();
324+
if (val < std::numeric_limits<int32_t>::min() ||
325+
val > std::numeric_limits<int32_t>::max()) [[unlikely]] {
326+
return JsonParseError("Cannot parse {} as an int value: out of range",
327+
SafeDumpJson(json));
328+
}
329+
return Literal::Int(static_cast<int32_t>(val));
330+
}
323331

324332
case TypeId::kLong:
325333
if (!json.is_number_integer()) [[unlikely]] {
@@ -328,13 +336,13 @@ Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* type) {
328336
return Literal::Long(json.get<int64_t>());
329337

330338
case TypeId::kFloat:
331-
if (!json.is_number()) [[unlikely]] {
339+
if (!json.is_number_float()) [[unlikely]] {
332340
return JsonParseError("Cannot parse {} as a float value", SafeDumpJson(json));
333341
}
334342
return Literal::Float(json.get<float>());
335343

336344
case TypeId::kDouble:
337-
if (!json.is_number()) [[unlikely]] {
345+
if (!json.is_number_float()) [[unlikely]] {
338346
return JsonParseError("Cannot parse {} as a double value", SafeDumpJson(json));
339347
}
340348
return Literal::Double(json.get<double>());
@@ -418,12 +426,15 @@ Result<Literal> LiteralFromJson(const nlohmann::json& json, const Type* type) {
418426
return JsonParseError("Cannot parse {} as a decimal value", SafeDumpJson(json));
419427
}
420428
const auto& dec_type = internal::checked_cast<const DecimalType&>(*type);
429+
int32_t parsed_precision = 0;
421430
int32_t parsed_scale = 0;
422431
ICEBERG_ASSIGN_OR_RAISE(
423-
auto dec, Decimal::FromString(json.get<std::string>(), nullptr, &parsed_scale));
424-
if (parsed_scale != dec_type.scale()) [[unlikely]] {
425-
return JsonParseError("Cannot parse {} as a {} value: the scale doesn't match",
426-
SafeDumpJson(json), type->ToString());
432+
auto dec,
433+
Decimal::FromString(json.get<std::string>(), &parsed_precision, &parsed_scale));
434+
if (parsed_precision > dec_type.precision() || parsed_scale != dec_type.scale())
435+
[[unlikely]] {
436+
return JsonParseError("Cannot parse {} as a {} value", SafeDumpJson(json),
437+
type->ToString());
427438
}
428439
return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale());
429440
}

src/iceberg/expression/literal.cc

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -220,17 +220,24 @@ Result<Literal> LiteralCaster::CastFromString(
220220
return Literal::Binary(std::move(bytes));
221221
}
222222
case TypeId::kFixed: {
223+
const auto& fixed_type = internal::checked_cast<const FixedType&>(*target_type);
224+
if (str_val.size() != static_cast<size_t>(fixed_type.length()) * 2) {
225+
return InvalidArgument("Cannot cast string to {}: expected {} hex chars, got {}",
226+
target_type->ToString(), fixed_type.length() * 2,
227+
str_val.size());
228+
}
223229
ICEBERG_ASSIGN_OR_RAISE(auto bytes, StringUtils::HexStringToBytes(str_val));
224230
return Literal::Fixed(std::move(bytes));
225231
}
226232
case TypeId::kDecimal: {
227233
const auto& dec_type = internal::checked_cast<const DecimalType&>(*target_type);
234+
int32_t parsed_precision = 0;
228235
int32_t parsed_scale = 0;
229-
ICEBERG_ASSIGN_OR_RAISE(auto dec,
230-
Decimal::FromString(str_val, nullptr, &parsed_scale));
231-
if (parsed_scale != dec_type.scale()) {
232-
return InvalidArgument("Cannot cast {} as a {} value: the scale doesn't match",
233-
str_val, target_type->ToString());
236+
ICEBERG_ASSIGN_OR_RAISE(
237+
auto dec, Decimal::FromString(str_val, &parsed_precision, &parsed_scale));
238+
if (parsed_precision > dec_type.precision() || parsed_scale != dec_type.scale()) {
239+
return InvalidArgument("Cannot cast {} as a {} value", str_val,
240+
target_type->ToString());
234241
}
235242
return Literal::Decimal(dec.value(), dec_type.precision(), dec_type.scale());
236243
}

src/iceberg/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ iceberg_sources = files(
126126
'util/murmurhash3_internal.cc',
127127
'util/property_util.cc',
128128
'util/snapshot_util.cc',
129+
'util/string_util.cc',
129130
'util/temporal_util.cc',
130131
'util/timepoint.cc',
131132
'util/transform_util.cc',

src/iceberg/test/expression_json_test.cc

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,9 @@ TEST_P(LiteralFromJsonTypedTest, Parses) {
424424
const auto& p = GetParam();
425425
ICEBERG_UNWRAP_OR_FAIL(auto lit, LiteralFromJson(p.json, p.type.get()));
426426
EXPECT_EQ(lit.type()->type_id(), p.expected_type_id);
427-
if (p.expected_str) EXPECT_EQ(lit.ToString(), *p.expected_str);
427+
if (p.expected_str) {
428+
EXPECT_EQ(lit.ToString(), *p.expected_str);
429+
}
428430
}
429431

430432
INSTANTIATE_TEST_SUITE_P(
@@ -493,13 +495,41 @@ INSTANTIATE_TEST_SUITE_P(
493495
return info.param.name;
494496
});
495497

496-
TEST(LiteralFromJsonTyped, SchemaAwareDatePredicateRoundTrip) {
498+
struct SchemaAwarePredicateParam {
499+
std::string name;
500+
std::string field_name;
501+
std::shared_ptr<Type> field_type;
502+
nlohmann::json value;
503+
};
504+
505+
class SchemaAwarePredicateRoundTripTest
506+
: public ::testing::TestWithParam<SchemaAwarePredicateParam> {};
507+
508+
TEST_P(SchemaAwarePredicateRoundTripTest, RoundTrip) {
509+
const auto& p = GetParam();
497510
auto schema = std::make_shared<Schema>(
498-
std::vector<SchemaField>{SchemaField::MakeOptional(1, "event_date", date())});
499-
nlohmann::json pred_json = {
500-
{"type", "eq"}, {"term", "event_date"}, {"value", "2024-01-15"}};
511+
std::vector<SchemaField>{SchemaField::MakeOptional(1, p.field_name, p.field_type)});
512+
nlohmann::json pred_json = {{"type", "eq"}, {"term", p.field_name}, {"value", p.value}};
501513
ICEBERG_UNWRAP_OR_FAIL(auto expr, ExpressionFromJson(pred_json, schema.get()));
502514
ASSERT_NE(expr, nullptr);
503515
}
504516

517+
INSTANTIATE_TEST_SUITE_P(
518+
LiteralFromJsonTyped, SchemaAwarePredicateRoundTripTest,
519+
::testing::Values(
520+
SchemaAwarePredicateParam{"Date", "event_date", date(), "2024-01-15"},
521+
SchemaAwarePredicateParam{"Time", "event_time", time(), "14:30:00"},
522+
SchemaAwarePredicateParam{"Timestamp", "created_at", timestamp(),
523+
"2026-01-01T00:00:01.500"},
524+
SchemaAwarePredicateParam{"TimestampTz", "updated_at", timestamp_tz(),
525+
"2026-01-01T00:00:01.500+00:00"},
526+
SchemaAwarePredicateParam{"Uuid", "trace_id", uuid(),
527+
"f79c3e09-677c-4bbd-a479-3f349cb785e7"},
528+
SchemaAwarePredicateParam{"Binary", "payload", binary(), "deadbeef"},
529+
SchemaAwarePredicateParam{"Fixed", "hash", fixed(4), "cafebabe"},
530+
SchemaAwarePredicateParam{"Decimal", "amount", decimal(9, 2), "123.45"}),
531+
[](const ::testing::TestParamInfo<SchemaAwarePredicateParam>& info) {
532+
return info.param.name;
533+
});
534+
505535
} // namespace iceberg

src/iceberg/util/string_util.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "iceberg/util/string_util.h"
21+
22+
#include "iceberg/util/macros.h"
23+
24+
namespace iceberg {
25+
26+
Result<std::vector<uint8_t>> StringUtils::HexStringToBytes(std::string_view hex) {
27+
if (hex.size() % 2 != 0) [[unlikely]] {
28+
return InvalidArgument("Hex string must have even length, got: {}", hex.size());
29+
}
30+
std::vector<uint8_t> bytes;
31+
bytes.reserve(hex.size() / 2);
32+
auto nibble = [](char c) -> Result<uint8_t> {
33+
if (c >= '0' && c <= '9') return static_cast<uint8_t>(c - '0');
34+
if (c >= 'a' && c <= 'f') return static_cast<uint8_t>(c - 'a' + 10);
35+
if (c >= 'A' && c <= 'F') return static_cast<uint8_t>(c - 'A' + 10);
36+
return InvalidArgument("Invalid hex character: '{}'", c);
37+
};
38+
for (size_t i = 0; i < hex.size(); i += 2) {
39+
ICEBERG_ASSIGN_OR_RAISE(auto hi, nibble(hex[i]));
40+
ICEBERG_ASSIGN_OR_RAISE(auto lo, nibble(hex[i + 1]));
41+
bytes.push_back(static_cast<uint8_t>((hi << 4) | lo));
42+
}
43+
return bytes;
44+
}
45+
46+
} // namespace iceberg

src/iceberg/util/string_util.h

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232

3333
#include "iceberg/iceberg_export.h"
3434
#include "iceberg/result.h"
35-
#include "iceberg/util/macros.h"
3635

3736
namespace iceberg {
3837

@@ -80,6 +79,10 @@ class ICEBERG_EXPORT StringUtils {
8079
T value = 0;
8180
auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), value);
8281
if (ec == std::errc()) [[likely]] {
82+
if (ptr != str.data() + str.size()) {
83+
return InvalidArgument("Failed to parse {} from string '{}': trailing characters",
84+
typeid(T).name(), str);
85+
}
8386
return value;
8487
}
8588
if (ec == std::errc::invalid_argument) {
@@ -95,25 +98,7 @@ class ICEBERG_EXPORT StringUtils {
9598

9699
/// \brief Decode a hex string (upper or lower case) into bytes.
97100
/// Returns an error if the string has odd length or contains invalid hex characters.
98-
static Result<std::vector<uint8_t>> HexStringToBytes(std::string_view hex) {
99-
if (hex.size() % 2 != 0) [[unlikely]] {
100-
return InvalidArgument("Hex string must have even length, got: {}", hex.size());
101-
}
102-
std::vector<uint8_t> bytes;
103-
bytes.reserve(hex.size() / 2);
104-
auto nibble = [](char c) -> Result<uint8_t> {
105-
if (c >= '0' && c <= '9') return static_cast<uint8_t>(c - '0');
106-
if (c >= 'a' && c <= 'f') return static_cast<uint8_t>(c - 'a' + 10);
107-
if (c >= 'A' && c <= 'F') return static_cast<uint8_t>(c - 'A' + 10);
108-
return InvalidArgument("Invalid hex character: '{}'", c);
109-
};
110-
for (size_t i = 0; i < hex.size(); i += 2) {
111-
ICEBERG_ASSIGN_OR_RAISE(auto hi, nibble(hex[i]));
112-
ICEBERG_ASSIGN_OR_RAISE(auto lo, nibble(hex[i + 1]));
113-
bytes.push_back(static_cast<uint8_t>((hi << 4) | lo));
114-
}
115-
return bytes;
116-
}
101+
static Result<std::vector<uint8_t>> HexStringToBytes(std::string_view hex);
117102

118103
template <typename T>
119104
requires std::is_floating_point_v<T> && (!FromChars<T>)

src/iceberg/util/transform_util.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ Result<int64_t> TransformUtil::ParseTime(std::string_view str) {
187187

188188
// check that hours, minutes, seconds are in valid ranges
189189
if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59 || seconds < 0 ||
190-
seconds > 60) [[unlikely]] {
190+
seconds > 59) [[unlikely]] {
191191
return InvalidArgument("Invalid time string: '{}'", str);
192192
}
193193

src/iceberg/util/transform_util.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,15 +109,19 @@ class ICEBERG_EXPORT TransformUtil {
109109

110110
/// \brief Parses a time string into microseconds from midnight.
111111
///
112-
/// Accepts: "HH:mm", "HH:mm:ss", "HH:mm:ss.SSS", "HH:mm:ss.SSSSSS".
112+
/// Accepts ISO-8601 local time formats: "HH:mm", "HH:mm:ss", or
113+
/// "HH:mm:ss.f" where the fractional part can be 1-9 digits.
114+
/// Digits beyond 6 (microsecond precision) are truncated.
113115
///
114116
/// \param str The time string to parse.
115117
/// \return The number of microseconds from midnight, or an error.
116118
static Result<int64_t> ParseTime(std::string_view str);
117119

118120
/// \brief Parses a timestamp string into microseconds since epoch.
119121
///
120-
/// Accepts: "yyyy-MM-ddTHH:mm:ss", with optional fractional seconds (.SSS or .SSSSSS).
122+
/// Accepts ISO-8601 local date-time formats: "yyyy-MM-ddTHH:mm",
123+
/// "yyyy-MM-ddTHH:mm:ss", or "yyyy-MM-ddTHH:mm:ss.f" where the
124+
/// fractional part can be 1-9 digits (truncated to microseconds).
121125
///
122126
/// \param str The timestamp string to parse.
123127
/// \return The number of microseconds since epoch, or an error.
@@ -127,6 +131,7 @@ class ICEBERG_EXPORT TransformUtil {
127131
///
128132
/// Accepts the same formats as ParseTimestamp, with a timezone suffix:
129133
/// "Z", "+HH:mm", or "-HH:mm". Non-UTC offsets are converted to UTC.
134+
/// The seconds and fractional parts are optional (e.g. "yyyy-MM-ddTHH:mm+00:00").
130135
///
131136
/// \param str The timestamp string to parse.
132137
/// \return The number of microseconds since epoch (UTC), or an error.

0 commit comments

Comments
 (0)