Skip to content

Commit 08b6ecd

Browse files
authored
[feature](function) support is_valid_utf8 function (#62515)
Add `is_valid_utf8(s)` / `isValidUTF8(s)` function that returns `true` if the input is valid UTF-8, `false` otherwise. Also adds `is_valid_utf8()` method to `ColumnStr` for column-level UTF-8 validation.
1 parent 2e7e2a5 commit 08b6ecd

10 files changed

Lines changed: 384 additions & 0 deletions

File tree

be/src/core/column/column_string.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "util/simd/bits.h"
3737
#include "util/simd/vstring_function.h"
3838
#include "util/unaligned.h"
39+
#include "util/utf8_check.h"
3940
namespace doris {
4041

4142
template <typename T>
@@ -759,6 +760,20 @@ bool ColumnStr<T>::is_ascii() const {
759760
return simd::VStringFunctions::is_ascii(StringRef(chars.data(), chars.size()));
760761
}
761762

763+
// Reports whether every row of this column passes validate_utf8().
// Rows are checked one at a time using offset_at(i) / size_at(i), and the scan
// stops at the first row that fails. A column with no rows yields true because
// the loop body never runs.
template <typename T>
764+
bool ColumnStr<T>::is_valid_utf8() const {
765+
const auto num_rows = offsets.size();
766+
const char* data = reinterpret_cast<const char*>(chars.data());
767+
for (size_t i = 0; i < num_rows; ++i) {
768+
auto str_offset = offset_at(i);
769+
auto str_size = size_at(i);
770+
// Validate this row's bytes on their own rather than the whole chars buffer.
if (!validate_utf8(data + str_offset, str_size)) {
771+
return false;
772+
}
773+
}
774+
return true;
775+
}
776+
762777
template class ColumnStr<uint32_t>;
763778
template class ColumnStr<uint64_t>;
764779
} // namespace doris

be/src/core/column/column_string.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ class Arena;
5151
class ColumnSorter;
5252

5353
/** Column for String values.
54+
* Note: In string functions, we assume that ColumnStr contains valid UTF-8 encoded data.
55+
* However, ColumnStr is not guaranteed to always hold valid UTF-8, since it is also used
56+
* as a serialization container where the content may be arbitrary binary data.
5457
*/
5558
template <typename T>
5659
class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
@@ -536,6 +539,7 @@ class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> {
536539
}
537540

538541
bool is_ascii() const;
542+
bool is_valid_utf8() const;
539543

540544
// Returns a mutable reference to the `chars` member.
Chars& get_chars() { return chars; }
541545
// Returns a read-only reference to the `chars` member.
const Chars& get_chars() const { return chars; }

be/src/exprs/function/function_string.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#include "exprs/function/string_hex_util.h"
4545
#include "util/string_search.hpp"
4646
#include "util/url_coding.h"
47+
#include "util/utf8_check.h"
4748

4849
namespace doris {
4950
struct NameStringASCII {
@@ -225,6 +226,29 @@ struct StringUtf8LengthImpl {
225226
}
226227
};
227228

229+
// Name tag carrying the SQL function name "is_valid_utf8".
struct NameIsValidUTF8 {
230+
static constexpr auto name = "is_valid_utf8";
231+
};
232+
233+
// Row-wise implementation of is_valid_utf8: string input, UInt8 output.
struct IsValidUTF8Impl {
234+
using ReturnType = DataTypeUInt8;
235+
static constexpr auto PrimitiveTypeImpl = PrimitiveType::TYPE_STRING;
236+
using Type = String;
237+
using ReturnColumnType = ColumnUInt8;
238+
239+
// Fills `res` with one flag per entry of `offsets`: 1 when validate_utf8()
// accepts that row's bytes, 0 otherwise. `res` is resized to offsets.size().
// Always returns Status::OK().
static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
240+
PaddedPODArray<UInt8>& res) {
241+
auto size = offsets.size();
242+
res.resize(size);
243+
for (size_t i = 0; i < size; ++i) {
244+
// Row i spans [offsets[i - 1], offsets[i]) within `data`.
// NOTE(review): for i == 0 this reads offsets[-1]; presumably the offsets
// array is left-padded so that element reads as 0 -- confirm.
const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
245+
size_t str_size = offsets[i] - offsets[i - 1];
246+
res[i] = validate_utf8(raw_str, str_size) ? 1 : 0;
247+
}
248+
return Status::OK();
249+
}
250+
};
251+
228252
struct NameStartsWith {
229253
static constexpr auto name = "starts_with";
230254
};
@@ -1316,6 +1340,7 @@ using FunctionStringLength = FunctionUnaryToType<StringLengthImpl, NameStringLen
13161340
using FunctionCrc32 = FunctionUnaryToType<Crc32Impl, NameCrc32>;
13171341
using FunctionStringUTF8Length = FunctionUnaryToType<StringUtf8LengthImpl, NameStringUtf8Length>;
13181342
using FunctionStringSpace = FunctionUnaryToType<StringSpace, NameStringSpace>;
1343+
using FunctionIsValidUTF8 = FunctionUnaryToType<IsValidUTF8Impl, NameIsValidUTF8>;
13191344
using FunctionStringStartsWith =
13201345
FunctionBinaryToType<DataTypeString, DataTypeString, StringStartsWithImpl, NameStartsWith>;
13211346
using FunctionStringEndsWith =
@@ -1422,7 +1447,9 @@ void register_function_string(SimpleFunctionFactory& factory) {
14221447
factory.register_function<FunctionSubReplace<SubReplaceThreeImpl>>();
14231448
factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>();
14241449
factory.register_function<FunctionOverlay>();
1450+
factory.register_function<FunctionIsValidUTF8>();
14251451

1452+
factory.register_alias(FunctionIsValidUTF8::name, "isValidUTF8");
14261453
factory.register_alias(FunctionToLower::name, "lcase");
14271454
factory.register_alias(FunctionToUpper::name, "ucase");
14281455
factory.register_alias(FunctionStringUTF8Length::name, "character_length");

be/test/core/column/column_string_test.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1425,4 +1425,89 @@ TEST_F(ColumnStringTest, is_ascii) {
14251425
}
14261426
}
14271427

1428+
// Exercises ColumnString::is_valid_utf8() on whole columns: valid ASCII and
// multi-byte content, empty columns and empty rows, several kinds of malformed
// byte sequences, a mixed column, and a sequence split across two rows.
TEST_F(ColumnStringTest, is_valid_utf8) {
1429+
// all ASCII strings are valid UTF-8
1430+
{
1431+
auto column = ColumnString::create();
1432+
column->insert_data("hello", 5);
1433+
column->insert_data("world", 5);
1434+
column->insert_data("123!@#", 6);
1435+
EXPECT_TRUE(column->is_valid_utf8());
1436+
}
1437+
// empty column is valid
1438+
{
1439+
auto column = ColumnString::create();
1440+
EXPECT_TRUE(column->is_valid_utf8());
1441+
}
1442+
// empty strings are valid UTF-8
1443+
{
1444+
auto column = ColumnString::create();
1445+
column->insert_data("", 0);
1446+
column->insert_data("", 0);
1447+
EXPECT_TRUE(column->is_valid_utf8());
1448+
}
1449+
// multi-byte UTF-8 characters
1450+
{
1451+
auto column = ColumnString::create();
1452+
column->insert_data("Hello, 世界", strlen("Hello, 世界"));
1453+
column->insert_data("こんにちは", strlen("こんにちは"));
1454+
column->insert_data("😀", strlen("😀"));
1455+
EXPECT_TRUE(column->is_valid_utf8());
1456+
}
1457+
// invalid: lone continuation byte 0x80
1458+
{
1459+
auto column = ColumnString::create();
1460+
const char data[] = {'\x80'};
1461+
column->insert_data(data, 1);
1462+
EXPECT_FALSE(column->is_valid_utf8());
1463+
}
1464+
// invalid: bad 2-byte sequence 0xC3 0x28
1465+
{
1466+
auto column = ColumnString::create();
1467+
const char data[] = {'\xc3', '\x28'};
1468+
column->insert_data(data, 2);
1469+
EXPECT_FALSE(column->is_valid_utf8());
1470+
}
1471+
// invalid: overlong encoding 0xC0 0xAF
1472+
{
1473+
auto column = ColumnString::create();
1474+
const char data[] = {'\xc0', '\xaf'};
1475+
column->insert_data(data, 2);
1476+
EXPECT_FALSE(column->is_valid_utf8());
1477+
}
1478+
// invalid: 0xFE byte
1479+
{
1480+
auto column = ColumnString::create();
1481+
const char data[] = {'\xfe'};
1482+
column->insert_data(data, 1);
1483+
EXPECT_FALSE(column->is_valid_utf8());
1484+
}
1485+
// invalid: truncated 3-byte sequence 0xE4 0xB8
1486+
{
1487+
auto column = ColumnString::create();
1488+
const char data[] = {'\xe4', '\xb8'};
1489+
column->insert_data(data, 2);
1490+
EXPECT_FALSE(column->is_valid_utf8());
1491+
}
1492+
// mixed: one invalid byte makes the whole column invalid
1493+
{
1494+
auto column = ColumnString::create();
1495+
column->insert_data("hello", 5);
1496+
const char bad[] = {'\xff'};
1497+
column->insert_data(bad, 1);
1498+
column->insert_data("world", 5);
1499+
EXPECT_FALSE(column->is_valid_utf8());
1500+
}
1501+
// cross-row concatenation: "\xE4" + "\xB8\x96" form valid UTF-8 (世) when
1502+
// concatenated, but each row is invalid individually. Must validate per-row.
1503+
{
1504+
auto column = ColumnString::create();
1505+
const char row1[] = {'\xe4'};
1506+
const char row2[] = {'\xb8', '\x96'};
1507+
column->insert_data(row1, 1);
1508+
column->insert_data(row2, 2);
1509+
EXPECT_FALSE(column->is_valid_utf8());
1510+
}
1511+
}
1512+
14281513
} // namespace doris

be/test/exprs/function/function_string_test.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,36 @@ TEST(function_string_test, function_ascii_test) {
994994
check_function_all_arg_comb<DataTypeInt32, true>(func_name, input_types, data_set);
995995
}
996996

997+
// Checks the "is_valid_utf8" function on VARCHAR input: each data_set row pairs
// an input string with the expected UInt8 result (1 = valid, 0 = invalid), plus
// a NULL input that is expected to produce NULL.
TEST(function_string_test, function_is_valid_utf8_test) {
998+
std::string func_name = "is_valid_utf8";
999+
1000+
InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};
1001+
1002+
DataSet data_set = {
1003+
// valid UTF-8 strings
1004+
{{std::string("hello")}, std::uint8_t(1)},
1005+
{{std::string("")}, std::uint8_t(1)},
1006+
{{std::string("Hello, 世界")}, std::uint8_t(1)},
1007+
{{std::string("こんにちは")}, std::uint8_t(1)},
1008+
{{std::string("123!@#")}, std::uint8_t(1)},
1009+
{{std::string("\xc3\xb1")}, std::uint8_t(1)}, // ñ
1010+
{{std::string("\xe2\x82\xac")}, std::uint8_t(1)}, // €
1011+
{{std::string("\xf0\x9f\x98\x80")}, std::uint8_t(1)}, // 😀
1012+
// invalid UTF-8 strings
1013+
{{std::string("\x80")}, std::uint8_t(0)}, // lone continuation byte, invalid as a leading byte
1014+
{{std::string("\xc3\x28")}, std::uint8_t(0)}, // invalid 2-byte sequence
1015+
{{std::string("\xe2\x28\xa1")}, std::uint8_t(0)}, // invalid 3-byte sequence
1016+
{{std::string("\xf0\x28\x8c\xbc")}, std::uint8_t(0)}, // invalid 4-byte sequence
1017+
{{std::string("\xfe")}, std::uint8_t(0)}, // invalid byte 0xFE
1018+
{{std::string("\xff")}, std::uint8_t(0)}, // invalid byte 0xFF
1019+
{{std::string("abc\xc0\xaf")}, std::uint8_t(0)}, // overlong encoding
1020+
// NULL
1021+
{{Null()}, Null()},
1022+
};
1023+
1024+
check_function_all_arg_comb<DataTypeUInt8, true>(func_name, input_types, data_set);
1025+
}
1026+
9971027
TEST(function_string_test, function_char_length_test) {
9981028
std::string func_name = "char_length";
9991029

fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@
278278
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
279279
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
280280
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid;
281+
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
281282
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
282283
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
283284
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
@@ -837,6 +838,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
837838
scalar(IsIpAddressInRange.class, "is_ip_address_in_range"),
838839
scalar(IsNan.class, "isnan"),
839840
scalar(IsUuid.class, "is_uuid"),
841+
scalar(IsValidUtf8.class, "is_valid_utf8", "isValidUTF8"),
840842
scalar(IsInf.class, "isinf"),
841843
scalar(Ipv4CIDRToRange.class, "ipv4_cidr_to_range"),
842844
scalar(Ipv6CIDRToRange.class, "ipv6_cidr_to_range"),
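fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/IsValidUtf8.java

Lines changed: 76 additions & 0 deletions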
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.scalar;
19+
20+
import org.apache.doris.catalog.FunctionSignature;
21+
import org.apache.doris.nereids.trees.expressions.Expression;
22+
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
23+
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
24+
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
25+
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
26+
import org.apache.doris.nereids.types.BooleanType;
27+
import org.apache.doris.nereids.types.StringType;
28+
import org.apache.doris.nereids.types.VarcharType;
29+
30+
import com.google.common.base.Preconditions;
31+
import com.google.common.collect.ImmutableList;
32+
33+
import java.util.List;
34+
35+
/**
36+
* ScalarFunction 'is_valid_utf8'.
* Takes exactly one argument, typed VARCHAR or STRING, and returns BOOLEAN
* (see {@link #SIGNATURES}).
37+
*/
38+
public class IsValidUtf8 extends ScalarFunction
39+
implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {
40+
41+
// Accepted signatures: one string-like argument, BOOLEAN result.
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
42+
FunctionSignature.ret(BooleanType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
43+
FunctionSignature.ret(BooleanType.INSTANCE).args(StringType.INSTANCE)
44+
);
45+
46+
/**
47+
* constructor with 1 argument.
*
* @param arg the expression whose value is checked
48+
*/
49+
public IsValidUtf8(Expression arg) {
50+
super("is_valid_utf8", arg);
51+
}
52+
53+
/** constructor for withChildren and reuse signature */
54+
private IsValidUtf8(ScalarFunctionParams functionParams) {
55+
super(functionParams);
56+
}
57+
58+
/**
59+
* withChildren.
*
* @param children replacement children; must contain exactly one expression
* @return a new IsValidUtf8 built from the function params for {@code children}
60+
*/
61+
@Override
62+
public IsValidUtf8 withChildren(List<Expression> children) {
63+
Preconditions.checkArgument(children.size() == 1);
64+
return new IsValidUtf8(getFunctionParams(children));
65+
}
66+
67+
// Exposes the static SIGNATURES list.
@Override
68+
public List<FunctionSignature> getSignatures() {
69+
return SIGNATURES;
70+
}
71+
72+
// Dispatches to the visitor's visitIsValidUtf8 hook.
@Override
73+
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
74+
return visitor.visitIsValidUtf8(this, context);
75+
}
76+
}

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@
298298
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsIpv6String;
299299
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsNan;
300300
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsUuid;
301+
import org.apache.doris.nereids.trees.expressions.functions.scalar.IsValidUtf8;
301302
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArray;
302303
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonArrayIgnoreNull;
303304
import org.apache.doris.nereids.trees.expressions.functions.scalar.JsonContains;
@@ -1701,6 +1702,10 @@ default R visitIsUuid(IsUuid isUuid, C context) {
17011702
return visitScalarFunction(isUuid, context);
17021703
}
17031704

1705+
// Visitor hook for IsValidUtf8; the default forwards to visitScalarFunction.
default R visitIsValidUtf8(IsValidUtf8 isValidUtf8, C context) {
1706+
return visitScalarFunction(isValidUtf8, context);
1707+
}
1708+
17041709
// Visitor hook for IsInf; the default forwards to visitScalarFunction.
default R visitIsInf(IsInf isInf, C context) {
17051710
return visitScalarFunction(isInf, context);
17061711
}

0 commit comments

Comments
 (0)