@@ -1425,4 +1425,89 @@ TEST_F(ColumnStringTest, is_ascii) {
14251425 }
14261426}
14271427
1428+ TEST_F (ColumnStringTest, is_valid_utf8) {
1429+ // all ASCII strings are valid UTF-8
1430+ {
1431+ auto column = ColumnString::create ();
1432+ column->insert_data (" hello" , 5 );
1433+ column->insert_data (" world" , 5 );
1434+ column->insert_data (" 123!@#" , 6 );
1435+ EXPECT_TRUE (column->is_valid_utf8 ());
1436+ }
1437+ // empty column is valid
1438+ {
1439+ auto column = ColumnString::create ();
1440+ EXPECT_TRUE (column->is_valid_utf8 ());
1441+ }
1442+ // empty strings are valid UTF-8
1443+ {
1444+ auto column = ColumnString::create ();
1445+ column->insert_data (" " , 0 );
1446+ column->insert_data (" " , 0 );
1447+ EXPECT_TRUE (column->is_valid_utf8 ());
1448+ }
1449+ // multi-byte UTF-8 characters
1450+ {
1451+ auto column = ColumnString::create ();
1452+ column->insert_data (" Hello, 世界" , strlen (" Hello, 世界" ));
1453+ column->insert_data (" こんにちは" , strlen (" こんにちは" ));
1454+ column->insert_data (" 😀" , strlen (" 😀" ));
1455+ EXPECT_TRUE (column->is_valid_utf8 ());
1456+ }
1457+ // invalid: lone continuation byte 0x80
1458+ {
1459+ auto column = ColumnString::create ();
1460+ const char data[] = {' \x80 ' };
1461+ column->insert_data (data, 1 );
1462+ EXPECT_FALSE (column->is_valid_utf8 ());
1463+ }
1464+ // invalid: bad 2-byte sequence 0xC3 0x28
1465+ {
1466+ auto column = ColumnString::create ();
1467+ const char data[] = {' \xc3 ' , ' \x28 ' };
1468+ column->insert_data (data, 2 );
1469+ EXPECT_FALSE (column->is_valid_utf8 ());
1470+ }
1471+ // invalid: overlong encoding 0xC0 0xAF
1472+ {
1473+ auto column = ColumnString::create ();
1474+ const char data[] = {' \xc0 ' , ' \xaf ' };
1475+ column->insert_data (data, 2 );
1476+ EXPECT_FALSE (column->is_valid_utf8 ());
1477+ }
1478+ // invalid: 0xFE byte
1479+ {
1480+ auto column = ColumnString::create ();
1481+ const char data[] = {' \xfe ' };
1482+ column->insert_data (data, 1 );
1483+ EXPECT_FALSE (column->is_valid_utf8 ());
1484+ }
1485+ // invalid: truncated 3-byte sequence 0xE4 0xB8
1486+ {
1487+ auto column = ColumnString::create ();
1488+ const char data[] = {' \xe4 ' , ' \xb8 ' };
1489+ column->insert_data (data, 2 );
1490+ EXPECT_FALSE (column->is_valid_utf8 ());
1491+ }
1492+ // mixed: one invalid byte makes the whole column invalid
1493+ {
1494+ auto column = ColumnString::create ();
1495+ column->insert_data (" hello" , 5 );
1496+ const char bad[] = {' \xff ' };
1497+ column->insert_data (bad, 1 );
1498+ column->insert_data (" world" , 5 );
1499+ EXPECT_FALSE (column->is_valid_utf8 ());
1500+ }
1501+ // cross-row concatenation: "\xE4" + "\xB8\x96" form valid UTF-8 (世) when
1502+ // concatenated, but each row is invalid individually. Must validate per-row.
1503+ {
1504+ auto column = ColumnString::create ();
1505+ const char row1[] = {' \xe4 ' };
1506+ const char row2[] = {' \xb8 ' , ' \x96 ' };
1507+ column->insert_data (row1, 1 );
1508+ column->insert_data (row2, 2 );
1509+ EXPECT_FALSE (column->is_valid_utf8 ());
1510+ }
1511+ }
1512+
14281513} // namespace doris
0 commit comments