Skip to content

Commit 8f053b7

Browse files
committed
GH-49614: [C++] Fix silent truncation in base64_decode on invalid input
1 parent df88383 commit 8f053b7

File tree

2 files changed

+86
-1
lines changed

2 files changed

+86
-1
lines changed

cpp/src/arrow/util/string_test.cc

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "arrow/testing/gtest_util.h"
2929
#include "arrow/util/regex.h"
3030
#include "arrow/util/string.h"
31+
#include "arrow/util/base64.h"
3132

3233
namespace arrow {
3334
namespace internal {
@@ -238,6 +239,49 @@ TEST(ToChars, FloatingPoint) {
238239
}
239240
}
240241

242+
TEST(Base64DecodeTest, ValidInputs) {  // Well-formed inputs must decode to their exact plaintext.
243+
EXPECT_EQ(arrow::util::base64_decode("Zg=="), "f");  // final group carries two '=' padding characters
244+
EXPECT_EQ(arrow::util::base64_decode("Zm8="), "fo");  // final group carries one '=' padding character
245+
EXPECT_EQ(arrow::util::base64_decode("Zm9v"), "foo");  // no padding at all
246+
EXPECT_EQ(arrow::util::base64_decode("aGVsbG8gd29ybGQ="), "hello world");  // several 4-character groups
247+
}
248+
249+
TEST(Base64DecodeTest, InvalidLength) {  // Inputs whose length is not a multiple of 4 must decode to "".
250+
EXPECT_EQ(arrow::util::base64_decode("abc"), "");  // length 3
251+
EXPECT_EQ(arrow::util::base64_decode("abcde"), "");  // length 5
252+
}
253+
254+
TEST(Base64DecodeTest, InvalidCharacters) {  // Characters outside the base64 alphabet must yield "".
255+
EXPECT_EQ(arrow::util::base64_decode("ab$="), "");  // '$' ahead of a single padding character
256+
EXPECT_EQ(arrow::util::base64_decode("Zm9*"), "");  // length 4, so '*' (not the length check) causes the rejection
257+
EXPECT_EQ(arrow::util::base64_decode("abcd$AAA"), "");  // '$' opening the second 4-character group
258+
}
259+
260+
TEST(Base64DecodeTest, InvalidPadding) {  // Misplaced or excessive '=' must yield ""; every input has a length that is a multiple of 4.
261+
EXPECT_EQ(arrow::util::base64_decode("ab=c"), "");  // data character after the first '='
262+
EXPECT_EQ(arrow::util::base64_decode("a==="), "");  // three padding characters, more than the two allowed
263+
EXPECT_EQ(arrow::util::base64_decode("abcd=AAA"), "");  // '=' opening the second group, followed by data
264+
EXPECT_EQ(arrow::util::base64_decode("Zm=9Zm9v"), "");  // '=' inside the first group of a longer input
265+
}
266+
267+
TEST(Base64DecodeTest, EdgeCases) {  // Boundary inputs around the padding rules.
268+
EXPECT_EQ(arrow::util::base64_decode("===="), "");  // nothing but padding: four '=' characters
269+
EXPECT_EQ(arrow::util::base64_decode("TQ=="), "M");  // shortest padded input: one decoded byte
270+
}
271+
272+
TEST(Base64DecodeTest, EmptyInput) {  // The empty string is expected to decode to the empty string.
273+
EXPECT_EQ(arrow::util::base64_decode(""), "");  // length 0
274+
}
275+
276+
TEST(Base64DecodeTest, NonAsciiInput) {  // A byte outside the ASCII range must yield "".
277+
std::string input = "abc\xFF";  // exactly 4 bytes, so the 0xFF byte (not the length check) causes the rejection
278+
EXPECT_EQ(arrow::util::base64_decode(input), "");  // expected result for the non-ASCII byte is ""
279+
}
280+
281+
TEST(Base64DecodeTest, PartialCorruption) {  // One bad character in otherwise valid data must reject the whole input.
282+
EXPECT_EQ(arrow::util::base64_decode("aGVs$G8gd29ybGQ="), "");  // "aGVsbG8gd29ybGQ=" with its fifth character replaced by '$'
283+
}
284+
241285
#if !defined(_WIN32) || defined(NDEBUG)
242286

243287
TEST(ToChars, LocaleIndependent) {

cpp/src/arrow/vendored/base64.cpp

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@
3030
*/
3131

3232
#include "arrow/util/base64.h"
33+
#include "arrow/util/logging.h"
3334
#include <iostream>
35+
#include <cctype>
3436

3537
namespace arrow {
3638
namespace util {
@@ -101,7 +103,46 @@ std::string base64_decode(std::string_view encoded_string) {
101103
unsigned char char_array_4[4], char_array_3[3];
102104
std::string ret;
103105

104-
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
106+
static const std::string base64_chars =
107+
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
108+
"abcdefghijklmnopqrstuvwxyz"
109+
"0123456789+/";
110+
111+
auto is_base64 = [](unsigned char c) -> bool {
112+
return (std::isalnum(c) || (c == '+') || (c == '/'));
113+
};
114+
115+
if (encoded_string.size() % 4 != 0) {
116+
ARROW_LOG(ERROR) << "Invalid base64 input: length is not a multiple of 4";
117+
return "";
118+
}
119+
120+
size_t padding_start = encoded_string.find('=');
121+
122+
if (padding_start != std::string::npos) {
123+
for (size_t k = padding_start; k < encoded_string.size(); ++k) {
124+
if (encoded_string[k] != '=') {
125+
ARROW_LOG(ERROR) << "Invalid base64 input: padding character '=' found at invalid position";
126+
return "";
127+
}
128+
}
129+
130+
size_t padding_count = encoded_string.size() - padding_start;
131+
132+
if (padding_count > 2) {
133+
ARROW_LOG(ERROR) << "Invalid base64 input: too many padding characters";
134+
return "";
135+
}
136+
}
137+
138+
for (char c : encoded_string) {
139+
if (c != '=' && !is_base64(c)) {
140+
ARROW_LOG(ERROR) << "Invalid base64 input: contains non-base64 character '" << c << "'";
141+
return "";
142+
}
143+
}
144+
145+
while (in_len-- && encoded_string[in_] != '=') {
105146
char_array_4[i++] = encoded_string[in_]; in_++;
106147
if (i ==4) {
107148
for (i = 0; i <4; i++)

0 commit comments

Comments
 (0)