Skip to content

Commit 6dd07b1

Browse files
authored
GH-49720: [C++] Optimize base64_decode validation using lookup table (#49748)
### Rationale for this change

The current implementation of `base64_decode` validates characters using `std::string::find` for each byte, which introduces unnecessary overhead due to repeated linear searches. This change replaces those lookups with a precomputed 256-entry lookup table, enabling constant-time validation and value lookup per character.

### What changes are included in this PR?

- Introduced a static lookup table (`kBase64Lookup`) to map base64 characters to their corresponding values
- Replaced `std::string::find` with constant-time table lookup for character validation

### Are these changes tested?

Yes. Existing base64 decoding behavior remains unchanged and continues to pass all current tests. This change is a performance optimization and does not alter functional output.

### Are there any user-facing changes?

No. This change is internal and does not affect public APIs.

* GitHub Issue: #49720

Authored-by: Aaditya Srinivasan <aadityasri03@gmail.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
1 parent 8595105 commit 6dd07b1

File tree

1 file changed

+18
-9
lines changed

1 file changed

+18
-9
lines changed

cpp/src/arrow/vendored/base64.cpp

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,29 @@
3030
*/
3131

3232
#include "arrow/util/base64.h"
33+
#include <array>
34+
#include <cstdint>
3335
#include <iostream>
3436

3537
namespace arrow {
3638
namespace util {
3739

38-
static const std::string base64_chars =
40+
// The base64 alphabet. A symbol's index in this string is the 6-bit value it
// encodes ('A' -> 0, ..., '/' -> 63).
constexpr std::string_view base64_chars =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

static_assert(base64_chars.size() == 64, "base64 alphabet must have 64 symbols");

// Reverse mapping from a byte value (0-255) to its 6-bit base64 value, or -1
// when the byte is not part of the alphabet (this includes the '=' padding
// character, which does not appear in base64_chars).
//
// The table is evaluated at compile time so that it is constant-initialized:
// it is ready before any dynamic initializer runs, and no startup work is
// needed. Plain indexed loops are used instead of std::array::fill because
// fill() only became constexpr in C++20.
static constexpr std::array<int8_t, 256> kBase64Lookup = [] {
  std::array<int8_t, 256> table{};

  // Mark every byte as invalid first.
  for (size_t i = 0; i < table.size(); ++i) {
    table[i] = -1;
  }

  // Then record the value of each alphabet symbol at that symbol's byte index.
  for (size_t i = 0; i < base64_chars.size(); ++i) {
    table[static_cast<uint8_t>(base64_chars[i])] = static_cast<int8_t>(i);
  }

  return table;
}();
55+
4356
static std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
4457
std::string ret;
4558
int i = 0;
@@ -119,22 +132,18 @@ Result<std::string> base64_decode(std::string_view encoded_string) {
119132
return Status::Invalid("Invalid base64 input: padding in wrong position");
120133
}
121134

122-
if (base64_chars.find(c) == std::string::npos) {
135+
int8_t val = kBase64Lookup[static_cast<uint8_t>(c)];
136+
137+
if (val == -1) {
123138
return Status::Invalid("Invalid base64 input: character is not valid base64 character");
124139
}
125140

126-
char_array_4[i++] = c;
141+
char_array_4[i++] = val;
127142
}
128143

129144
in_++;
130145

131146
if (i == 4) {
132-
for (i = 0; i < 4; i++) {
133-
if (char_array_4[i] != 0) {
134-
char_array_4[i] = base64_chars.find(char_array_4[i]) & 0xff;
135-
}
136-
}
137-
138147
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
139148
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
140149
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];

0 commit comments

Comments
 (0)