Skip to content

Commit a6c434e

Browse files
committed
TH_Tokenized: use accessible detok before raw escapes.
1 parent 0436db6 commit a6c434e

2 files changed

Lines changed: 96 additions & 58 deletions

File tree

src/TypeHandlers/TH_Tokenized.cpp

Lines changed: 82 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
#include <fstream>
2323
#include <cstring>
2424
#include <array>
25+
#include <algorithm>
2526

2627
#include <pugixml.hpp>
2728

@@ -497,12 +498,24 @@ namespace tivars::TypeHandlers
497498
uint16_t lastTokenBytes = 0;
498499
};
499500

500-
struct TokenScanCheckpoint
501+
// Registers `name` as a lookup key for `tokenValue`.
//
// Stores the mapping in tokens_NameToBytes (assigning through operator[], so an
// existing entry for the same name is replaced) and raises
// lengthOfLongestTokenName when `name` is longer than the current maximum.
// Empty names are ignored and leave both globals untouched.
//
// NOTE(review): the length is narrowed with static_cast<uint8_t>; a name longer
// than 255 bytes would wrap to a smaller value - confirm names are bounded.
static void register_token_lookup_name(const std::string& name, uint16_t tokenValue)
501502
{
502-
size_t strPos = 0;
503-
size_t rawLen = 0;
504-
TokenScanState state;
505-
};
503+
// Nothing to register for an empty name.
if (name.empty())
504+
{
505+
return;
506+
}
507+
508+
tokens_NameToBytes[name] = tokenValue;
509+
// Keep the longest-name bound in sync with the names stored in the map.
if (name.size() > lengthOfLongestTokenName)
510+
{
511+
lengthOfLongestTokenName = static_cast<uint8_t>(name.size());
512+
}
513+
}
514+
515+
// Returns true when `c` is one of the seven characters in "[]{}|^_",
// false for any other character.
static bool can_start_explicit_string_alias(char c)
516+
{
517+
return std::string_view("[]{}|^_").find(c) != std::string_view::npos;
518+
}
506519

507520
template<typename OnToken, typename OnSkipped>
508521
static void scan_source_tokens_impl(const std::string& str, bool detect_strings, TokenScanState& state, OnToken&& onToken, OnSkipped&& onSkipped)
@@ -572,9 +585,28 @@ namespace tivars::TypeHandlers
572585
const bool needMinMunch = state.isInCustomName || (state.isWithinString && !state.inEvaluatedString);
573586
bool matched = false;
574587

588+
if (state.isWithinString && !state.inEvaluatedString && can_start_explicit_string_alias(str[strCursorPos]))
589+
{
590+
for (size_t currentLength = maxTokSearchLen; currentLength > 1; currentLength--)
591+
{
592+
const std::string currentSubString = str.substr(strCursorPos, currentLength);
593+
const auto tokenIt = tokens_NameToBytes.find(currentSubString);
594+
if (tokenIt == tokens_NameToBytes.end())
595+
{
596+
continue;
597+
}
598+
599+
onToken(currentSubString, tokenIt->second);
600+
strCursorPos += currentLength - 1;
601+
state.lastTokenBytes = tokenIt->second;
602+
matched = true;
603+
break;
604+
}
605+
}
606+
575607
/* needMinMunch => minimum token length, otherwise maximal munch */
576608
for (size_t currentLength = needMinMunch ? 1 : maxTokSearchLen;
577-
needMinMunch ? (currentLength <= maxTokSearchLen) : (currentLength > 0);
609+
!matched && (needMinMunch ? (currentLength <= maxTokSearchLen) : (currentLength > 0));
578610
currentLength += (needMinMunch ? 1 : -1))
579611
{
580612
std::string currentSubString = str.substr(strCursorPos, currentLength);
@@ -634,11 +666,6 @@ namespace tivars::TypeHandlers
634666
return data;
635667
}
636668

637-
static data_t tokenize_source_to_raw_bytes(const std::string& str, bool detect_strings = true)
638-
{
639-
return tokenize_source_to_raw_bytes(str, detect_strings, TokenScanState{});
640-
}
641-
642669
static void advance_token_scan_state(const std::string& str, TokenScanState& state)
643670
{
644671
scan_source_tokens_impl(str, true, state,
@@ -659,6 +686,30 @@ namespace tivars::TypeHandlers
659686
}
660687
}
661688

689+
// Reports whether a token name could straddle the boundary between `existing`
// and `appended`.
//
// Returns true when some non-empty suffix of `existing` concatenated with some
// non-empty prefix of `appended` is a key of tokens_NameToBytes. Only
// combinations whose total length does not exceed lengthOfLongestTokenName are
// tried. Returns false when either string is empty, when
// lengthOfLongestTokenName is below 2, or when no combination is a known name.
static bool append_can_merge_with_previous_token(const std::string& existing, const std::string& appended)
690+
{
691+
// A straddling name needs at least one character from each side.
if (existing.empty() || appended.empty() || lengthOfLongestTokenName < 2)
692+
{
693+
return false;
694+
}
695+
696+
// The suffix is capped one below the longest name so at least one appended character still fits.
const size_t maxSuffixLen = std::min(existing.size(), static_cast<size_t>(lengthOfLongestTokenName - 1));
697+
for (size_t suffixLen = 1; suffixLen <= maxSuffixLen; suffixLen++)
698+
{
699+
const std::string suffix = existing.substr(existing.size() - suffixLen);
700+
// Cap the prefix so suffix + prefix never exceeds the longest known name.
const size_t maxPrefixLen = std::min(appended.size(), static_cast<size_t>(lengthOfLongestTokenName) - suffixLen);
701+
for (size_t prefixLen = 1; prefixLen <= maxPrefixLen; prefixLen++)
702+
{
703+
if (tokens_NameToBytes.contains(suffix + appended.substr(0, prefixLen)))
704+
{
705+
return true;
706+
}
707+
}
708+
}
709+
710+
return false;
711+
}
712+
662713
static std::vector<std::string> get_detok_alias_candidates(uint16_t bytesKey, uint8_t langIdx, const std::string& primaryDisplay)
663714
{
664715
using TH_Tokenized::LANG_EN;
@@ -682,13 +733,13 @@ namespace tivars::TypeHandlers
682733
}
683734
};
684735

685-
append_aliases(tokenNames.variants, langIdx);
686736
append_aliases(tokenNames.accessibles, langIdx);
737+
append_aliases(tokenNames.variants, langIdx);
687738

688739
if (langIdx != LANG_EN)
689740
{
690-
append_aliases(tokenNames.variants, LANG_EN);
691741
append_aliases(tokenNames.accessibles, LANG_EN);
742+
append_aliases(tokenNames.variants, LANG_EN);
692743
}
693744

694745
return candidates;
@@ -785,9 +836,7 @@ namespace tivars::TypeHandlers
785836
auto& tokenNames = tokens_BytesToNames[bytes];
786837
auto& aliasList = std::strcmp(which, "variant") == 0 ? tokenNames.variants[langIdx] : tokenNames.accessibles[langIdx];
787838
append_unique_string(aliasList, s);
788-
tokens_NameToBytes[s] = bytes;
789-
if (s.size() > lengthOfLongestTokenName)
790-
lengthOfLongestTokenName = (uint8_t)s.size();
839+
register_token_lookup_name(s, bytes);
791840
}
792841
}
793842
}
@@ -803,13 +852,9 @@ namespace tivars::TypeHandlers
803852
if (!en.empty() || !fr.empty())
804853
tokens_BytesToNames[bytes].display = { en, fr.empty() ? en : fr };
805854
if (!en.empty())
806-
tokens_NameToBytes[en] = bytes;
855+
register_token_lookup_name(en, bytes);
807856
if (!fr.empty())
808-
tokens_NameToBytes[fr] = bytes;
809-
if (en.size() > lengthOfLongestTokenName)
810-
lengthOfLongestTokenName = (uint8_t)en.size();
811-
if (fr.size() > lengthOfLongestTokenName)
812-
lengthOfLongestTokenName = (uint8_t)fr.size();
857+
register_token_lookup_name(fr, bytes);
813858

814859
register_aliases(bytes, tokenNode);
815860
};
@@ -975,46 +1020,28 @@ namespace tivars::TypeHandlers
9751020
std::string str;
9761021
data_t verifiedRawBytes;
9771022
TokenScanState detokState{};
978-
std::vector<TokenScanCheckpoint> detokCheckpoints = { {0, 0, detokState} };
979-
const size_t detokValidationLookbehind = std::max<size_t>(lengthOfLongestTokenName, 6) + 1;
9801023

9811024
auto validate_detok_token = [&](const std::string& token, const data_t& tokenRawBytes)
9821025
{
983-
const size_t tailMinPos = str.size() > detokValidationLookbehind ? str.size() - detokValidationLookbehind : 0;
984-
auto checkpointIt = std::upper_bound(detokCheckpoints.begin(), detokCheckpoints.end(), tailMinPos,
985-
[](size_t pos, const TokenScanCheckpoint& checkpoint)
986-
{
987-
return pos < checkpoint.strPos;
988-
});
989-
if (checkpointIt != detokCheckpoints.begin())
1026+
if (tokenize_source_to_raw_bytes(token, true, detokState) != tokenRawBytes)
9901027
{
991-
checkpointIt--;
1028+
return false;
9921029
}
993-
994-
std::string candidateSegment = str.substr(checkpointIt->strPos);
995-
candidateSegment += token;
996-
997-
data_t expectedSegmentBytes(verifiedRawBytes.begin() + (long long)checkpointIt->rawLen, verifiedRawBytes.end());
998-
tivars::vector_append(expectedSegmentBytes, tokenRawBytes);
999-
1000-
const data_t actualBytes = tokenize_source_to_raw_bytes(candidateSegment, true, checkpointIt->state);
1001-
if (actualBytes == expectedSegmentBytes)
1030+
if (!append_can_merge_with_previous_token(str, token))
10021031
{
10031032
return true;
10041033
}
10051034

1006-
const std::string fullCandidate = str + token;
10071035
data_t expectedFullBytes = verifiedRawBytes;
10081036
tivars::vector_append(expectedFullBytes, tokenRawBytes);
1009-
return tokenize_source_to_raw_bytes(fullCandidate) == expectedFullBytes;
1037+
return tokenize_source_to_raw_bytes(str + token, true, TokenScanState{}) == expectedFullBytes;
10101038
};
10111039

10121040
auto accept_detok_token = [&](const std::string& token, const data_t& tokenRawBytes)
10131041
{
10141042
str += token;
10151043
tivars::vector_append(verifiedRawBytes, tokenRawBytes);
10161044
advance_token_scan_state(token, detokState);
1017-
detokCheckpoints.push_back({str.size(), verifiedRawBytes.size(), detokState});
10181045
};
10191046

10201047
for (size_t i = fromRawBytes ? 0 : 2; i < dataSize; i++)
@@ -1054,24 +1081,21 @@ namespace tivars::TypeHandlers
10541081
{
10551082
bool acceptedFallback = false;
10561083

1057-
const std::string escapedToken = "\\" + tokStr;
1058-
if (validate_detok_token(escapedToken, currentRawBytes))
1084+
for (const auto& aliasCandidate : get_detok_alias_candidates(bytesKey, langIdx, tokStr))
10591085
{
1060-
accept_detok_token(escapedToken, currentRawBytes);
1061-
acceptedFallback = true;
1086+
if (validate_detok_token(aliasCandidate, currentRawBytes))
1087+
{
1088+
accept_detok_token(aliasCandidate, currentRawBytes);
1089+
acceptedFallback = true;
1090+
break;
1091+
}
10621092
}
10631093

1064-
if (!acceptedFallback)
1094+
const std::string escapedToken = "\\" + tokStr;
1095+
if (!acceptedFallback && validate_detok_token(escapedToken, currentRawBytes))
10651096
{
1066-
for (const auto& aliasCandidate : get_detok_alias_candidates(bytesKey, langIdx, tokStr))
1067-
{
1068-
if (validate_detok_token(aliasCandidate, currentRawBytes))
1069-
{
1070-
accept_detok_token(aliasCandidate, currentRawBytes);
1071-
acceptedFallback = true;
1072-
break;
1073-
}
1074-
}
1097+
accept_detok_token(escapedToken, currentRawBytes);
1098+
acceptedFallback = true;
10751099
}
10761100

10771101
if (!acceptedFallback)

tests.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2368,6 +2368,20 @@ End)";
23682368
assert(longProtectedProgram.getVarEntries()[0]._type.getName() == "ProtectedProgram");
23692369
const std::string longProtectedName = entry_name_to_string(longProtectedProgram.getVarEntries()[0]._type, longProtectedProgram.getVarEntries()[0].varname);
23702370
assert(!longProtectedName.empty());
2371+
{
2372+
ScopedStderrCapture longProtectedDetokenizeStderr;
2373+
const std::string longProtectedReadable = longProtectedProgram.getReadableContent();
2374+
assert(longProtectedDetokenizeStderr.str().find("non-roundtrippable") == std::string::npos);
2375+
assert(longProtectedReadable.find("\"Q|ui[t][t][|e][r]→Str7") != std::string::npos);
2376+
assert(longProtectedReadable.find("\"R[|e][t]o|u[r]→Str9") != std::string::npos);
2377+
assert(longProtectedReadable.find("\\u5E80") == std::string::npos);
2378+
assert(longProtectedReadable.find("\\u621A") == std::string::npos);
2379+
assert(longProtectedReadable.find("\\u6212") == std::string::npos);
2380+
assert(longProtectedReadable.find("\\u6224") == std::string::npos);
2381+
TIVarFile recreatedLongProtectedProgram = TIVarFile::createNew("ProtectedProgram", longProtectedName);
2382+
recreatedLongProtectedProgram.setContentFromString(longProtectedReadable);
2383+
assert(recreatedLongProtectedProgram.getRawContent() == longProtectedProgram.getRawContent());
2384+
}
23712385
const std::string longProtectedPath = "/tmp/"s + longProtectedName + ".8xp";
23722386
assert(longProtectedProgram.saveVarToFile("/tmp", "") == longProtectedPath);
23732387
assert(remove(longProtectedPath.c_str()) == 0);

0 commit comments

Comments
 (0)