2222#include < fstream>
2323#include < cstring>
2424#include < array>
25+ #include < algorithm>
2526
2627#include < pugixml.hpp>
2728
@@ -497,12 +498,24 @@ namespace tivars::TypeHandlers
497498 uint16_t lastTokenBytes = 0 ;
498499 };
499500
500- struct TokenScanCheckpoint
501+ static void register_token_lookup_name ( const std::string& name, uint16_t tokenValue)
501502 {
502- size_t strPos = 0 ;
503- size_t rawLen = 0 ;
504- TokenScanState state;
505- };
503+ if (name.empty ())
504+ {
505+ return ;
506+ }
507+
508+ tokens_NameToBytes[name] = tokenValue;
509+ if (name.size () > lengthOfLongestTokenName)
510+ {
511+ lengthOfLongestTokenName = static_cast <uint8_t >(name.size ());
512+ }
513+ }
514+
515+ static bool can_start_explicit_string_alias (char c)
516+ {
517+ return std::string_view (" []{}|^_" ).find (c) != std::string_view::npos;
518+ }
506519
507520 template <typename OnToken, typename OnSkipped>
508521 static void scan_source_tokens_impl (const std::string& str, bool detect_strings, TokenScanState& state, OnToken&& onToken, OnSkipped&& onSkipped)
@@ -572,9 +585,28 @@ namespace tivars::TypeHandlers
572585 const bool needMinMunch = state.isInCustomName || (state.isWithinString && !state.inEvaluatedString );
573586 bool matched = false ;
574587
588+ if (state.isWithinString && !state.inEvaluatedString && can_start_explicit_string_alias (str[strCursorPos]))
589+ {
590+ for (size_t currentLength = maxTokSearchLen; currentLength > 1 ; currentLength--)
591+ {
592+ const std::string currentSubString = str.substr (strCursorPos, currentLength);
593+ const auto tokenIt = tokens_NameToBytes.find (currentSubString);
594+ if (tokenIt == tokens_NameToBytes.end ())
595+ {
596+ continue ;
597+ }
598+
599+ onToken (currentSubString, tokenIt->second );
600+ strCursorPos += currentLength - 1 ;
601+ state.lastTokenBytes = tokenIt->second ;
602+ matched = true ;
603+ break ;
604+ }
605+ }
606+
575607 /* needMinMunch => minimum token length, otherwise maximal munch */
576608 for (size_t currentLength = needMinMunch ? 1 : maxTokSearchLen;
577- needMinMunch ? (currentLength <= maxTokSearchLen) : (currentLength > 0 );
609+ !matched && ( needMinMunch ? (currentLength <= maxTokSearchLen) : (currentLength > 0 ) );
578610 currentLength += (needMinMunch ? 1 : -1 ))
579611 {
580612 std::string currentSubString = str.substr (strCursorPos, currentLength);
@@ -634,11 +666,6 @@ namespace tivars::TypeHandlers
634666 return data;
635667 }
636668
637- static data_t tokenize_source_to_raw_bytes (const std::string& str, bool detect_strings = true )
638- {
639- return tokenize_source_to_raw_bytes (str, detect_strings, TokenScanState{});
640- }
641-
642669 static void advance_token_scan_state (const std::string& str, TokenScanState& state)
643670 {
644671 scan_source_tokens_impl (str, true , state,
@@ -659,6 +686,30 @@ namespace tivars::TypeHandlers
659686 }
660687 }
661688
689+ static bool append_can_merge_with_previous_token (const std::string& existing, const std::string& appended)
690+ {
691+ if (existing.empty () || appended.empty () || lengthOfLongestTokenName < 2 )
692+ {
693+ return false ;
694+ }
695+
696+ const size_t maxSuffixLen = std::min (existing.size (), static_cast <size_t >(lengthOfLongestTokenName - 1 ));
697+ for (size_t suffixLen = 1 ; suffixLen <= maxSuffixLen; suffixLen++)
698+ {
699+ const std::string suffix = existing.substr (existing.size () - suffixLen);
700+ const size_t maxPrefixLen = std::min (appended.size (), static_cast <size_t >(lengthOfLongestTokenName) - suffixLen);
701+ for (size_t prefixLen = 1 ; prefixLen <= maxPrefixLen; prefixLen++)
702+ {
703+ if (tokens_NameToBytes.contains (suffix + appended.substr (0 , prefixLen)))
704+ {
705+ return true ;
706+ }
707+ }
708+ }
709+
710+ return false ;
711+ }
712+
662713 static std::vector<std::string> get_detok_alias_candidates (uint16_t bytesKey, uint8_t langIdx, const std::string& primaryDisplay)
663714 {
664715 using TH_Tokenized::LANG_EN;
@@ -682,13 +733,13 @@ namespace tivars::TypeHandlers
682733 }
683734 };
684735
685- append_aliases (tokenNames.variants , langIdx);
686736 append_aliases (tokenNames.accessibles , langIdx);
737+ append_aliases (tokenNames.variants , langIdx);
687738
688739 if (langIdx != LANG_EN)
689740 {
690- append_aliases (tokenNames.variants , LANG_EN);
691741 append_aliases (tokenNames.accessibles , LANG_EN);
742+ append_aliases (tokenNames.variants , LANG_EN);
692743 }
693744
694745 return candidates;
@@ -785,9 +836,7 @@ namespace tivars::TypeHandlers
785836 auto & tokenNames = tokens_BytesToNames[bytes];
786837 auto & aliasList = std::strcmp (which, " variant" ) == 0 ? tokenNames.variants [langIdx] : tokenNames.accessibles [langIdx];
787838 append_unique_string (aliasList, s);
788- tokens_NameToBytes[s] = bytes;
789- if (s.size () > lengthOfLongestTokenName)
790- lengthOfLongestTokenName = (uint8_t )s.size ();
839+ register_token_lookup_name (s, bytes);
791840 }
792841 }
793842 }
@@ -803,13 +852,9 @@ namespace tivars::TypeHandlers
803852 if (!en.empty () || !fr.empty ())
804853 tokens_BytesToNames[bytes].display = { en, fr.empty () ? en : fr };
805854 if (!en.empty ())
806- tokens_NameToBytes[en] = bytes;
855+ register_token_lookup_name (en, bytes) ;
807856 if (!fr.empty ())
808- tokens_NameToBytes[fr] = bytes;
809- if (en.size () > lengthOfLongestTokenName)
810- lengthOfLongestTokenName = (uint8_t )en.size ();
811- if (fr.size () > lengthOfLongestTokenName)
812- lengthOfLongestTokenName = (uint8_t )fr.size ();
857+ register_token_lookup_name (fr, bytes);
813858
814859 register_aliases (bytes, tokenNode);
815860 };
@@ -975,46 +1020,28 @@ namespace tivars::TypeHandlers
9751020 std::string str;
9761021 data_t verifiedRawBytes;
9771022 TokenScanState detokState{};
978- std::vector<TokenScanCheckpoint> detokCheckpoints = { {0 , 0 , detokState} };
979- const size_t detokValidationLookbehind = std::max<size_t >(lengthOfLongestTokenName, 6 ) + 1 ;
9801023
9811024 auto validate_detok_token = [&](const std::string& token, const data_t & tokenRawBytes)
9821025 {
983- const size_t tailMinPos = str.size () > detokValidationLookbehind ? str.size () - detokValidationLookbehind : 0 ;
984- auto checkpointIt = std::upper_bound (detokCheckpoints.begin (), detokCheckpoints.end (), tailMinPos,
985- [](size_t pos, const TokenScanCheckpoint& checkpoint)
986- {
987- return pos < checkpoint.strPos ;
988- });
989- if (checkpointIt != detokCheckpoints.begin ())
1026+ if (tokenize_source_to_raw_bytes (token, true , detokState) != tokenRawBytes)
9901027 {
991- checkpointIt-- ;
1028+ return false ;
9921029 }
993-
994- std::string candidateSegment = str.substr (checkpointIt->strPos );
995- candidateSegment += token;
996-
997- data_t expectedSegmentBytes (verifiedRawBytes.begin () + (long long )checkpointIt->rawLen , verifiedRawBytes.end ());
998- tivars::vector_append (expectedSegmentBytes, tokenRawBytes);
999-
1000- const data_t actualBytes = tokenize_source_to_raw_bytes (candidateSegment, true , checkpointIt->state );
1001- if (actualBytes == expectedSegmentBytes)
1030+ if (!append_can_merge_with_previous_token (str, token))
10021031 {
10031032 return true ;
10041033 }
10051034
1006- const std::string fullCandidate = str + token;
10071035 data_t expectedFullBytes = verifiedRawBytes;
10081036 tivars::vector_append (expectedFullBytes, tokenRawBytes);
1009- return tokenize_source_to_raw_bytes (fullCandidate ) == expectedFullBytes;
1037+ return tokenize_source_to_raw_bytes (str + token, true , TokenScanState{} ) == expectedFullBytes;
10101038 };
10111039
10121040 auto accept_detok_token = [&](const std::string& token, const data_t & tokenRawBytes)
10131041 {
10141042 str += token;
10151043 tivars::vector_append (verifiedRawBytes, tokenRawBytes);
10161044 advance_token_scan_state (token, detokState);
1017- detokCheckpoints.push_back ({str.size (), verifiedRawBytes.size (), detokState});
10181045 };
10191046
10201047 for (size_t i = fromRawBytes ? 0 : 2 ; i < dataSize; i++)
@@ -1054,24 +1081,21 @@ namespace tivars::TypeHandlers
10541081 {
10551082 bool acceptedFallback = false ;
10561083
1057- const std::string escapedToken = " \\ " + tokStr;
1058- if (validate_detok_token (escapedToken, currentRawBytes))
1084+ for (const auto & aliasCandidate : get_detok_alias_candidates (bytesKey, langIdx, tokStr))
10591085 {
1060- accept_detok_token (escapedToken, currentRawBytes);
1061- acceptedFallback = true ;
1086+ if (validate_detok_token (aliasCandidate, currentRawBytes))
1087+ {
1088+ accept_detok_token (aliasCandidate, currentRawBytes);
1089+ acceptedFallback = true ;
1090+ break ;
1091+ }
10621092 }
10631093
1064- if (!acceptedFallback)
1094+ const std::string escapedToken = " \\ " + tokStr;
1095+ if (!acceptedFallback && validate_detok_token (escapedToken, currentRawBytes))
10651096 {
1066- for (const auto & aliasCandidate : get_detok_alias_candidates (bytesKey, langIdx, tokStr))
1067- {
1068- if (validate_detok_token (aliasCandidate, currentRawBytes))
1069- {
1070- accept_detok_token (aliasCandidate, currentRawBytes);
1071- acceptedFallback = true ;
1072- break ;
1073- }
1074- }
1097+ accept_detok_token (escapedToken, currentRawBytes);
1098+ acceptedFallback = true ;
10751099 }
10761100
10771101 if (!acceptedFallback)
0 commit comments