@@ -753,6 +753,35 @@ static std::vector<size_t> unicode_regex_split_custom_afmoe(const std::string &
753753 return bpe_offsets;
754754}
755755
756+ // regex: [^\n]+|[\n]+
757+ // splits text into runs of non-newline characters and runs of newline characters
758+ static std::vector<size_t > unicode_regex_split_custom_newlines (const std::string & text, const std::vector<size_t > & offsets) {
759+ std::vector<size_t > bpe_offsets;
760+ bpe_offsets.reserve (offsets.size ());
761+
762+ const auto cpts = unicode_cpts_from_utf8 (text);
763+
764+ size_t start = 0 ;
765+ for (auto offset : offsets) {
766+ const size_t offset_ini = start;
767+ const size_t offset_end = start + offset;
768+ assert (offset_end <= cpts.size ());
769+ start = offset_end;
770+
771+ size_t pos = offset_ini;
772+ while (pos < offset_end) {
773+ const bool is_newline = (cpts[pos] == ' \n ' );
774+ const size_t run_start = pos;
775+ while (pos < offset_end && (cpts[pos] == ' \n ' ) == is_newline) {
776+ pos++;
777+ }
778+ bpe_offsets.push_back (pos - run_start);
779+ }
780+ }
781+
782+ return bpe_offsets;
783+ }
784+
756785static std::vector<size_t > unicode_regex_split_custom (const std::string & text, const std::string & regex_expr, const std::vector<size_t > & offsets) {
757786 std::vector<size_t > bpe_offsets;
758787
@@ -769,6 +798,8 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
769798 } else if (regex_expr == " \\ p{AFMoE_digits}" ) {
770799 // AFMOE digit pattern - use custom implementation for proper splitting
771800 bpe_offsets = unicode_regex_split_custom_afmoe (text, offsets);
801+ } else if (regex_expr == " [^\\ n]+|[\\ n]+" ) {
802+ bpe_offsets = unicode_regex_split_custom_newlines (text, offsets);
772803 } else if (regex_expr == " \\ d{1,3}(?=(?:\\ d{3})*\\ b)" ) {
773804 // tiny_aya digit grouping pattern from tokenizer.json:
774805 // {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
0 commit comments