11from typing import List
2+ import string
3+ import orjson
4+ import logging
5+
6+ logger = logging .getLogger (__name__ )
27
38
49class JSONChunkProcessor :
510 """
611 JSON-aware chunk processor.
712
813 Responsible for splitting JSON or plain-text content into chunks
9- without breaking top-level key-value semantics when possible.
14+ without breaking top-level key-value semantics when possible,
15+ and without splitting escape sequences like \\ ", \\ n, etc.
1016 """
1117
1218 def __init__ (self , max_characters : int ):
@@ -31,14 +37,33 @@ def split(self, file_data: bytes) -> List[str]:
3137 Returns:
3238 List of text chunks
3339 """
34- import orjson
35-
3640 try :
3741 data = orjson .loads (file_data )
38- except Exception :
42+ except orjson . JSONDecodeError :
3943 return self ._split_plain (
4044 file_data .decode ("utf-8" , errors = "ignore" )
4145 )
46+ except TypeError :
47+ try :
48+ if isinstance (file_data , (bytes , bytearray )):
49+ text_content = file_data .decode ("utf-8" , errors = "ignore" )
50+ elif isinstance (file_data , str ):
51+ text_content = file_data
52+ else :
53+ text_content = str (file_data )
54+ return self ._split_plain (text_content )
55+
56+ except Exception as inner_e :
57+ logger .error (
58+ f"Failed to fallback to plain text due to: { inner_e } " )
59+ return []
60+
61+ except Exception as e :
62+ logger .error (f"Unexpected error while parsing JSON: { e } " )
63+ return self ._split_plain (
64+ file_data .decode (
65+ "utf-8" , errors = "ignore" ) if isinstance (file_data , bytes ) else str (file_data )
66+ )
4267
4368 def dump (v ): return orjson .dumps (v ).decode ("utf-8" )
4469 chunks : List [str ] = []
@@ -65,15 +90,30 @@ def _split_plain(self, text: str) -> List[str]:
6590 List of text chunks
6691 """
6792 out : List [str ] = []
68- PUNCTS = set (",.(){}[],。\" ' " )
93+ all_punct = set (string .punctuation )
94+ opening_punct = set ("([{<'\" ‘“" )
95+ SAFE_BREAKS = (all_punct - opening_punct ) | {" " }
6996
7097 while len (text ) > self ._max :
7198 i = self ._max
72- while i > 0 and text [i - 1 ] not in PUNCTS :
99+
100+ while i > 0 and text [i - 1 ] not in SAFE_BREAKS :
101+ i -= 1
102+
103+ if i == 0 :
104+ i = self ._max
105+
106+ while i > 0 and self ._ends_with_unescaped_backslash (text [:i ]):
73107 i -= 1
74- i = i or self ._max
75- out .append (text [:i ])
76- text = text [i :]
108+ if i <= 1 :
109+ break
110+
111+ if i == 0 :
112+ i = 1
113+
114+ chunk = text [:i ]
115+ text = text [i :].lstrip ()
116+ out .append (chunk )
77117
78118 if text :
79119 out .append (text )
@@ -94,19 +134,21 @@ def _split_json_text(self, text: str) -> List[str]:
94134 cur = text
95135
96136 while len (cur ) > self ._max :
97- cut = self ._find_last_top_kv (cur [: self ._max ] )
137+ cut = self ._find_last_top_kv (cur , self ._max )
98138 if cut is None :
139+ # No safe top-level cut → use plain splitter (with escape safety)
99140 return out + self ._split_plain (cur )
100141
101- out .append (cur [:cut ])
102- cur = cur [cut :]
142+ chunk = cur [:cut ]
143+ cur = cur [cut :].lstrip ()
144+ out .append (chunk )
103145
104146 if cur :
105147 out .append (cur )
106148
107149 return out
108150
109- def _find_last_top_kv (self , text : str ) -> int | None :
151+ def _find_last_top_kv (self , text : str , max_len : int ) -> int | None :
110152 """
111153 Find the split position of the last top-level key-value pair.
112154
@@ -120,27 +162,51 @@ def _find_last_top_kv(self, text: str) -> int | None:
120162 depth = 0
121163 in_str = False
122164 esc = False
165+ last_safe_cut = None
123166
124- for i in range (len (text ) - 1 , - 1 , - 1 ):
125- c = text [i ]
167+ for i , c in enumerate (text ):
168+ if i >= max_len :
169+ break
126170
127171 if esc :
128172 esc = False
129173 continue
174+
130175 if c == "\\ " :
131176 esc = True
132177 continue
178+
133179 if c == '"' :
134180 in_str = not in_str
135181 continue
182+
136183 if in_str :
137184 continue
138185
139- if c in "}]" :
186+ # Process structural characters only outside strings
187+ if c in "{[" :
140188 depth += 1
141- elif c in "{[ " :
189+ elif c in "}] " :
142190 depth -= 1
143- elif c == "," and depth == 1 :
144- return i + 1
191+ elif c == ',' and depth == 1 :
192+ candidate = i + 1
193+ # Only accept if prefix doesn't end with unescaped backslash
194+ if not self ._ends_with_unescaped_backslash (text [:candidate ]):
195+ last_safe_cut = candidate
196+
197+ return last_safe_cut
145198
146- return None
199+ @staticmethod
200+ def _ends_with_unescaped_backslash (s : str ) -> bool :
201+ """
202+ Check if the string ends with an odd number of consecutive backslashes.
203+ If so, the last backslash is escaping the next character (which isn't in s),
204+ so cutting here would break an escape sequence.
205+ """
206+ count = 0
207+ for char in reversed (s ):
208+ if char == '\\ ' :
209+ count += 1
210+ else :
211+ break
212+ return count % 2 == 1
0 commit comments