@@ -12,7 +12,7 @@ class JSONChunkProcessor:
1212
1313 Responsible for splitting JSON or plain-text content into chunks
1414 without breaking top-level key-value semantics when possible,
15- and without splitting escape sequences like \\ ", \ \ n, etc.
15+ and without splitting escape sequences like \" , \n , etc.
1616 """
1717
1818 def __init__ (self , max_characters : int ):
@@ -40,18 +40,10 @@ def split(self, file_data: bytes) -> List[str]:
4040 try :
4141 data = orjson .loads (file_data )
4242 except orjson .JSONDecodeError :
43- return self ._split_plain (
44- file_data .decode ("utf-8" , errors = "ignore" )
45- )
43+ return self ._split_plain (self ._to_text (file_data ))
4644 except TypeError :
4745 try :
48- if isinstance (file_data , (bytes , bytearray )):
49- text_content = file_data .decode ("utf-8" , errors = "ignore" )
50- elif isinstance (file_data , str ):
51- text_content = file_data
52- else :
53- text_content = str (file_data )
54- return self ._split_plain (text_content )
46+ return self ._split_plain (self ._to_text (file_data ))
5547
5648 except Exception as inner_e :
5749 logger .error (
@@ -61,8 +53,7 @@ def split(self, file_data: bytes) -> List[str]:
6153 except Exception as e :
6254 logger .error (f"Unexpected error while parsing JSON: { e } " )
6355 return self ._split_plain (
64- file_data .decode (
65- "utf-8" , errors = "ignore" ) if isinstance (file_data , bytes ) else str (file_data )
56+ self ._to_text (file_data )
6657 )
6758
6859 def dump (v ): return orjson .dumps (v ).decode ("utf-8" )
@@ -91,7 +82,7 @@ def _split_plain(self, text: str) -> List[str]:
9182 """
9283 out : List [str ] = []
9384 all_punct = set (string .punctuation )
94- opening_punct = set ("([{<'\" ‘“ " )
85+ opening_punct = set ("([{<'\" " )
9586 SAFE_BREAKS = (all_punct - opening_punct ) | {" " }
9687
9788 while len (text ) > self ._max :
@@ -136,7 +127,7 @@ def _split_json_text(self, text: str) -> List[str]:
136127 while len (cur ) > self ._max :
137128 cut = self ._find_last_top_kv (cur , self ._max )
138129 if cut is None :
139- # No safe top-level cut → use plain splitter (with escape safety)
130+ # No safe top-level cut -> use plain splitter (with escape safety)
140131 return out + self ._split_plain (cur )
141132
142133 chunk = cur [:cut ]
@@ -186,7 +177,7 @@ def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
186177 # Process structural characters only outside strings
187178 if c in "{[" :
188179 depth += 1
189- elif c in "}] " :
180+ elif c in "]} " :
190181 depth -= 1
191182 elif c == ',' and depth == 1 :
192183 candidate = i + 1
@@ -196,12 +187,27 @@ def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
196187
197188 return last_safe_cut
198189
@staticmethod
def _to_text(file_data: object) -> str:
    """Coerce raw input into text for the plain-text splitter.

    Args:
        file_data: The payload to convert; may be ``str``, a bytes-like
            object (``bytes``, ``bytearray``, ``memoryview``) or any
            other object.

    Returns:
        ``str`` input unchanged; bytes-like input decoded as UTF-8 with
        undecodable bytes dropped; ``str(file_data)`` for anything else.
    """
    if isinstance(file_data, str):
        return file_data
    if isinstance(file_data, memoryview):
        # memoryview has no .decode(), and str() of one is only its repr
        # ("<memory at 0x...>"), so copy the bytes out before decoding.
        file_data = bytes(file_data)
    if isinstance(file_data, (bytes, bytearray)):
        # Best-effort fallback: drop invalid bytes instead of raising.
        return file_data.decode("utf-8", errors="ignore")
    return str(file_data)
197+
199198 @staticmethod
200199 def _ends_with_unescaped_backslash (s : str ) -> bool :
201200 """
202201 Check if the string ends with an odd number of consecutive backslashes.
203202 If so, the last backslash is escaping the next character (which isn't in s),
204203 so cutting here would break an escape sequence.
204+
205+ Args:
206+ s: The string to check.
207+
208+ Returns:
209+ True if the string ends with an unescaped backslash (odd count),
210+ False otherwise.
205211 """
206212 count = 0
207213 for char in reversed (s ):
0 commit comments