Skip to content

Commit 0628b50

Browse files
committed
✨ Add support for new filetypes
1 parent 74c4bcd commit 0628b50

File tree

1 file changed

+22
-16
lines changed

1 file changed

+22
-16
lines changed

sdk/nexent/data_process/json_chunk_processor.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class JSONChunkProcessor:
1212
1313
Responsible for splitting JSON or plain-text content into chunks
1414
without breaking top-level key-value semantics when possible,
15-
and without splitting escape sequences like \\", \\n, etc.
15+
and without splitting escape sequences like \", \n, etc.
1616
"""
1717

1818
def __init__(self, max_characters: int):
@@ -40,18 +40,10 @@ def split(self, file_data: bytes) -> List[str]:
4040
try:
4141
data = orjson.loads(file_data)
4242
except orjson.JSONDecodeError:
43-
return self._split_plain(
44-
file_data.decode("utf-8", errors="ignore")
45-
)
43+
return self._split_plain(self._to_text(file_data))
4644
except TypeError:
4745
try:
48-
if isinstance(file_data, (bytes, bytearray)):
49-
text_content = file_data.decode("utf-8", errors="ignore")
50-
elif isinstance(file_data, str):
51-
text_content = file_data
52-
else:
53-
text_content = str(file_data)
54-
return self._split_plain(text_content)
46+
return self._split_plain(self._to_text(file_data))
5547

5648
except Exception as inner_e:
5749
logger.error(
@@ -61,8 +53,7 @@ def split(self, file_data: bytes) -> List[str]:
6153
except Exception as e:
6254
logger.error(f"Unexpected error while parsing JSON: {e}")
6355
return self._split_plain(
64-
file_data.decode(
65-
"utf-8", errors="ignore") if isinstance(file_data, bytes) else str(file_data)
56+
self._to_text(file_data)
6657
)
6758

6859
def dump(v): return orjson.dumps(v).decode("utf-8")
@@ -91,7 +82,7 @@ def _split_plain(self, text: str) -> List[str]:
9182
"""
9283
out: List[str] = []
9384
all_punct = set(string.punctuation)
94-
opening_punct = set("([{<'\"‘“")
85+
opening_punct = set("([{<'\"")
9586
SAFE_BREAKS = (all_punct - opening_punct) | {" "}
9687

9788
while len(text) > self._max:
@@ -136,7 +127,7 @@ def _split_json_text(self, text: str) -> List[str]:
136127
while len(cur) > self._max:
137128
cut = self._find_last_top_kv(cur, self._max)
138129
if cut is None:
139-
# No safe top-level cut use plain splitter (with escape safety)
130+
# No safe top-level cut -> use plain splitter (with escape safety)
140131
return out + self._split_plain(cur)
141132

142133
chunk = cur[:cut]
@@ -186,7 +177,7 @@ def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
186177
# Process structural characters only outside strings
187178
if c in "{[":
188179
depth += 1
189-
elif c in "}]":
180+
elif c in "]}":
190181
depth -= 1
191182
elif c == ',' and depth == 1:
192183
candidate = i + 1
@@ -196,12 +187,27 @@ def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
196187

197188
return last_safe_cut
198189

190+
@staticmethod
191+
def _to_text(file_data) -> str:
192+
if isinstance(file_data, (bytes, bytearray)):
193+
return file_data.decode("utf-8", errors="ignore")
194+
if isinstance(file_data, str):
195+
return file_data
196+
return str(file_data)
197+
199198
@staticmethod
200199
def _ends_with_unescaped_backslash(s: str) -> bool:
201200
"""
202201
Check if the string ends with an odd number of consecutive backslashes.
203202
If so, the last backslash is escaping the next character (which isn't in s),
204203
so cutting here would break an escape sequence.
204+
205+
Args:
206+
s: The string to check.
207+
208+
Returns:
209+
True if the string ends with an unescaped backslash (odd count),
210+
False otherwise.
205211
"""
206212
count = 0
207213
for char in reversed(s):

0 commit comments

Comments
 (0)