Skip to content

Commit e75c3b5

Browse files
committed
✨ Add support for new filetypes
1 parent 2dfd444 commit e75c3b5

File tree

3 files changed

+88
-22
lines changed

3 files changed

+88
-22
lines changed

backend/database/attachment_db.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def get_content_type(file_path: str) -> str:
272272
'.html': 'text/html',
273273
'.htm': 'text/html',
274274
'.json': 'application/json',
275-
'.epub': 'application/epuub',
275+
'.epub': 'application/epub+zip',
276276
'.xml': 'application/xml',
277277
'.zip': 'application/zip',
278278
'.rar': 'application/x-rar-compressed',

frontend/const/knowledgeBase.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ export const EXTENSION_TO_TYPE_MAP = {
156156
[FILE_EXTENSIONS.TXT]: FILE_TYPES.TEXT,
157157
[FILE_EXTENSIONS.MD]: FILE_TYPES.MARKDOWN,
158158
[FILE_EXTENSIONS.CSV]: FILE_TYPES.CSV,
159-
[FILE_EXTENSIONS.JSON]: FILE_EXTENSIONS.JSON,
159+
[FILE_EXTENSIONS.JSON]: FILE_TYPES.JSON,
160160
[FILE_EXTENSIONS.HTML]: FILE_TYPES.HTML,
161161
[FILE_EXTENSIONS.XML]: FILE_TYPES.XML,
162162
[FILE_EXTENSIONS.EPUB]: FILE_TYPES.EPUB

sdk/nexent/data_process/json_chunk_processor.py

Lines changed: 86 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
from typing import List
2+
import string
3+
import orjson
4+
import logging
5+
6+
logger = logging.getLogger(__name__)
27

38

49
class JSONChunkProcessor:
510
"""
611
JSON-aware chunk processor.
712
813
Responsible for splitting JSON or plain-text content into chunks
9-
without breaking top-level key-value semantics when possible.
14+
without breaking top-level key-value semantics when possible,
15+
and without splitting escape sequences like \\", \\n, etc.
1016
"""
1117

1218
def __init__(self, max_characters: int):
@@ -31,14 +37,33 @@ def split(self, file_data: bytes) -> List[str]:
3137
Returns:
3238
List of text chunks
3339
"""
34-
import orjson
35-
3640
try:
3741
data = orjson.loads(file_data)
38-
except Exception:
42+
except orjson.JSONDecodeError:
3943
return self._split_plain(
4044
file_data.decode("utf-8", errors="ignore")
4145
)
46+
except TypeError:
47+
try:
48+
if isinstance(file_data, (bytes, bytearray)):
49+
text_content = file_data.decode("utf-8", errors="ignore")
50+
elif isinstance(file_data, str):
51+
text_content = file_data
52+
else:
53+
text_content = str(file_data)
54+
return self._split_plain(text_content)
55+
56+
except Exception as inner_e:
57+
logger.error(
58+
f"Failed to fallback to plain text due to: {inner_e}")
59+
return []
60+
61+
except Exception as e:
62+
logger.error(f"Unexpected error while parsing JSON: {e}")
63+
return self._split_plain(
64+
file_data.decode(
65+
"utf-8", errors="ignore") if isinstance(file_data, bytes) else str(file_data)
66+
)
4267

4368
def dump(v): return orjson.dumps(v).decode("utf-8")
4469
chunks: List[str] = []
@@ -65,15 +90,30 @@ def _split_plain(self, text: str) -> List[str]:
6590
List of text chunks
6691
"""
6792
out: List[str] = []
68-
PUNCTS = set(",.(){}[],。\"' ")
93+
all_punct = set(string.punctuation)
94+
opening_punct = set("([{<'\"‘“")
95+
SAFE_BREAKS = (all_punct - opening_punct) | {" "}
6996

7097
while len(text) > self._max:
7198
i = self._max
72-
while i > 0 and text[i - 1] not in PUNCTS:
99+
100+
while i > 0 and text[i - 1] not in SAFE_BREAKS:
101+
i -= 1
102+
103+
if i == 0:
104+
i = self._max
105+
106+
while i > 0 and self._ends_with_unescaped_backslash(text[:i]):
73107
i -= 1
74-
i = i or self._max
75-
out.append(text[:i])
76-
text = text[i:]
108+
if i <= 1:
109+
break
110+
111+
if i == 0:
112+
i = 1
113+
114+
chunk = text[:i]
115+
text = text[i:].lstrip()
116+
out.append(chunk)
77117

78118
if text:
79119
out.append(text)
@@ -94,19 +134,21 @@ def _split_json_text(self, text: str) -> List[str]:
94134
cur = text
95135

96136
while len(cur) > self._max:
97-
cut = self._find_last_top_kv(cur[: self._max])
137+
cut = self._find_last_top_kv(cur, self._max)
98138
if cut is None:
139+
# No safe top-level cut → use plain splitter (with escape safety)
99140
return out + self._split_plain(cur)
100141

101-
out.append(cur[:cut])
102-
cur = cur[cut:]
142+
chunk = cur[:cut]
143+
cur = cur[cut:].lstrip()
144+
out.append(chunk)
103145

104146
if cur:
105147
out.append(cur)
106148

107149
return out
108150

109-
def _find_last_top_kv(self, text: str) -> int | None:
151+
def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
110152
"""
111153
Find the split position of the last top-level key-value pair.
112154
@@ -120,27 +162,51 @@ def _find_last_top_kv(self, text: str) -> int | None:
120162
depth = 0
121163
in_str = False
122164
esc = False
165+
last_safe_cut = None
123166

124-
for i in range(len(text) - 1, -1, -1):
125-
c = text[i]
167+
for i, c in enumerate(text):
168+
if i >= max_len:
169+
break
126170

127171
if esc:
128172
esc = False
129173
continue
174+
130175
if c == "\\":
131176
esc = True
132177
continue
178+
133179
if c == '"':
134180
in_str = not in_str
135181
continue
182+
136183
if in_str:
137184
continue
138185

139-
if c in "}]":
186+
# Process structural characters only outside strings
187+
if c in "{[":
140188
depth += 1
141-
elif c in "{[":
189+
elif c in "}]":
142190
depth -= 1
143-
elif c == "," and depth == 1:
144-
return i + 1
191+
elif c == ',' and depth == 1:
192+
candidate = i + 1
193+
# Only accept if prefix doesn't end with unescaped backslash
194+
if not self._ends_with_unescaped_backslash(text[:candidate]):
195+
last_safe_cut = candidate
196+
197+
return last_safe_cut
145198

146-
return None
199+
@staticmethod
200+
def _ends_with_unescaped_backslash(s: str) -> bool:
201+
"""
202+
Check if the string ends with an odd number of consecutive backslashes.
203+
If so, the last backslash is escaping the next character (which isn't in s),
204+
so cutting here would break an escape sequence.
205+
"""
206+
count = 0
207+
for char in reversed(s):
208+
if char == '\\':
209+
count += 1
210+
else:
211+
break
212+
return count % 2 == 1

0 commit comments

Comments
 (0)