✨ Add support for new filetypes

yzAiden · yzAiden · commit e75c3b5759bf · 2026-03-02T18:11:55.000+08:00
diff --git a/backend/database/attachment_db.py b/backend/database/attachment_db.py
@@ -272,7 +272,7 @@ def get_content_type(file_path: str) -> str:
                   '.html': 'text/html',
                   '.htm': 'text/html',
                   '.json': 'application/json',
-                  '.epub': 'application/epuub',
+                  '.epub': 'application/epub',
                   '.xml': 'application/xml',
                   '.zip': 'application/zip',
                   '.rar': 'application/x-rar-compressed',
diff --git a/frontend/const/knowledgeBase.ts b/frontend/const/knowledgeBase.ts
@@ -156,7 +156,7 @@ export const EXTENSION_TO_TYPE_MAP = {
   [FILE_EXTENSIONS.TXT]: FILE_TYPES.TEXT,
   [FILE_EXTENSIONS.MD]: FILE_TYPES.MARKDOWN,
   [FILE_EXTENSIONS.CSV]: FILE_TYPES.CSV,
-  [FILE_EXTENSIONS.JSON]: FILE_EXTENSIONS.JSON,
+  [FILE_EXTENSIONS.JSON]: FILE_TYPES.JSON,
   [FILE_EXTENSIONS.HTML]: FILE_TYPES.HTML,
   [FILE_EXTENSIONS.XML]: FILE_TYPES.XML,
   [FILE_EXTENSIONS.EPUB]: FILE_TYPES.EPUB
diff --git a/sdk/nexent/data_process/json_chunk_processor.py b/sdk/nexent/data_process/json_chunk_processor.py
@@ -1,12 +1,18 @@
 from typing import List
+import string
+import orjson
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 class JSONChunkProcessor:
     """
     JSON-aware chunk processor.
 
     Responsible for splitting JSON or plain-text content into chunks
-    without breaking top-level key-value semantics when possible.
+    without breaking top-level key-value semantics when possible,
+    and without splitting escape sequences like \\", \\n, etc.
     """
 
     def __init__(self, max_characters: int):
@@ -31,14 +37,33 @@ def split(self, file_data: bytes) -> List[str]:
         Returns:
             List of text chunks
         """
-        import orjson
-
         try:
             data = orjson.loads(file_data)
-        except Exception:
+        except orjson.JSONDecodeError:
             return self._split_plain(
                 file_data.decode("utf-8", errors="ignore")
             )
+        except TypeError:
+            try:
+                if isinstance(file_data, (bytes, bytearray)):
+                    text_content = file_data.decode("utf-8", errors="ignore")
+                elif isinstance(file_data, str):
+                    text_content = file_data
+                else:
+                    text_content = str(file_data)
+                return self._split_plain(text_content)
+
+            except Exception as inner_e:
+                logger.error(
+                    f"Failed to fallback to plain text due to: {inner_e}")
+                return []
+
+        except Exception as e:
+            logger.error(f"Unexpected error while parsing JSON: {e}")
+            return self._split_plain(
+                file_data.decode(
+                    "utf-8", errors="ignore") if isinstance(file_data, bytes) else str(file_data)
+            )
 
         def dump(v): return orjson.dumps(v).decode("utf-8")
         chunks: List[str] = []
@@ -65,15 +90,30 @@ def _split_plain(self, text: str) -> List[str]:
             List of text chunks
         """
         out: List[str] = []
-        PUNCTS = set(",.(){}[]，。\"' ")
+        all_punct = set(string.punctuation)
+        opening_punct = set("([{<'\"‘“")
+        SAFE_BREAKS = (all_punct - opening_punct) | {" "}
 
         while len(text) > self._max:
             i = self._max
-            while i > 0 and text[i - 1] not in PUNCTS:
+
+            while i > 0 and text[i - 1] not in SAFE_BREAKS:
+                i -= 1
+
+            if i == 0:
+                i = self._max
+
+            while i > 0 and self._ends_with_unescaped_backslash(text[:i]):
                 i -= 1
-            i = i or self._max
-            out.append(text[:i])
-            text = text[i:]
+                if i <= 1:
+                    break
+
+            if i == 0:
+                i = 1
+
+            chunk = text[:i]
+            text = text[i:].lstrip()
+            out.append(chunk)
 
         if text:
             out.append(text)
@@ -94,19 +134,21 @@ def _split_json_text(self, text: str) -> List[str]:
         cur = text
 
         while len(cur) > self._max:
-            cut = self._find_last_top_kv(cur[: self._max])
+            cut = self._find_last_top_kv(cur, self._max)
             if cut is None:
+                # No safe top-level cut → use plain splitter (with escape safety)
                 return out + self._split_plain(cur)
 
-            out.append(cur[:cut])
-            cur = cur[cut:]
+            chunk = cur[:cut]
+            cur = cur[cut:].lstrip()
+            out.append(chunk)
 
         if cur:
             out.append(cur)
 
         return out
 
-    def _find_last_top_kv(self, text: str) -> int | None:
+    def _find_last_top_kv(self, text: str, max_len: int) -> int | None:
         """
         Find the split position of the last top-level key-value pair.
 
@@ -120,27 +162,51 @@ def _find_last_top_kv(self, text: str) -> int | None:
         depth = 0
         in_str = False
         esc = False
+        last_safe_cut = None
 
-        for i in range(len(text) - 1, -1, -1):
-            c = text[i]
+        for i, c in enumerate(text):
+            if i >= max_len:
+                break
 
             if esc:
                 esc = False
                 continue
+
             if c == "\\":
                 esc = True
                 continue
+
             if c == '"':
                 in_str = not in_str
                 continue
+
             if in_str:
                 continue
 
-            if c in "}]":
+            # Process structural characters only outside strings
+            if c in "{[":
                 depth += 1
-            elif c in "{[":
+            elif c in "}]":
                 depth -= 1
-            elif c == "," and depth == 1:
-                return i + 1
+            elif c == ',' and depth == 1:
+                candidate = i + 1
+                # Only accept if prefix doesn't end with unescaped backslash
+                if not self._ends_with_unescaped_backslash(text[:candidate]):
+                    last_safe_cut = candidate
+
+        return last_safe_cut
 
-        return None
+    @staticmethod
+    def _ends_with_unescaped_backslash(s: str) -> bool:
+        """
+        Check if the string ends with an odd number of consecutive backslashes.
+        If so, the last backslash is escaping the next character (which isn't in s),
+        so cutting here would break an escape sequence.
+        """
+        count = 0
+        for char in reversed(s):
+            if char == '\\':
+                count += 1
+            else:
+                break
+        return count % 2 == 1