|
38 | 38 | import tempfile |
39 | 39 | import zipfile |
40 | 40 | from functools import cached_property |
41 | | -from typing import IO, Callable, Iterator, Optional |
| 41 | +from typing import IO, Callable, Iterator, Optional, cast |
42 | 42 |
|
43 | 43 | import filetype as ft |
44 | 44 | from olefile import OleFileIO |
|
54 | 54 | from unstructured.partition.common.metadata import set_element_hierarchy |
55 | 55 | from unstructured.utils import get_call_args_applying_defaults |
56 | 56 |
|
# -- size passed to each `read()` call while scanning for the first newline during JSON/NDJSON
# -- disambiguation --
_JSON_DISAMBIGUATION_CHUNK_SIZE = 8192
# -- upper bound on the total prefix read when no newline has been found --
_JSON_DISAMBIGUATION_MAX_CHARS = 1024 * 1024
| 59 | + |
57 | 60 | try: |
58 | 61 | importlib.import_module("magic") |
59 | 62 | LIBMAGIC_AVAILABLE = True |
@@ -136,19 +139,50 @@ def is_ndjson_processable( |
136 | 139 | file: Optional[IO[bytes]] = None, |
137 | 140 | file_text: Optional[str] = None, |
138 | 141 | encoding: Optional[str] = "utf-8", |
| 142 | + allow_truncated_single_line: bool = False, |
139 | 143 | ) -> bool: |
140 | | - """True when file looks like a JSON array of objects. |
| 144 | + """True when file looks like newline-delimited JSON objects. |
141 | 145 |
|
142 | | - Uses regex on a file prefix, so not entirely reliable but good enough if you already know the |
143 | | - file is JSON. |
| 146 | + NDJSON is a sequence of one JSON value per line, conventionally an object on each line. A |
| 147 | + payload that parses as a single JSON value (e.g. a multi-line `{...}` object or a `[...]` |
| 148 | + array) is *not* NDJSON and must not be matched here, otherwise `partition_ndjson` will fail |
| 149 | + later when it splits the text by lines and tries to parse each fragment. |
144 | 150 | """ |
145 | 151 | exactly_one(filename=filename, file=file, file_text=file_text) |
146 | 152 |
|
| 153 | + allow_truncated = allow_truncated_single_line |
147 | 154 | if file_text is None: |
148 | | - file_text = _FileTypeDetectionContext.new( |
| 155 | + file_text, allow_truncated = _FileTypeDetectionContext.new( |
149 | 156 | file_path=filename, file=file, encoding=encoding |
150 | | - ).text_head |
151 | | - return file_text.lstrip().startswith("{") |
| 157 | + ).json_disambiguation_text |
| 158 | + |
| 159 | + text = file_text.lstrip() |
| 160 | + if not text or not text.startswith("{"): |
| 161 | + return False |
| 162 | + |
| 163 | + newline_idx = text.find("\n") |
| 164 | + |
| 165 | + if newline_idx == -1: |
| 166 | + # Single-line input. A complete `{...}` parses as a dict and is treated as 1-record |
| 167 | + # NDJSON (existing tests and `partition_ndjson` rely on this). When the caller knows this |
| 168 | + # is a truncated first line from a JSON-like payload, a parse failure is still compatible |
| 169 | + # with a long 1-record NDJSON payload. |
| 170 | + try: |
| 171 | + return isinstance(json.loads(text), dict) |
| 172 | + except json.JSONDecodeError: |
| 173 | + return allow_truncated |
| 174 | + |
| 175 | + # Multi-line input. NDJSON requires each record to be on its own line, so the first line |
| 176 | + # must independently parse as a JSON object. A pretty-printed single JSON object has its |
| 177 | + # first line be just `{` (or similar fragment) which won't parse alone — that's how we |
| 178 | + # distinguish it from real NDJSON. |
| 179 | + first_line = text[:newline_idx].rstrip() |
| 180 | + if not first_line: |
| 181 | + return False |
| 182 | + try: |
| 183 | + return isinstance(json.loads(first_line), dict) |
| 184 | + except json.JSONDecodeError: |
| 185 | + return False |
152 | 186 |
|
153 | 187 |
|
154 | 188 | class _FileTypeDetector: |
@@ -224,12 +258,21 @@ def _file_type_from_content_type(self) -> FileType | None: |
224 | 258 |
|
225 | 259 | @property |
226 | 260 | def _disambiguate_json_file_type(self) -> FileType: |
227 | | - """Disambiguate JSON/NDJSON file-type based on file contents.""" |
228 | | - if is_json_processable(file_text=self._ctx.text_head): |
229 | | - return FileType.JSON |
230 | | - if is_ndjson_processable(file_text=self._ctx.text_head): |
| 261 | + """Disambiguate JSON/NDJSON file-type based on file contents. |
| 262 | +
|
| 263 | + NDJSON is detected first because it has the strictest signature (multiple JSON values |
| 264 | + separated by newlines, with the first line independently parsable). Anything else that |
| 265 | + libmagic flagged as JSON is classified as `FileType.JSON`; the JSON partitioner has its |
| 266 | + own `is_json_processable` schema check and will reject non-conforming payloads with a |
| 267 | + clear error. |
| 268 | + """ |
| 269 | + file_text, allow_truncated_single_line = self._ctx.json_disambiguation_text |
| 270 | + if is_ndjson_processable( |
| 271 | + file_text=file_text, |
| 272 | + allow_truncated_single_line=allow_truncated_single_line, |
| 273 | + ): |
231 | 274 | return FileType.NDJSON |
232 | | - raise ValueError("Unable to process JSON file") |
| 275 | + return FileType.JSON |
233 | 276 |
|
234 | 277 | @property |
235 | 278 | def _file_type_from_guessed_mime_type(self) -> FileType | None: |
@@ -553,13 +596,73 @@ def text_head(self) -> str: |
553 | 596 | with open(file_path, encoding=encoding) as f: |
554 | 597 | return f.read(4096) |
555 | 598 |
|
| 599 | + @cached_property |
| 600 | + def json_disambiguation_text(self) -> tuple[str, bool]: |
| 601 | + """Text prefix for JSON/NDJSON disambiguation and whether the first line was truncated.""" |
| 602 | + |
| 603 | + if file := self._file_arg: |
| 604 | + file.seek(0) |
| 605 | + content, first_line_truncated = self._read_until_newline_or_limit(file) |
| 606 | + file.seek(0) |
| 607 | + if isinstance(content, str): |
| 608 | + return content, first_line_truncated |
| 609 | + return content.decode(encoding=self.encoding, errors="ignore"), first_line_truncated |
| 610 | + |
| 611 | + file_path = self.file_path |
| 612 | + assert file_path is not None # -- guaranteed by `._validate` -- |
| 613 | + |
| 614 | + try: |
| 615 | + with open(file_path, encoding=self.encoding) as f: |
| 616 | + content, first_line_truncated = self._read_until_newline_or_limit(f) |
| 617 | + assert isinstance(content, str) |
| 618 | + return content, first_line_truncated |
| 619 | + except UnicodeDecodeError: |
| 620 | + encoding, _ = detect_file_encoding(filename=file_path) |
| 621 | + with open(file_path, encoding=encoding) as f: |
| 622 | + content, first_line_truncated = self._read_until_newline_or_limit(f) |
| 623 | + assert isinstance(content, str) |
| 624 | + return content, first_line_truncated |
| 625 | + |
556 | 626 | def _validate(self) -> None: |
557 | 627 | """Raise if the context is invalid.""" |
558 | 628 | if self.file_path and not os.path.isfile(self.file_path): |
559 | 629 | raise FileNotFoundError(f"no such file {self._file_path_arg}") |
560 | 630 | if not self.file_path and not self._file_arg: |
561 | 631 | raise ValueError("either `file_path` or `file` argument must be provided") |
562 | 632 |
|
| 633 | + @staticmethod |
| 634 | + def _read_until_newline_or_limit(file: IO) -> tuple[str | bytes, bool]: |
| 635 | + """Read through the first newline, stopping at a bounded prefix if none is found.""" |
| 636 | + chunks: list[str | bytes] = [] |
| 637 | + chars_read = 0 |
| 638 | + |
| 639 | + while chars_read < _JSON_DISAMBIGUATION_MAX_CHARS: |
| 640 | + chars_to_read = min( |
| 641 | + _JSON_DISAMBIGUATION_CHUNK_SIZE, |
| 642 | + _JSON_DISAMBIGUATION_MAX_CHARS - chars_read, |
| 643 | + ) |
| 644 | + chunk = file.read(chars_to_read) |
| 645 | + if not chunk: |
| 646 | + return _FileTypeDetectionContext._join_text_chunks(chunks), False |
| 647 | + |
| 648 | + newline = b"\n" if isinstance(chunk, bytes) else "\n" |
| 649 | + newline_idx = chunk.find(newline) |
| 650 | + if newline_idx != -1: |
| 651 | + chunks.append(chunk[: newline_idx + 1]) |
| 652 | + return _FileTypeDetectionContext._join_text_chunks(chunks), False |
| 653 | + |
| 654 | + chunks.append(chunk) |
| 655 | + chars_read += len(chunk) |
| 656 | + |
| 657 | + return _FileTypeDetectionContext._join_text_chunks(chunks), True |
| 658 | + |
| 659 | + @staticmethod |
| 660 | + def _join_text_chunks(chunks: list[str | bytes]) -> str | bytes: |
| 661 | + """Join chunks without mixing text and bytes types.""" |
| 662 | + if chunks and isinstance(chunks[0], bytes): |
| 663 | + return b"".join(cast(list[bytes], chunks)) |
| 664 | + return "".join(cast(list[str], chunks)) |
| 665 | + |
563 | 666 |
|
564 | 667 | class _OleFileDetector: |
565 | 668 | """Detect and differentiate a CFB file, aka. "OLE" file. |
|
0 commit comments