fix: ndjson file type detection (#4349)

badGarnet · web-flow · commit 8daa154ae4b3 · 2026-05-05T13:46:11.000Z
This PR fixes a bug where NDJSON detection misclassifies multi-line, single-object JSON files as NDJSON.

---

> [!NOTE]
> **Medium Risk**
> Changes JSON/NDJSON detection heuristics and `detect_filetype` routing, which can affect which partitioner is invoked for JSON-like inputs. The logic is stricter and covered by new edge-case tests, but misclassification could still impact downstream parsing behavior.
>
> **Overview**
> Fixes NDJSON file-type detection so **multi-line single JSON objects** (including `.json` and `.ipynb` notebook payloads) are no longer misrouted to `partition_ndjson`, where they caused a crash.
>
> Updates `is_ndjson_processable` to require the *first line* to independently parse as a JSON object (with special handling for potentially truncated long single-line records), adds a bounded read helper (`json_disambiguation_text`) for disambiguation beyond the 4KB `text_head`, and changes JSON/NDJSON disambiguation to default to `FileType.JSON` when NDJSON criteria aren’t met.
>
> Adds a focused test suite for these NDJSON edge cases, bumps version to `0.22.27`, updates the changelog, and makes CI apt installs more resilient by wrapping them in a retry helper.
>
> <sup>Reviewed by [Cursor Bugbot](https://cursor.com/bugbot) for commit e82dbd7. Bugbot is set up for automated code reviews on this repo. Configure [here](https://www.cursor.com/dashboard/bugbot).</sup>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -85,11 +85,24 @@ jobs:
         UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
         TESSERACT_VERSION: "5.5.1"
       run: |
-        sudo apt-get update
-        sudo apt-get install -y libmagic-dev poppler-utils libreoffice
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get update
-        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
+        retry() {
+          local n=1 max=5 delay=15
+          while true; do
+            "$@" && return 0
+            if (( n >= max )); then
+              echo "Command failed after $n attempts: $*" >&2
+              return 1
+            fi
+            echo "Attempt $n/$max failed for: $*. Retrying in ${delay}s..." >&2
+            sleep "$delay"
+            n=$((n+1))
+          done
+        }
+        retry sudo apt-get update
+        retry sudo apt-get install -y libmagic-dev poppler-utils libreoffice
+        retry sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+        retry sudo apt-get update -o "APT::Update::Error-Mode=any"
+        retry sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
         tesseract --version
         installed_tesseract_version=$(tesseract --version | grep -oP '(?<=tesseract )\d+\.\d+\.\d+')
         if [ "$installed_tesseract_version" != "${{env.TESSERACT_VERSION}}" ]; then
@@ -161,11 +174,24 @@ jobs:
         uv sync --locked ${{ matrix.uv-extras }} --group test
     - name: Install system dependencies
       run: |
-        sudo apt-get update
-        sudo apt-get install -y libmagic-dev poppler-utils libreoffice
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get update
-        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
+        retry() {
+          local n=1 max=5 delay=15
+          while true; do
+            "$@" && return 0
+            if (( n >= max )); then
+              echo "Command failed after $n attempts: $*" >&2
+              return 1
+            fi
+            echo "Attempt $n/$max failed for: $*. Retrying in ${delay}s..." >&2
+            sleep "$delay"
+            n=$((n+1))
+          done
+        }
+        retry sudo apt-get update
+        retry sudo apt-get install -y libmagic-dev poppler-utils libreoffice
+        retry sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+        retry sudo apt-get update -o "APT::Update::Error-Mode=any"
+        retry sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
         tesseract --version
     - name: Test
       env:
@@ -237,13 +263,26 @@ jobs:
         OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
         CI: "true"
       run: |
-        sudo apt-get update
-        sudo apt-get install -y libmagic-dev poppler-utils libreoffice
-        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
-        sudo apt-get update
-        sudo apt-get install -y tesseract-ocr
-        sudo apt-get install -y tesseract-ocr-kor
-        sudo apt-get install diffstat
+        retry() {
+          local n=1 max=5 delay=15
+          while true; do
+            "$@" && return 0
+            if (( n >= max )); then
+              echo "Command failed after $n attempts: $*" >&2
+              return 1
+            fi
+            echo "Attempt $n/$max failed for: $*. Retrying in ${delay}s..." >&2
+            sleep "$delay"
+            n=$((n+1))
+          done
+        }
+        retry sudo apt-get update
+        retry sudo apt-get install -y libmagic-dev poppler-utils libreoffice
+        retry sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
+        retry sudo apt-get update -o "APT::Update::Error-Mode=any"
+        retry sudo apt-get install -y tesseract-ocr
+        retry sudo apt-get install -y tesseract-ocr-kor
+        retry sudo apt-get install -y diffstat
         tesseract --version
         uv run --no-sync ./test_unstructured_ingest/test-ingest-src.sh
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.22.27
+
+### Fixes
+
+- **Stop misclassifying multi-line JSON files as NDJSON**: `is_ndjson_processable` previously returned `True` for any text starting with `{`, so `.json` and `.ipynb` files containing a single multi-line JSON object (e.g. Jupyter notebooks) were routed to `partition_ndjson`, which then crashed in its `splitlines()`-based parser.
+
 ## 0.22.26
 
 ### Enhancements
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -26,6 +26,7 @@
     _ZipFileDetector,
     detect_filetype,
     is_json_processable,
+    is_ndjson_processable,
 )
 from unstructured.file_utils.model import FileType, create_file_type
 
@@ -538,6 +539,96 @@ def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
     assert is_json_processable(file_text=text) is False
 
 
+# ================================================================================================
+# Describe `is_ndjson_processable()`
+# ================================================================================================
+
+
+def it_recognizes_real_ndjson_with_multiple_object_lines():
+    assert is_ndjson_processable(file_text='{"a": 1}\n{"b": 2}\n') is True
+
+
+def it_recognizes_single_line_ndjson_with_trailing_newline():
+    assert is_ndjson_processable(file_text='{"a": 1}\n{"b": 2}') is True
+
+
+def it_rejects_a_multiline_single_json_object():
+    # The bug: was True; now must be False so partition_ndjson does not get this payload.
+    text = '{\n  "id": "Sample-1",\n  "name": "Sample 1"\n}'
+    assert is_ndjson_processable(file_text=text) is False
+
+
+def it_accepts_a_single_line_json_object_as_one_record_ndjson():
+    """A single-line JSON object is a valid 1-record NDJSON payload.
+
+    `partition_ndjson` parses it via `splitlines()` and yields one record. Existing callers
+    rely on this; only multi-line single objects are pathological.
+    """
+    assert is_ndjson_processable(file_text='{"a": 1}') is True
+
+
+def it_rejects_a_json_array_of_objects():
+    assert is_ndjson_processable(file_text='[{"a": 1}, {"b": 2}]') is False
+
+
+def it_rejects_whitespace_only():
+    assert is_ndjson_processable(file_text="   \n  ") is False
+
+
+def it_rejects_garbage_text():
+    assert is_ndjson_processable(file_text="not json at all") is False
+
+
+def it_rejects_a_jupyter_notebook_payload():
+    """Jupyter notebooks are a single multi-line JSON object — must not route to NDJSON."""
+    notebook_text = (
+        "{\n"
+        ' "cells": [],\n'
+        ' "metadata": {"kernelspec": {"name": "python3"}},\n'
+        ' "nbformat": 4,\n'
+        ' "nbformat_minor": 5\n'
+        "}\n"
+    )
+    assert is_ndjson_processable(file_text=notebook_text) is False
+
+
+def it_rejects_ndjson_first_line_is_a_bare_value_not_an_object():
+    # NDJSON of bare values is uncommon and partition_ndjson expects dicts. Be strict.
+    assert is_ndjson_processable(file_text="1\n2\n3\n") is False
+
+
+def it_routes_not_unstructured_payload_json_away_from_ndjson_via_detect_filetype():
+    file_type = detect_filetype(example_doc_path("not-unstructured-payload.json"))
+    # A multi-line single-object JSON file used to get classified as NDJSON. It should now end up
+    # as JSON (and partition_json will reject it with the existing schema-mismatch error).
+    assert file_type == FileType.JSON
+
+
+def it_classifies_ndjson_correctly_when_first_record_exceeds_text_head_prefix():
+    """NDJSON whose first record is longer than the 4096-char text_head prefix.
+
+    `_disambiguate_json_file_type` reads past `text_head` to find the first newline, so the
+    heuristic must not rely on the first record fitting in the prefix. Both single-record and
+    multi-record cases are exercised — both must round-trip as `FileType.NDJSON`.
+    """
+    big_value = "x" * 5000
+    payload_one_record = json.dumps({"text": big_value, "type": "NarrativeText"}).encode()
+    payload_many_records = (
+        payload_one_record + b"\n" + json.dumps({"text": "tiny", "type": "Title"}).encode()
+    )
+
+    assert detect_filetype(file=io.BytesIO(payload_one_record)) == FileType.NDJSON
+    assert detect_filetype(file=io.BytesIO(payload_many_records)) == FileType.NDJSON
+    assert is_ndjson_processable(file=io.BytesIO(payload_one_record)) is True
+
+
+def it_classifies_multiline_json_as_json_when_first_newline_exceeds_text_head_prefix():
+    big_value = "x" * 5000
+    payload = ('{"text": "' + big_value + '",\n "type": "NarrativeText"\n}').encode()
+
+    assert detect_filetype(file=io.BytesIO(payload)) == FileType.JSON
+
+
 # ================================================================================================
 # MODULE-LEVEL FIXTURES
 # ================================================================================================
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.22.26"  # pragma: no cover
+__version__ = "0.22.27"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -38,7 +38,7 @@
 import tempfile
 import zipfile
 from functools import cached_property
-from typing import IO, Callable, Iterator, Optional
+from typing import IO, Callable, Iterator, Optional, cast
 
 import filetype as ft
 from olefile import OleFileIO
@@ -54,6 +54,9 @@
 from unstructured.partition.common.metadata import set_element_hierarchy
 from unstructured.utils import get_call_args_applying_defaults
 
+_JSON_DISAMBIGUATION_CHUNK_SIZE = 8192
+_JSON_DISAMBIGUATION_MAX_CHARS = 1024 * 1024
+
 try:
     importlib.import_module("magic")
     LIBMAGIC_AVAILABLE = True
@@ -136,19 +139,50 @@ def is_ndjson_processable(
     file: Optional[IO[bytes]] = None,
     file_text: Optional[str] = None,
     encoding: Optional[str] = "utf-8",
+    allow_truncated_single_line: bool = False,
 ) -> bool:
-    """True when file looks like a JSON array of objects.
+    """True when file looks like newline-delimited JSON objects.
 
-    Uses regex on a file prefix, so not entirely reliable but good enough if you already know the
-    file is JSON.
+    NDJSON is a sequence of one JSON value per line, conventionally an object on each line. A
+    payload that parses as a single JSON value (e.g. a multi-line `{...}` object or a `[...]`
+    array) is *not* NDJSON and must not be matched here, otherwise `partition_ndjson` will fail
+    later when it splits the text by lines and tries to parse each fragment.
     """
     exactly_one(filename=filename, file=file, file_text=file_text)
 
+    allow_truncated = allow_truncated_single_line
     if file_text is None:
-        file_text = _FileTypeDetectionContext.new(
+        file_text, allow_truncated = _FileTypeDetectionContext.new(
             file_path=filename, file=file, encoding=encoding
-        ).text_head
-    return file_text.lstrip().startswith("{")
+        ).json_disambiguation_text
+
+    text = file_text.lstrip()
+    if not text or not text.startswith("{"):
+        return False
+
+    newline_idx = text.find("\n")
+
+    if newline_idx == -1:
+        # Single-line input. A complete `{...}` parses as a dict and is treated as 1-record
+        # NDJSON (existing tests and `partition_ndjson` rely on this). When the caller knows this
+        # is a truncated first line from a JSON-like payload, a parse failure is still compatible
+        # with a long 1-record NDJSON payload.
+        try:
+            return isinstance(json.loads(text), dict)
+        except json.JSONDecodeError:
+            return allow_truncated
+
+    # Multi-line input. NDJSON requires each record to be on its own line, so the first line
+    # must independently parse as a JSON object. A pretty-printed single JSON object has its
+    # first line be just `{` (or similar fragment) which won't parse alone — that's how we
+    # distinguish it from real NDJSON.
+    first_line = text[:newline_idx].rstrip()
+    if not first_line:
+        return False
+    try:
+        return isinstance(json.loads(first_line), dict)
+    except json.JSONDecodeError:
+        return False
 
 
 class _FileTypeDetector:
@@ -224,12 +258,21 @@ def _file_type_from_content_type(self) -> FileType | None:
 
     @property
     def _disambiguate_json_file_type(self) -> FileType:
-        """Disambiguate JSON/NDJSON file-type based on file contents."""
-        if is_json_processable(file_text=self._ctx.text_head):
-            return FileType.JSON
-        if is_ndjson_processable(file_text=self._ctx.text_head):
+        """Disambiguate JSON/NDJSON file-type based on file contents.
+
+        NDJSON is detected first because it has the strictest signature (multiple JSON values
+        separated by newlines, with the first line independently parsable). Anything else that
+        libmagic flagged as JSON is classified as `FileType.JSON`; the JSON partitioner has its
+        own `is_json_processable` schema check and will reject non-conforming payloads with a
+        clear error.
+        """
+        file_text, allow_truncated_single_line = self._ctx.json_disambiguation_text
+        if is_ndjson_processable(
+            file_text=file_text,
+            allow_truncated_single_line=allow_truncated_single_line,
+        ):
             return FileType.NDJSON
-        raise ValueError("Unable to process JSON file")
+        return FileType.JSON
 
     @property
     def _file_type_from_guessed_mime_type(self) -> FileType | None:
@@ -553,13 +596,73 @@ def text_head(self) -> str:
             with open(file_path, encoding=encoding) as f:
                 return f.read(4096)
 
+    @cached_property
+    def json_disambiguation_text(self) -> tuple[str, bool]:
+        """Text prefix for JSON/NDJSON disambiguation and whether the first line was truncated."""
+
+        if file := self._file_arg:
+            file.seek(0)
+            content, first_line_truncated = self._read_until_newline_or_limit(file)
+            file.seek(0)
+            if isinstance(content, str):
+                return content, first_line_truncated
+            return content.decode(encoding=self.encoding, errors="ignore"), first_line_truncated
+
+        file_path = self.file_path
+        assert file_path is not None  # -- guaranteed by `._validate` --
+
+        try:
+            with open(file_path, encoding=self.encoding) as f:
+                content, first_line_truncated = self._read_until_newline_or_limit(f)
+                assert isinstance(content, str)
+                return content, first_line_truncated
+        except UnicodeDecodeError:
+            encoding, _ = detect_file_encoding(filename=file_path)
+            with open(file_path, encoding=encoding) as f:
+                content, first_line_truncated = self._read_until_newline_or_limit(f)
+                assert isinstance(content, str)
+                return content, first_line_truncated
+
     def _validate(self) -> None:
         """Raise if the context is invalid."""
         if self.file_path and not os.path.isfile(self.file_path):
             raise FileNotFoundError(f"no such file {self._file_path_arg}")
         if not self.file_path and not self._file_arg:
             raise ValueError("either `file_path` or `file` argument must be provided")
 
+    @staticmethod
+    def _read_until_newline_or_limit(file: IO) -> tuple[str | bytes, bool]:
+        """Read through the first newline, stopping at a bounded prefix if none is found."""
+        chunks: list[str | bytes] = []
+        chars_read = 0
+
+        while chars_read < _JSON_DISAMBIGUATION_MAX_CHARS:
+            chars_to_read = min(
+                _JSON_DISAMBIGUATION_CHUNK_SIZE,
+                _JSON_DISAMBIGUATION_MAX_CHARS - chars_read,
+            )
+            chunk = file.read(chars_to_read)
+            if not chunk:
+                return _FileTypeDetectionContext._join_text_chunks(chunks), False
+
+            newline = b"\n" if isinstance(chunk, bytes) else "\n"
+            newline_idx = chunk.find(newline)
+            if newline_idx != -1:
+                chunks.append(chunk[: newline_idx + 1])
+                return _FileTypeDetectionContext._join_text_chunks(chunks), False
+
+            chunks.append(chunk)
+            chars_read += len(chunk)
+
+        return _FileTypeDetectionContext._join_text_chunks(chunks), True
+
+    @staticmethod
+    def _join_text_chunks(chunks: list[str | bytes]) -> str | bytes:
+        """Join chunks without mixing text and bytes types."""
+        if chunks and isinstance(chunks[0], bytes):
+            return b"".join(cast(list[bytes], chunks))
+        return "".join(cast(list[str], chunks))
+
 
 class _OleFileDetector:
     """Detect and differentiate a CFB file, aka. "OLE" file.

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1 +1 @@` |
| `1` |  | `-__version__ = "0.22.26" # pragma: no cover` |
|  | `1` | `+__version__ = "0.22.27" # pragma: no cover` |