apecloud
diff --git a/aperag/docparser/chunking.py b/aperag/docparser/chunking.py
Lines changed: 72 additions & 3 deletions
diff --git a/aperag/index/document_parser.py b/aperag/index/document_parser.py
Lines changed: 20 additions & 3 deletions
diff --git a/aperag/service/document_service.py b/aperag/service/document_service.py
Lines changed: 47 additions & 13 deletions
diff --git a/aperag/views/main.py b/aperag/views/main.py
Lines changed: 3 additions & 1 deletion
@@ -41,8 +41,67 @@ def __init__(self, chunk_size: int, chunk_overlap: int, tokenizer: Callable[[str
 
     def __call__(self, parts: list[Part]) -> list[Part]:  # Pipeline: group the parts, merge title-only groups, then rechunk.
         groups = self._to_groups(parts)
+        groups = self._merge_consecutive_title_groups(groups)  # Fold runs of title-only groups together before rechunking.
         return self._rechunk(groups)
 
+    def _is_pure_title_group(self, group: Group) -> bool:
+        """A group is considered a pure title if it has a title and only one item."""
+        return group.title_level > 0 and len(group.items) == 1  # "Has a title" is tested as title_level > 0; presumably the single item is the title itself (confirm in _to_groups).
+
+    def _merge_consecutive_title_groups(self, groups: list[Group]) -> list[Group]:  # Fold each run of title-only groups, plus at most one following content group, into one group.
+        if not groups:
+            return []
+
+        new_groups: list[Group] = []  # Output list; the input list and its Group objects are not mutated.
+        i = 0
+        while i < len(groups):
+            current_group = groups[i]
+
+            if not self._is_pure_title_group(current_group):
+                new_groups.append(current_group)  # Groups that are not title-only pass through unchanged.
+                i += 1
+                continue
+
+            # Title-only group: start a merge run, copying its items so the original group is left untouched.
+            merged_items = list(current_group.items)
+            # A smaller title_level number means a higher-ranking title.
+            highest_level = current_group.title_level  # NOTE(review): never reassigned below, so it always equals current_group.title_level.
+
+            j = i + 1
+            # 1. Absorb the following title-only groups that rank at or below the first title.
+            while j < len(groups):
+                next_group = groups[j]
+                if not self._is_pure_title_group(next_group):
+                    break  # The first group that is not title-only ends the run of titles.
+
+                # Hierarchy check: a higher-ranking title (smaller level, e.g. an H2 after an H3) is not absorbed.
+                if next_group.title_level < highest_level:
+                    break
+
+                # Same or lower rank: append its items to the run.
+                merged_items.extend(next_group.items)
+                j += 1
+
+            # 2. After the run of titles, absorb at most one following content group.
+            if j < len(groups):
+                next_group = groups[j]
+                if not self._is_pure_title_group(next_group):
+                    if next_group.title_level == 0 or next_group.title_level >= current_group.title_level:  # Only untitled content (level 0) or content ranked at or below the first title.
+                        merged_items.extend(next_group.items)
+                        j += 1  # Consume the content group as well.
+
+            # Build the merged group.
+            # It keeps the title and title_level of the first group in the run.
+            new_group = Group(
+                title_level=current_group.title_level,
+                title=current_group.title,
+                items=merged_items,
+            )
+            new_groups.append(new_group)
+            i = j  # Resume at the first group that was not consumed.
+
+        return new_groups
+
     def _to_groups(self, parts: list[Part]) -> list[Group]:
         result: list[Group] = []
         curr_group: Group | None = None
@@ -120,27 +179,37 @@ def _rechunk(self, groups: list[Group]) -> list[Part]:
                     # If the single part is too large, split it into smaller chunks
                     splitter = SimpleSemanticSplitter(self.tokenizer)
                     chunks = splitter.split(part.content, self.chunk_size, self.chunk_overlap)
-                    metadata = part.metadata
+                    metadata = part.metadata.copy()
                     metadata.pop("tokens", None)
+                    metadata["splitted"] = True
                     for chunk in chunks:
                         parts.append(Part(content=chunk, metadata=metadata.copy()))
                 else:
                     parts.append(part)
 
             # Rechunk the parts
             assert last_part is None
-            highest_level_in_last_part = group.title_level  # All parts are in the same group
             tokens_sum = 0
+            prev_part_splitted = False
             for part in parts:
+                curr_part_splitted = part.metadata.get("splitted", False)
                 tokens = self._count_tokens(part)
-                if tokens_sum + tokens > self.chunk_size:
+                # Start a new chunk when adding this part would exceed chunk_size, or when the previous part was a split fragment and this one is not.
+                if tokens_sum + tokens > self.chunk_size or (prev_part_splitted and not curr_part_splitted):
                     if last_part is not None:
                         result.append(last_part)
                         last_part = None
                         tokens_sum = 0
 
                 last_part = self._append_part_to_part(part, last_part, titles)
                 tokens_sum += tokens
+                prev_part_splitted = curr_part_splitted
+
+            # Flush at the end of every group so the next group is never merged into this group's trailing chunk.
+            if last_part is not None:
+                result.append(last_part)
+                last_part = None
+                highest_level_in_last_part = None
 
         if last_part is not None:
             result.append(last_part)
 
@@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import io
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
+import pikepdf
+
+from aperag.docparser.base import AssetBinPart, MarkdownPart, PdfPart
 from aperag.docparser.doc_parser import DocParser
 from aperag.objectstore.base import get_object_store
 
@@ -61,9 +65,22 @@ def parse_document(
             raise ValueError(f"unsupported file type: {filepath_obj.suffix}")
 
         parts = parser.parse_file(filepath_obj, file_metadata)
+
+        # If there are no PdfPart in parts and the doc is a pdf, then add the doc itself as a PdfPart
+        if filepath_obj.suffix.lower() == ".pdf":
+            if not any(isinstance(p, PdfPart) for p in parts):
+                with open(filepath_obj, "rb") as f:
+                    parts.append(PdfPart(data=f.read()))
+
         logger.info(f"Parsed document {filepath} into {len(parts)} parts")
         return parts
 
+    def linearize_pdf(self, data: bytes) -> bytes:  # Re-save the given PDF bytes with pikepdf's linearize option and return the resulting bytes.
+        with pikepdf.open(io.BytesIO(data)) as pdf:  # The PDF is read from an in-memory buffer, not from a path.
+            with io.BytesIO() as buffer:
+                pdf.save(buffer, linearize=True)
+                return buffer.getvalue()  # Take the bytes out before the with block closes the buffer.
+
     def save_processed_content_and_assets(self, doc_parts: List[Any], object_store_base_path: Optional[str]) -> str:
         """
         Save processed content and assets to object storage.
@@ -78,7 +95,6 @@ def save_processed_content_and_assets(self, doc_parts: List[Any], object_store_b
         Raises:
             Exception: If object storage operations fail
         """
-        from aperag.docparser.base import AssetBinPart, MarkdownPart, PdfPart
 
         content = ""
 
@@ -104,8 +120,9 @@ def save_processed_content_and_assets(self, doc_parts: List[Any], object_store_b
 
             if pdf_part is not None:
                 converted_pdf_upload_path = f"{base_path}/converted.pdf"
-                obj_store.put(converted_pdf_upload_path, pdf_part.data)
-                logger.info(f"uploaded converted pdf to {md_upload_path}, size: {len(pdf_part.data)}")
+                linearized_pdf_data = self.linearize_pdf(pdf_part.data)
+                obj_store.put(converted_pdf_upload_path, linearized_pdf_data)
+                logger.info(f"uploaded converted pdf to {converted_pdf_upload_path}, size: {len(linearized_pdf_data)}")
 
             # Save assets
             asset_count = 0
 
@@ -16,6 +16,7 @@
 import logging
 import mimetypes
 import os
+import re
 from typing import List
 
 from fastapi import HTTPException, UploadFile
@@ -575,14 +576,14 @@ async def _get_document_preview(session):
 
             converted_pdf_object_path = None
             index_data = json.loads(doc_index.index_data) if doc_index and doc_index.index_data else {}
-            if index_data.get("has_pdf_source_map") and not document.name.lower().endswith(".pdf"):
+            if index_data.get("has_pdf_source_map"):
                 # If the parsing result contains pdf_source_map metadata,
                 # it means it is a PDF or has been converted to a PDF.
-                # But only converted documents have a converted.pdf file.
-                pdf_path = f"{document.object_store_base_path()}/converted.pdf"
+                converted_pdf_name = "converted.pdf"
+                pdf_path = f"{document.object_store_base_path()}/{converted_pdf_name}"
                 exists = await async_obj_store.obj_exists(pdf_path)
                 if exists:
-                    converted_pdf_object_path = "converted.pdf"
+                    converted_pdf_object_path = converted_pdf_name
 
             # 5. Construct and return response
             return DocumentPreview(
@@ -596,9 +597,12 @@ async def _get_document_preview(session):
         # Execute query with proper session management
         return await self.db_ops._execute_query(_get_document_preview)
 
-    async def get_document_object(self, user_id: str, collection_id: str, document_id: str, path: str):
+    async def get_document_object(
+        self, user_id: str, collection_id: str, document_id: str, path: str, range_header: str = None
+    ):
         """
         Get a file object associated with a document from the object store.
+        Supports HTTP Range requests.
         """
 
         # Use database operations with proper session management
@@ -622,19 +626,49 @@ async def _get_document_object(session):
             # 2. Get the object from object store
             try:
                 async_obj_store = get_async_object_store()
-                get_obj_result = await async_obj_store.get(full_path)
+                headers = {"Accept-Ranges": "bytes"}
+                content_type, _ = mimetypes.guess_type(full_path)
+                if content_type is None:
+                    content_type = "application/octet-stream"
+                headers["Content-Type"] = content_type
+
+                if range_header:
+                    # For range requests, we need the total size first.
+                    total_size = await async_obj_store.get_obj_size(full_path)
+                    if total_size is None:
+                        raise HTTPException(status_code=404, detail="Object not found at specified path")
+
+                    range_match = re.match(r"bytes=(\d+)-(\d*)", range_header)
+                    if not range_match:
+                        raise HTTPException(status_code=400, detail="Invalid range header format")
+
+                    start_byte = int(range_match.group(1))
+                    end_byte_str = range_match.group(2)
+                    end_byte = int(end_byte_str) if end_byte_str else total_size - 1
+
+                    if start_byte >= total_size or end_byte >= total_size or start_byte > end_byte:
+                        headers["Content-Range"] = f"bytes */{total_size}"
+                        raise HTTPException(status_code=416, headers=headers, detail="Requested range not satisfiable")
 
+                    # Use stream_range to get the partial content
+                    range_result = await async_obj_store.stream_range(full_path, start=start_byte, end=end_byte)
+                    if not range_result:
+                        raise HTTPException(status_code=404, detail="Object not found at specified path")
+
+                    data_stream, content_length = range_result
+                    headers["Content-Range"] = f"bytes {start_byte}-{end_byte}/{total_size}"
+                    headers["Content-Length"] = str(content_length)
+                    return StreamingResponse(data_stream, status_code=206, headers=headers)
+
+                # Full content response - optimized to use size from get()
+                get_obj_result = await async_obj_store.get(full_path)
                 if not get_obj_result:
                     raise HTTPException(status_code=404, detail="Object not found at specified path")
 
-                data_stream, _ = get_obj_result
-
-                # 3. Stream the response
-                content_type, _ = mimetypes.guess_type(full_path)
-                if content_type is None:
-                    content_type = "application/octet-stream"
+                data_stream, file_size = get_obj_result
+                headers["Content-Length"] = str(file_size)
+                return StreamingResponse(data_stream, headers=headers)
 
-                return StreamingResponse(data_stream, media_type=content_type)
             except Exception as e:
                 logger.error(f"Failed to get object for document {document_id} at path {full_path}: {e}", exc_info=True)
                 raise HTTPException(status_code=500, detail="Failed to get object from store")
 
@@ -178,12 +178,14 @@ async def get_document_preview(
     operation_id="get_document_object",
 )
 async def get_document_object(
+    request: Request,
     collection_id: str,
     document_id: str,
     path: str,
     user: User = Depends(current_user),
 ):
-    return await document_service.get_document_object(user.id, collection_id, document_id, path)
+    range_header = request.headers.get("range")  # Presumably None when the client sent no Range header (confirm Request.headers semantics).
+    return await document_service.get_document_object(user.id, collection_id, document_id, path, range_header)  # The raw header is passed through; the service parses it.
 
 
 @router.post("/collections/{collection_id}/documents/{document_id}/rebuild_indexes", tags=["documents"])