From d4ad9435d67de388684a1f79d00843e2d8da7efa Mon Sep 17 00:00:00 2001 From: Bryce Date: Mon, 27 Apr 2026 18:03:57 +0800 Subject: [PATCH 01/13] =?UTF-8?q?feat(celery=20Wave=205=20P1=20chunk=201):?= =?UTF-8?q?=20=C2=A7K.9=20spec=20+=20aperag/indexing/llm.py=20relocate=20(?= =?UTF-8?q?legacy=20graphindex=20elimination=20first=20chunk)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 task #26 first chunk per architect msg=10b5fae6 §K.9 spec amendment + PM msg=af82797e dispatch (single Wave 5 PR with 5-phase chunked-rotation commits, big-PR fast-landing per earayu2 msg=eced858d). Two changes bundled (per PM msg=af82797e "Phase 1 first commit = §K.9 spec paste + legacy graphindex elimination 实现起步"): 1. **§K.9 Wave 5 spec section** added to design pack: - 5-phase commit roadmap (16 acceptance items) - production-readiness 三类 layer per phase - architect direct ratify lane scope per phase - pre-check pattern lock per phase (per ``feedback_spec_lock_grep_verify_caller.md`` 双 pattern) - Layer 1 same-session continuation directive (per ``feedback_no_refresh_complete_all_tasks.md``) - Wave 5 acceptance summary (16 items + owner table) 2. **`aperag/indexing/llm.py` relocate** (§K.9.1 acceptance item 3): - new module owns canonical ``build_collection_llm_callable`` + ``render_extraction_prompt`` + ``ENTITY_RELATION_EXTRACTION`` template + ``LLMCall`` type alias - legacy ``aperag/domains/knowledge_graph/graphindex/integration.py`` and ``prompts.py`` re-export from new location during Wave 5 deprecation window so legacy retrieval/curation callers keep working until Phase 1 close-out - ``aperag/indexing/graph_extractor.py`` updated to import from new ``aperag.indexing.llm`` module directly (instead of legacy graphindex bridge) - test ``test_graph_extractor.py`` monkey-patch sites updated to target ``aperag.indexing.llm`` module Pre-check pattern 1 grep-verify caller cascade (per architect msg= b26f64b2 chunk 4d Option C ruling, used as reference list for Phase 1 migration scope): - ``aperag/indexing/`` → 0 cross-references to legacy ``graphindex/storage/*`` (chunk 4d narrowed scope invariant maintained ✅) - legacy ``graphindex/integration.py`` and ``prompts.py`` callers: ``retrieval/pipeline.py:85`` / ``knowledge_graph/service.py:69+`` / ``graph_curation/service.py:37`` / ``graph_curation/integration.py:21`` / ``service/prompt_template_service.py:161`` — remaining migrations land in subsequent Phase 1 commits Local gates green: - ``ruff check ./aperag ./tests`` clean - ``ruff format --check ./aperag ./tests`` clean (509 files) - ``pytest test_graph_extractor.py + test_full_indexing_pipeline.py + test_worker_factory.py + tests/unit_test/indexing/``: 179 passed / 2 skipped (Layer 2 stubs) Production-readiness three-class layer (Phase 1 partial): - must-be-real: ``aperag.indexing.llm`` module is the canonical production home for the relocated LLM helpers (no import indirection through legacy package once retrieval/curation migration completes) - may-be-gated: legacy ``graphindex/{integration,prompts}.py`` re-export shims kept active during Wave 5 deprecation window so cross-cutting refactor can land incrementally without breaking legacy retrieval/curation flow - fully-resolves: §K.9.1 Wave 5 acceptance item 3 (`aperag/indexing/llm.py` relocate); items 1 (legacy graphindex package elimination) and rest of Phase 1 cascade migration land in subsequent commits Next Phase 1 commits: migrate ``retrieval/pipeline.py`` / ``knowledge_graph/service.py`` / ``graph_curation/*`` to §G.5 read primitives; once all callers migrated → delete legacy ``graphindex/{storage, service, integration, engine, __init__}.py`` + delete legacy tests; final commit verifies grep-zero invariant. Co-Authored-By: Claude Opus 4.7 --- .../knowledge_graph/graphindex/integration.py | 58 +---- .../knowledge_graph/graphindex/prompts.py | 106 +------- aperag/indexing/graph_extractor.py | 8 +- aperag/indexing/llm.py | 233 ++++++++++++++++++ .../indexing-redesign-design-pack.md | 144 +++++++++++ tests/integration/test_graph_extractor.py | 4 +- 6 files changed, 394 insertions(+), 159 deletions(-) create mode 100644 aperag/indexing/llm.py diff --git a/aperag/domains/knowledge_graph/graphindex/integration.py b/aperag/domains/knowledge_graph/graphindex/integration.py index 80cfd3aad..fa5622396 100644 --- a/aperag/domains/knowledge_graph/graphindex/integration.py +++ b/aperag/domains/knowledge_graph/graphindex/integration.py @@ -39,7 +39,6 @@ from typing import Any, Awaitable, Optional from aperag.config import async_engine, build_graph_db_context, settings -from aperag.db.ops import db_ops from aperag.domains.knowledge_graph.graphindex.config import GraphIndexConfig from aperag.domains.knowledge_graph.graphindex.dto import ( DeleteDocumentResult, @@ -48,7 +47,13 @@ from aperag.domains.knowledge_graph.graphindex.service import GraphIndexService from aperag.domains.knowledge_graph.graphindex.storage.connector import GraphStoreAdaptor from aperag.domains.knowledge_graph.ports import CollectionRow -from aperag.schema.utils import parseCollectionConfig + +# Wave 5 P1 chunk 1 (`aperag/indexing/llm.py` relocate per §K.9.1 item 3): +# the canonical home for ``build_collection_llm_callable`` is now +# :mod:`aperag.indexing.llm`. Re-exported here so legacy callers that +# still import from this module keep working during the Wave 5 +# deprecation window. +from aperag.indexing.llm import build_collection_llm_callable # noqa: F401 logger = logging.getLogger(__name__) @@ -91,55 +96,6 @@ def _build_store(): # --------------------------------------------------------------------------- -def build_collection_llm_callable(collection: CollectionRow): - """Construct the ``LLMCall`` for a specific collection. - - Reads the collection's completion config (provider, model, base_url, - api key from the user's provider record) and returns an async - function ``(prompt) -> str``. The function is closure-bound to the - config so multiple concurrent calls can share it safely without - serialising on a global LLM client. - - The ``CompletionService`` is instantiated once per callable and - reused across every chunk extraction for the same document. Before - this, each per-chunk call rebuilt the client (litellm import, HTTP - session, etc.) — a meaningful overhead on high-chunk documents. - """ - config = parseCollectionConfig(collection.config) - if not config.completion or not config.completion.model_id: - raise RuntimeError(f"graphindex: completion model not configured (collection {collection.id})") - row = db_ops.query_model_runtime(config.completion.model_id, collection.user) - if not row: - raise RuntimeError(f"graphindex: model not found {config.completion.model_id!r} (collection {collection.id})") - model, account = row - - # Local import: CompletionService pulls in litellm which is heavy at - # import time and we don't want to pay it just for ``import aperag.domains.knowledge_graph.graphindex``. - from aperag.llm.completion.completion_service import CompletionService - from aperag.llm.runtime.resolver import resolve_model_invocation_from_records - - invocation = resolve_model_invocation_from_records(model=model, account=account) - provider = invocation.runner_config.get("provider") - if not provider: - provider = "openai" if invocation.runner_type == "openai_compatible" else invocation.provider_type - - svc = CompletionService( - provider=provider, - model=invocation.provider_model_id, - base_url=invocation.base_url, - api_key=invocation.api_key, - temperature=0.0, # deterministic output for extraction - max_tokens=None, - caching=False, - ) - - async def _llm(prompt: str) -> str: - # No history, no images, no memory. Single-turn JSON request. - return await svc.agenerate(history=[], prompt=prompt, images=[], memory=False) - - return _llm - - def build_collection_embed_callables(collection: CollectionRow): """Build the ``embed_query`` and ``embed_texts`` callables for vector-based entity/relation recall. diff --git a/aperag/domains/knowledge_graph/graphindex/prompts.py b/aperag/domains/knowledge_graph/graphindex/prompts.py index f86b5a172..84cdaa068 100644 --- a/aperag/domains/knowledge_graph/graphindex/prompts.py +++ b/aperag/domains/knowledge_graph/graphindex/prompts.py @@ -35,106 +35,12 @@ from __future__ import annotations -ENTITY_RELATION_EXTRACTION: str = """\ -You are an information-extraction assistant building a knowledge graph. - -Read the TEXT below and return a single JSON object with two arrays: -``entities`` and ``relations``. Do not output anything outside the JSON -object. - -**Rules** - -1. Output language: {language}. Names keep their original script and - case (e.g. English names capitalized, Chinese names unchanged). -2. Use only these entity types: {entity_types}. If no provided type - fits, skip the entity rather than invent a new type. -3. Every entity needs a short, self-contained description in - {language}. Do not add information that isn't in the text. -4. Every relation must reference entities by the exact ``name`` you put - in the ``entities`` list. No self-loops (source != target). -5. ``weight`` is an integer 1-10 expressing how strongly the text - supports the relation; default to 5 when unsure. -6. Cap: at most {max_entities} entities and {max_relations} relations. - If the text contains more, prefer the most specific and the - most-mentioned. -7. If the text has no extractable entities, return - ``{{"entities": [], "relations": []}}``. - -**JSON schema** - -``` -{{ - "entities": [ - {{"name": "", "type": "", - "description": ""}} - ], - "relations": [ - {{"source": "", "target": "", - "description": "", "weight": }} - ] -}} -``` - -**Example** (English, types=[person, organization]): - -Text: -``` -Alice, a researcher at Acme Labs, collaborated with Bob on the project. -``` - -Output: -``` -{{ - "entities": [ - {{"name": "Alice", "type": "person", - "description": "A researcher at Acme Labs."}}, - {{"name": "Bob", "type": "person", - "description": "A collaborator on the project."}}, - {{"name": "Acme Labs", "type": "organization", - "description": "Research organization where Alice works."}} - ], - "relations": [ - {{"source": "Alice", "target": "Bob", - "description": "Alice and Bob collaborated on a project.", - "weight": 7}}, - {{"source": "Alice", "target": "Acme Labs", - "description": "Alice is a researcher employed by Acme Labs.", - "weight": 8}} - ] -}} -``` - ---- - -TEXT: -``` -{input_text} -``` - -Output (JSON only):""" - - -def render_extraction_prompt( - *, - input_text: str, - entity_types: list[str] | tuple[str, ...], - language: str, - max_entities: int, - max_relations: int, -) -> str: - """Fill in the extraction prompt template for one chunk batch. - - Kept as a simple function so tests can snapshot the exact rendered - string and catch accidental prompt regressions. - """ - return ENTITY_RELATION_EXTRACTION.format( - input_text=input_text, - entity_types=", ".join(entity_types), - language=language, - max_entities=max_entities, - max_relations=max_relations, - ) - +# Wave 5 P1 chunk 1 (`aperag/indexing/llm.py` relocate per §K.9.1 item 3): +# ``ENTITY_RELATION_EXTRACTION`` + ``render_extraction_prompt`` 已 relocate +# 到 :mod:`aperag.indexing.llm`. 这里 re-export 保持 legacy callers 跨 +# Wave 5 deprecation window 不破。一旦 retrieval/curation Phase 1 +# close-out,整个 legacy module 删除。 +from aperag.indexing.llm import ENTITY_RELATION_EXTRACTION, render_extraction_prompt # noqa: F401 DESCRIPTION_SUMMARIZATION: str = """\ You are consolidating a knowledge-graph entry. diff --git a/aperag/indexing/graph_extractor.py b/aperag/indexing/graph_extractor.py index de0893eb4..c7bd21dfd 100644 --- a/aperag/indexing/graph_extractor.py +++ b/aperag/indexing/graph_extractor.py @@ -99,9 +99,7 @@ def build_collection_graph_extractor(collection: Any) -> GraphExtractor: FAILED with the message rather than silently building a worker that will fail at dispatch time. """ - from aperag.domains.knowledge_graph.graphindex.integration import ( - build_collection_llm_callable, - ) + from aperag.indexing.llm import build_collection_llm_callable from aperag.indexing.worker_factory import WorkerFactoryError try: @@ -179,9 +177,7 @@ async def _extract_one_chunk( timeout we propagate :class:`asyncio.TimeoutError` to the caller which already logs + skips the chunk. """ - from aperag.domains.knowledge_graph.graphindex.prompts import ( - render_extraction_prompt, - ) + from aperag.indexing.llm import render_extraction_prompt prompt = render_extraction_prompt( input_text=text, diff --git a/aperag/indexing/llm.py b/aperag/indexing/llm.py new file mode 100644 index 000000000..e82cf3add --- /dev/null +++ b/aperag/indexing/llm.py @@ -0,0 +1,233 @@ +# Copyright 2025 ApeCloud, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LLM helpers for the new indexing pipeline — Wave 5 P1 chunk 1. + +Relocates two helpers out of the legacy ``graphindex/`` package so the +new indexing pipeline (``aperag/indexing/``) does not depend on legacy +package paths. Per architect msg=87e2b187 chunk 4d Option C ruling + +Wave 5 task #26 spec — the legacy ``aperag/domains/knowledge_graph/ +graphindex/`` package is being phased out as a coordinated batch in +Wave 5; this module is the new home for the two LLM-callable + prompt- +template helpers that the new ``aperag.indexing.graph_extractor`` +already consumes. + +Public surface: + +* :func:`build_collection_llm_callable` — read the collection's + completion config and return an async ``(prompt) -> str`` closure + bound to a single :class:`CompletionService` instance. The + closure is reusable across many extractor calls for the same + collection so we pay the LiteLLM import + HTTP session setup only + once per collection per process. +* :func:`render_extraction_prompt` — fill in the canonical entity / + relation extraction prompt template for one chunk batch. Kept as a + separate function so tests can snapshot the rendered string and + catch accidental prompt regressions. + +The legacy modules (``graphindex.integration.build_collection_llm_callable`` +and ``graphindex.prompts.render_extraction_prompt``) re-export from this +module during the Wave 5 deprecation window so external callers (legacy +graphindex retrieval / curation paths) keep working until the cross- +cutting refactor lands the migration. Once those callers move to the +§G.5 read primitives, the legacy modules can be deleted with no +behavioural change. +""" + +from __future__ import annotations + +import logging +from typing import Awaitable, Callable + +from aperag.db.ops import db_ops +from aperag.domains.knowledge_graph.ports import CollectionRow +from aperag.schema.utils import parseCollectionConfig + +logger = logging.getLogger(__name__) + + +LLMCall = Callable[[str], Awaitable[str]] +"""Async callable returning the model's response to a single prompt. +The new indexing pipeline's graph extractor consumes this exact shape +(``aperag.indexing.graph_extractor`` line ~165).""" + + +def build_collection_llm_callable(collection: CollectionRow) -> LLMCall: + """Construct the per-collection async LLM callable. + + Reads the collection's completion config (provider, model, base_url, + api key from the user's provider record) and returns an async + ``(prompt) -> str`` closure. The closure is bound to a single + :class:`CompletionService` instance so concurrent extractor calls + against the same collection share the underlying client without + paying the LiteLLM import + HTTP session setup per chunk. + + Raises ``RuntimeError`` if the collection has no completion model + configured or the configured model cannot be resolved against the + runtime registry. The :class:`aperag.indexing.graph_extractor` + builder catches this and wraps it in + :class:`aperag.indexing.worker_factory.WorkerFactoryError` so the + orchestrator finalises the row FAILED with operator-facing + diagnostics (Wave 3 lesson #10 explicit-fail-not-silent pattern). + """ + config = parseCollectionConfig(collection.config) + if not config.completion or not config.completion.model_id: + raise RuntimeError(f"indexing.llm: completion model not configured (collection {collection.id})") + row = db_ops.query_model_runtime(config.completion.model_id, collection.user) + if not row: + raise RuntimeError(f"indexing.llm: model not found {config.completion.model_id!r} (collection {collection.id})") + model, account = row + + # Local import: CompletionService pulls in litellm which is heavy + # at import time and we don't want to pay it just for + # ``import aperag.indexing.llm``. + from aperag.llm.completion.completion_service import CompletionService + from aperag.llm.runtime.resolver import resolve_model_invocation_from_records + + invocation = resolve_model_invocation_from_records(model=model, account=account) + provider = invocation.runner_config.get("provider") + if not provider: + provider = "openai" if invocation.runner_type == "openai_compatible" else invocation.provider_type + + svc = CompletionService( + provider=provider, + model=invocation.provider_model_id, + base_url=invocation.base_url, + api_key=invocation.api_key, + temperature=0.0, # deterministic output for extraction + max_tokens=None, + caching=False, + ) + + async def _llm(prompt: str) -> str: + # No history, no images, no memory. Single-turn JSON request. + return await svc.agenerate(history=[], prompt=prompt, images=[], memory=False) + + return _llm + + +# --------------------------------------------------------------------- +# Prompt templates — relocated from ``graphindex.prompts``. +# --------------------------------------------------------------------- + + +ENTITY_RELATION_EXTRACTION: str = """\ +You are an information-extraction assistant building a knowledge graph. + +Read the TEXT below and return a single JSON object with two arrays: +``entities`` and ``relations``. Do not output anything outside the JSON +object. + +**Rules** + +1. Output language: {language}. Names keep their original script and + case (e.g. English names capitalized, Chinese names unchanged). +2. Use only these entity types: {entity_types}. If no provided type + fits, skip the entity rather than invent a new type. +3. Every entity needs a short, self-contained description in + {language}. Do not add information that isn't in the text. +4. Every relation must reference entities by the exact ``name`` you put + in the ``entities`` list. No self-loops (source != target). +5. ``weight`` is an integer 1-10 expressing how strongly the text + supports the relation; default to 5 when unsure. +6. Cap: at most {max_entities} entities and {max_relations} relations. + If the text contains more, prefer the most specific and the + most-mentioned. +7. If the text has no extractable entities, return + ``{{"entities": [], "relations": []}}``. + +**JSON schema** + +``` +{{ + "entities": [ + {{"name": "", "type": "", + "description": ""}} + ], + "relations": [ + {{"source": "", "target": "", + "description": "", "weight": }} + ] +}} +``` + +**Example** (English, types=[person, organization]): + +Text: +``` +Alice, a researcher at Acme Labs, collaborated with Bob on the project. +``` + +Output: +``` +{{ + "entities": [ + {{"name": "Alice", "type": "person", + "description": "A researcher at Acme Labs."}}, + {{"name": "Bob", "type": "person", + "description": "A collaborator on the project."}}, + {{"name": "Acme Labs", "type": "organization", + "description": "Research organization where Alice works."}} + ], + "relations": [ + {{"source": "Alice", "target": "Bob", + "description": "Alice and Bob collaborated on a project.", + "weight": 7}}, + {{"source": "Alice", "target": "Acme Labs", + "description": "Alice is a researcher employed by Acme Labs.", + "weight": 8}} + ] +}} +``` + +--- + +TEXT: +``` +{input_text} +``` + +Output (JSON only):""" + + +def render_extraction_prompt( + *, + input_text: str, + entity_types: list[str] | tuple[str, ...], + language: str, + max_entities: int, + max_relations: int, +) -> str: + """Fill in the extraction prompt template for one chunk. + + Kept as a simple function so tests can snapshot the exact rendered + string and catch accidental prompt regressions. The template is + identical to the legacy ``graphindex.prompts.ENTITY_RELATION_EXTRACTION`` + text (relocated here verbatim during Wave 5 P1). + """ + return ENTITY_RELATION_EXTRACTION.format( + input_text=input_text, + entity_types=", ".join(entity_types), + language=language, + max_entities=max_entities, + max_relations=max_relations, + ) + + +__all__ = [ + "ENTITY_RELATION_EXTRACTION", + "LLMCall", + "build_collection_llm_callable", + "render_extraction_prompt", +] diff --git a/docs/modularization/indexing-redesign-design-pack.md b/docs/modularization/indexing-redesign-design-pack.md index 26f94fa83..20d28b20e 100644 --- a/docs/modularization/indexing-redesign-design-pack.md +++ b/docs/modularization/indexing-redesign-design-pack.md @@ -1614,6 +1614,150 @@ v2 把 v1 的 7 个细粒度 PR 收成 **3 个 wave**(≈3 个大 PR)。每 - W5 cleanup builder share helpers with dispatch builders (T2 obs B drift risk) - W5 e2e-http-compose lane stub model-provider fixture + Layer 2 full-pipeline test activation (chunk 4e Layer 2 tests are currently `pytest.skip(...)` stubs because vector/fulltext/summary embedders need a configured model-provider that local dev does not have; the e2e-http-compose lane has the scaffolding but the stub fixture must be wired so Layer 2 can run for real instead of skip — per architect msg=87e2b187 chunk 4d/4e ratify decision condition #3) +### K.9. Wave 5 — Indexing program close-out + 16 backlog items(per architect msg=11442bbf reframe + earayu2 msg=eced858d "大PR 快速落地" directive) + +**目标**: 把 Wave 4 close-out 后剩余 16 backlog items 一次性 implement,single Wave 5 PR with 5-phase chunked-rotation commits。Wave 4 同款 batch pattern (PR #1731 推 11 items)。 + +**5-phase commit roadmap (16 acceptance items)**: + +#### Phase 1 (sequential first) — Legacy graphindex package 整体淘汰 + +- delete `aperag/domains/knowledge_graph/graphindex/{storage, service, integration, engine, __init__}.py`(per chunk 4d Option C ruling msg=b26f64b2 + §K.8 deferral) +- migrate retrieval/curation 调用 → §G.5 read primitives: + - `aperag/domains/retrieval/pipeline.py:85` `make_service_for_collection` + - `aperag/domains/knowledge_graph/service.py:69+` graph CRUD + - `aperag/graph_curation/service.py:37` + `integration.py:21` + - `aperag/indexing/worker_factory.py:696` `build_collection_llm_callable` +- relocate `build_collection_llm_callable` + `render_extraction_prompt` → 新 `aperag/indexing/llm.py` +- delete tests: `tests/unit_test/graphindex/test_connector.py` / `test_nebula_store.py` / `tests/integration/compat/test_graph_compat.py` + +**production-readiness 三类 layer**: +- must-be-real: §G.5 read primitives 真 wire retrieval/curation flow(not stubs) +- may-be-gated: 无 (full hard-cut) +- fully-resolves: §K.8 Wave 5 backlog item 1 (legacy graphindex elimination) + item 3 (`aperag/indexing/llm.py` relocate) + +**pre-check pattern** (per `feedback_spec_lock_grep_verify_caller.md` Pattern 1): +- grep `from aperag.domains.knowledge_graph.graphindex` 全验 caller cascade migration completeness +- grep-zero verify post-migration: `aperag/indexing/*` 不 import legacy graphindex (chunk 4d invariant maintained) + +**architect direct ratify scope**: caller migration completeness verify (post-Phase-1 grep-zero check); `aperag/indexing/llm.py` API surface align Wave 4 import path expectations。 + +#### Phase 2 (post-Phase-1) — T7 multimodal vision-LLM 3-item bundle + +per §G.2.5.1 spec amendment (Wave 4 chunk 4e follow-up `da576f62`): + +- **item 1**: `aperag/llm/embed/embedding_service.py` 加 `embed_image(image_bytes: bytes, alt_text: str = "") -> list[float]` API surface; provider implementation 通过 v3 router resolve +- **item 2**: parser pipeline 加 image extraction → write `derived/parse_/vision/images/.` (per-page image bytes for PDF / single-image-input passthrough); 同 batch 实施 chunk_id schema unification (per huangheng T1 obs B) +- **item 3**: `aperag/domains/model_platform/api/providers_v3_routes.py` 加 multimodal capability flag (`is_multimodal=True` source of truth); operator can set on collection embedder spec +- chunk 4b vision gate **self-disables** when `is_multimodal()` flips True + provider flag set + +**production-readiness 三类**: +- must-be-real: real multimodal LLM provider integration (CLIP / LLaVA / GPT-4V / etc. via v3 router) +- may-be-gated: 无 (full implementation lands) +- fully-resolves: §K.8 Wave 5 backlog item 4 (T7 multimodal full impl 3-item bundle) + +**pre-check** (per Pattern 2): grep `embed_image` / `is_multimodal` 全 verify 现 code binding (chunk 4b gate `is_multimodal()` call site already exists) + +**architect direct ratify scope**: §G.2.5.1 spec ↔ implementation align (3-item bundle full impl) + chunk 4b vision gate self-disable verify (`is_multimodal=True` 实测 + Layer 1 test rename `test_phase1_vision_modality_*` 改 expect ACTIVE) + +#### Phase 3 (parallel with Phase 2) — Layer 2 e2e fixture wiring + sweep D activation + +per chunk 4d+4e ratify msg=c279a0ff + sweep D Layer 2 stub (`4d36c7fb`): + +- e2e-http-compose lane stub model-provider fixture wire +- activate `test_phase1_full_pipeline_vector_fulltext_summary_active_graph_vision_failed` Layer 2 stub +- activate `test_phase1_multi_keyword_fulltext_search_returns_hits` Layer 2 stub (sweep D minimum_should_match real verify) + +**production-readiness 三类**: +- must-be-real: real Postgres / Redis / Qdrant / ES / OTel SDK + stub model-provider fixture +- may-be-gated: 无 (Layer 2 真激活) +- fully-resolves: §K.8 Wave 5 backlog item 2 (Layer 2 e2e fixture wiring) + +**architect direct ratify scope**: Layer 2 fixture wiring canonical contract (vector+fulltext+summary ACTIVE + chunk 4b vision gate FAILED in Phase 2 deferred state until Phase 2 lands flips to ACTIVE) + delete+cleanup roundtrip backend artefact 0-count verify + +#### Phase 4 (parallel with Phase 1) — P1 production robustness 3-pack + +3 commits in same phase batch: + +1. **T2 cleanup `_resolve_cleanup_worker` transient-vs-intentional error split** (per T2 architect ratify msg=6aa8ca88): + - `WorkerFactoryError` (intentional gate, drop row) vs 其他 `Exception` (transient, NOT drop row, retry next cycle) +2. **T3 chunk 2 parse_orchestrator parse_version short-circuit** (per huangheng T3 chunk 2 obs B): + - `if derived/parse_/chunks.jsonl 已存在 → skip DocParser,直接 dispatch` +3. **T2 reconciler "document N min 无 document_index rows → re-enqueue parse"** (per huangheng T3 chunk 2 obs A): + - cleanup loop 加 stuck document detection + parse re-enqueue + +**production-readiness 三类**: +- must-be-real: 3 production fault-tolerance fixes (real e2e flow benefit) +- may-be-gated: 无 +- fully-resolves: §K.8 Wave 5 backlog items 7 + 9 + 10 + +#### Phase 5A (post-Phase-4, Bryce batch) — P2 batch graph + perf polish + +5 commits in same phase batch (Bryce ownership per Wave 4 chunk 4 / T1 / T8 expertise): + +1. **T1 graph extractor per-collection config tunability** (per huangheng T1 obs A): `collection.config.knowledge_graph_config.{per_chunk_timeout, max_entities_per_chunk, max_relations_per_chunk}` per-collection override +2. **chunk 4b `_no_op_extractor` identity check → attribute marker** (per huangheng chunk 4b obs B): `getattr(extractor, 'is_no_op', False)` more robust pattern +3. **W5-perf-graph-lineage parallel-list O(N) alternative encoding** for high-cardinality entities (>10k docs/entity, per architect msg=39a74026) +4. **W5-neo4j-label-namespace prefix `aperag_LineageEntity` / `aperag_LineageRelation`** to avoid user-namespace collision +5. **W5-cypher-type-keyword rename `n.type` property** (Cypher `TYPE()` keyword shadow); cross-backend rename also in Postgres/Nebula + +**fully-resolves**: §K.8 Wave 5 backlog items 5 + 12 (其中 12 fold 进 PR-A T7 multimodal 实施 if applicable) + +#### Phase 5B (post-Phase-4, chenyexuan batch) — P2 batch infra + cleanup polish + +5 commits in same phase batch (chenyexuan ownership per Wave 4 T2 / T3 / T9 expertise): + +1. **T1 chunk_id schema unification on parser layer** (per huangheng T1 obs B; partially overlap with Phase 2 item 2) +2. **T2 cleanup builder share helper with dispatch builders** (per huangheng T2 obs B drift risk refactor) +3. **T3 chunk 2 `tenant_scope_key` org-prefix forward-compat** (per T3 chunk 2 obs C) +4. **chunk 4a `utc_now` Python default vs `CURRENT_TIMESTAMP` server_default unification** (per huangheng chunk 4a obs A) +5. **W5-otlp-config-cross-check** lifespan startup cross-check `INDEXING_METRICS_EMITTER=otlp` ⇔ `APERAG_OBSERVABILITY_MODE=otlp` (per huangheng T6 chunk 1 obs) + +**fully-resolves**: §K.8 Wave 5 backlog items 6 + 8 + 11 + remaining accumulated obs + +#### K.9.1. Wave 5 acceptance summary (16 items) + +| # | Item | Phase | Owner | +|---|---|---|---| +| 1 | Legacy graphindex package 整体淘汰 | 1 | Bryce | +| 2 | Layer 2 e2e fixture wiring | 3 | chenyexuan | +| 3 | `aperag/indexing/llm.py` relocate | 1 | Bryce | +| 4 | T7 multimodal vision-LLM 3-item bundle | 2 | Bryce | +| 5 | T1 graph extractor per-collection config tunability | 5A | Bryce | +| 6 | T1 chunk_id schema unification | 2 / 5B | Bryce / chenyexuan | +| 7 | T2 cleanup transient-vs-intentional error split | 4 | chenyexuan | +| 8 | T2 cleanup builder share helper | 5B | chenyexuan | +| 9 | T2 reconciler stuck-document re-enqueue | 4 | chenyexuan | +| 10 | T3 chunk 2 parse_version short-circuit | 4 | chenyexuan | +| 11 | T3 chunk 2 tenant_scope_key org-prefix | 5B | chenyexuan | +| 12 | fulltext additional backends extension | 5A | Bryce | +| 13 | chunk 4a utc_now vs CURRENT_TIMESTAMP unify | 5B | chenyexuan | +| 14 | chunk 4b _no_op_extractor identity check robust | 5A | Bryce | +| 15 | W5-perf-graph-lineage parallel-list O(N) | 5A | Bryce | +| 16 | W5-otlp-config-cross-check + Neo4j label namespace + Cypher type rename | 5A / 5B | Bryce / chenyexuan | + +#### K.9.2. Wave 5 architect direct ratify lane + +per Wave 4 lane lock (chunk 4 / T1 / T5 / T7 ratify trail), Wave 5 architect direct ratify scope: + +- **Phase 1**: legacy graphindex elimination caller migration completeness verify (grep-zero post-Phase-1 + `aperag/indexing/llm.py` relocate clean) +- **Phase 2**: T7 §G.2.5.1 spec ↔ implementation align (3-item bundle full impl) + chunk 4b vision gate self-disable verify +- **Phase 3**: Layer 2 e2e fixture wiring canonical activation +- **Phase 4 / 5A / 5B**: 走 huangheng pass-1 lane only; architect spot-check on architectural soundness。如出现 spec/state-binding drift → architect direct ratify trigger (per Wave 4 T5 §H.5.1 fix-forward 教训) + +#### K.9.3. Pre-check pattern lock (per `feedback_spec_lock_grep_verify_caller.md` 双 pattern) + +- **Pattern 1** (caller cascade): Phase 1 必跑 — grep `from aperag.domains.knowledge_graph.graphindex` 全 caller list verify migration scope +- **Pattern 2** (state binding): 任何新 `collection.config.*` field / `settings.*` setting / logical db / port / path lock — 必跑 grep `from settings import` / `getattr(cfg,...)` 现 code binding verify + +#### K.9.4. Layer 1 same-session continuation directive (per `feedback_no_refresh_complete_all_tasks.md`) + +Wave 5 implementation 同 Wave 4 directive: +- earayu2 不再 帮 fresh-restart agent session +- 各 implementer same-session continuation push 节奏 maintained +- Layer 1 metric goal: ≥ 90% same-session ratio (Wave 4 verified 65% same-session + 100% pass-1 一次过率) +- chunked-rotation within PR (multiple commits in same branch / same session) + ### K.7. 测试策略 - **Unit tests**: 每个 modality 配幂等自测(Wave 1 gate) diff --git a/tests/integration/test_graph_extractor.py b/tests/integration/test_graph_extractor.py index 69a017a1c..536cb32ea 100644 --- a/tests/integration/test_graph_extractor.py +++ b/tests/integration/test_graph_extractor.py @@ -213,7 +213,7 @@ async def _flaky_llm(_prompt: str) -> str: # Stub the legacy integration so the extractor builder doesn't try # to look up a real model provider. - import aperag.domains.knowledge_graph.graphindex.integration as _integration + import aperag.indexing.llm as _integration monkeypatch.setattr( _integration, @@ -242,7 +242,7 @@ def test_extractor_builder_raises_when_completion_model_missing(monkeypatch: pyt builder wraps the failure in :class:`WorkerFactoryError` so the orchestrator can finalise the row FAILED with a clear message.""" - import aperag.domains.knowledge_graph.graphindex.integration as _integration + import aperag.indexing.llm as _integration def _no_llm(_coll): raise ValueError("no completion model configured for collection col-x") From 07d51cc41a5dd51017a7383773848b668106b97e Mon Sep 17 00:00:00 2001 From: Bryce Date: Mon, 27 Apr 2026 18:09:41 +0800 Subject: [PATCH 02/13] refactor(celery Wave 5 P1 chunk 2): relocate ENTITY_RELATION_EXTRACTION import to aperag/indexing/llm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 task #26 chunk 2 per §K.9.1 acceptance item 3 cascade (legacy graphindex caller migration): `aperag/service/prompt_template_service.py:155-163` (the `get_default_prompt(prompt_type="graph")` branch) was importing the ENTITY_RELATION_EXTRACTION template from the legacy `aperag.domains.knowledge_graph.graphindex.prompts` module. Per chunk 1 relocate (`11113acb`), the canonical home for the template is now `aperag.indexing.llm`. This commit migrates the import site so the caller no longer transitively touches the legacy module. Behavior unchanged — the legacy `graphindex/prompts.py` shim already re-exports the template from the new location, so this is a pure import-path refactor that leaves the runtime payload identical. Local gates: - `ruff check ./aperag ./tests` clean - `ruff format --check ./aperag ./tests` clean (509 files) Phase 1 cascade progress: - chunk 1 ✅ (`11113acb`): §K.9 spec + `aperag/indexing/llm.py` relocate - **chunk 2** (this commit): `service/prompt_template_service.py` import relocate - chunk 3+ pending: `retrieval/pipeline.py` / `knowledge_graph/service.py` / `graph_curation/*` migration (these need new design — legacy 24-method `GraphStore` Protocol → new 10-method `LineageGraphStore` Protocol API surface is NOT a 1-to-1 replacement; per Bryce msg=30c7e994 scope reality check + architect msg=b052b1b4 cascade ratify lane lock) Co-Authored-By: Claude Opus 4.7 --- aperag/service/prompt_template_service.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/aperag/service/prompt_template_service.py b/aperag/service/prompt_template_service.py index 6de3ab0bc..af380bbae 100644 --- a/aperag/service/prompt_template_service.py +++ b/aperag/service/prompt_template_service.py @@ -153,12 +153,13 @@ def get_hardcoded_index_prompt(prompt_type: str) -> Optional[str]: Hardcoded prompt content, or None if not available """ if prompt_type == "graph": - # Return the graphindex v2 extraction prompt template. Unlike - # LightRAG's parameterised ``entity_extraction``, this one - # expects ``{input_text}`` / ``{entity_types}`` / ``{language}`` - # / ``{max_entities}`` / ``{max_relations}`` to be filled by the - # caller (see ``aperag.domains.knowledge_graph.graphindex.prompts.render_extraction_prompt``). - from aperag.domains.knowledge_graph.graphindex.prompts import ENTITY_RELATION_EXTRACTION + # Return the canonical entity/relation extraction prompt template. + # Wave 5 P1 chunk 1 relocate (per §K.9.1 item 3): the canonical + # home is now :mod:`aperag.indexing.llm`. The template expects + # ``{input_text}`` / ``{entity_types}`` / ``{language}`` / + # ``{max_entities}`` / ``{max_relations}`` to be filled by the + # caller (see :func:`aperag.indexing.llm.render_extraction_prompt`). + from aperag.indexing.llm import ENTITY_RELATION_EXTRACTION return ENTITY_RELATION_EXTRACTION elif prompt_type == "summary": From 0fd01e8356626bb1e07f8a04fae8cd88f6f289c8 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:08:40 +0800 Subject: [PATCH 03/13] =?UTF-8?q?feat(celery=20Wave=205=20P4):=20productio?= =?UTF-8?q?n=20robustness=203-pack=20=E2=80=94=20cleanup=20transient/inten?= =?UTF-8?q?tional=20split=20+=20parse=5Fversion=20short-circuit=20+=20reco?= =?UTF-8?q?nciler=20stuck-parse=20re-enqueue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 Phase 4 (PR-C / task #29) — closes 3 latent issues surfaced through Wave 4 ratify trail: * **P4-1 cleanup transient-vs-intentional split** (per architect msg=6aa8ca88 T2 ratify minor obs A): Pre-Wave-5 ``_resolve_cleanup_worker`` collapsed any factory exception into "drop the DB row". Transient infra errors (Qdrant blip, ES network glitch) lost the retry signal — DB row was deleted before next cycle could retry. Wave 5 P4 returns a new ``CleanupWorkerResolution(worker, transient)`` distinguishing the two: ``WorkerFactoryError`` is a by-design gate (drop the row to bound index growth), any other Exception is transient (skip DB row drop so next cycle retries when backend recovers). Counts surface ``transient_deferred`` so operators can track recovery rate. * **P4-2 parse_version short-circuit** (per huangheng T3 chunk 2 obs B + Wave 5 backlog item): Pre-Wave-5 ``parse_document`` always re-runs DocParser even when the resulting artifact directory is byte-identical (parse_version is content-derived). Wave 5 P4 adds ``short_circuit_if_artifacts_exist=True`` default — if all three derived artifacts (markdown.md / outline.json / chunks.jsonl) already exist under the canonical ``derived/parse_/`` path, skip DocParser + writes entirely. Eliminates the ~30s OCR / Word rerun cost on rebuild of unchanged content. Tests can pass ``False`` to force re-parse. * **P4-3 reconciler stuck-document parse re-enqueue** (per architect Wave 4 T3 chunk 2 obs A — production gap close): Pre-Wave-5 parse failures (DocParser raise / source missing) silently dropped the document_id in the parse worker; operator saw ``document.status == PENDING`` forever with no signal to re-trigger. Wave 5 P4 adds ``reconcile_stuck_documents_for_parse_reenqueue`` to the reconciler loop. Detects documents with ``Document.gmt_created < now - cooldown_seconds`` AND zero ``document_index`` rows AND ``Document.gmt_updated < now - cooldown_seconds`` (cooldown filter prevents 30-s tick storm), then pushes a fresh ``ParseDispatchPayload`` matching the upload handler's contract. ``gmt_updated`` bumps after each push so the cooldown predicate rate-limits re-enqueue. Three-class tag (Wave 3 production-readiness invariant): * must-be-real: cleanup transient/intentional split prevents silent retry-signal loss; parse_version short-circuit eliminates OCR rerun waste; stuck-parse reconciler closes the document.status PENDING-forever gap * may-be-gated: ``short_circuit_if_artifacts_exist=False`` for tests pinning DocParser invocation count * fully-resolves: 3 of Wave 5 P1 backlog items (per architect ratify trail msg=6aa8ca88 + huangheng obs trail) Deltas: * ``aperag/indexing/cleanup.py`` — ``CleanupWorkerResolution`` dataclass + WorkerFactoryError-vs-Exception split in ``_resolve_cleanup_worker``; both ``cleanup_orphan_parse_versions`` + ``cleanup_for_deleted_documents`` honor ``resolution.transient`` to skip DB row drop on transient infra failures; collection cascade aggregates ``transient_deferred``. * ``aperag/indexing/parser.py`` — ``_all_artifacts_present`` predicate + ``short_circuit_if_artifacts_exist`` parameter + early-return ParseResult when all 3 artifacts present. * ``aperag/indexing/reconciler.py`` — ``STUCK_PARSE_COOLDOWN_SECONDS`` + ``reconcile_stuck_documents_for_parse_reenqueue`` async scan + ``_select_stuck_documents_for_reenqueue`` SQL query + ``_build_parse_payload_for_document`` payload reconstruction + ``_resolve_collection_parser_config`` / ``_resolve_collection_modalities`` (mirror ``document_service`` shape) + ``_mark_stuck_documents_reenqueued`` bumps gmt_updated; ``run_reconcile_loop`` calls the new scan. * ``aperag/indexing/__init__.py`` — re-export new symbols. * ``tests/integration/test_p4_robustness_3pack.py`` (new) — 13 tests across 3 layers (cleanup transient/intentional split / parse short-circuit / reconciler re-enqueue with cooldown + payload shape + skip-when-indexed + skip-when-no-object-path). * ``tests/unit_test/indexing/test_t3_1_dispatcher_path_c.py`` — added ``transient_deferred: 0`` to expected empty-counts dict. Local gates: * ``pytest tests/unit_test/indexing/ tests/integration/`` — 232 passed / 48 skipped (incl. 13 new ``test_p4_robustness_3pack``). * ``ruff check aperag/ tests/integration/test_p4_robustness_3pack.py`` — clean. * ``ruff format`` — applied. Branch: local ``chenyexuan/celery-wave5-p4`` based on main ``19d3d70`` (Wave 4 PR #1731 squash); will rebase onto ``bryce/celery-wave5`` once Bryce opens the Wave 5 draft PR. Co-Authored-By: Claude Opus 4.7 --- aperag/indexing/__init__.py | 6 +- aperag/indexing/cleanup.py | 116 +++- aperag/indexing/parser.py | 98 ++- aperag/indexing/reconciler.py | 265 +++++++- tests/integration/test_p4_robustness_3pack.py | 577 ++++++++++++++++++ .../indexing/test_t3_1_dispatcher_path_c.py | 1 + 6 files changed, 1014 insertions(+), 49 deletions(-) create mode 100644 tests/integration/test_p4_robustness_3pack.py diff --git a/aperag/indexing/__init__.py b/aperag/indexing/__init__.py index 25526f05c..6e36952e9 100644 --- a/aperag/indexing/__init__.py +++ b/aperag/indexing/__init__.py @@ -158,9 +158,11 @@ HEARTBEAT_STALE_SECONDS, RECONCILE_BATCH_SIZE, RECONCILE_INTERVAL_SECONDS, + STUCK_PARSE_COOLDOWN_SECONDS, reconcile_failed_retry, reconcile_pending_dispatch, reconcile_running_reclaim, + reconcile_stuck_documents_for_parse_reenqueue, run_reconcile_loop, ) from aperag.indexing.summary import ( @@ -276,13 +278,15 @@ "process_one_parse_task", "run_parse_worker", "run_parse_worker_loop", - # Reconciler (T2.1) + # Reconciler (T2.1 + Wave 5 P4 stuck-parse re-enqueue) "RECONCILE_INTERVAL_SECONDS", "RECONCILE_BATCH_SIZE", "HEARTBEAT_STALE_SECONDS", + "STUCK_PARSE_COOLDOWN_SECONDS", "reconcile_pending_dispatch", "reconcile_failed_retry", "reconcile_running_reclaim", + "reconcile_stuck_documents_for_parse_reenqueue", "run_reconcile_loop", # Cleanup (T2.1 + T3.1 path C) "CLEANUP_INTERVAL_SECONDS", diff --git a/aperag/indexing/cleanup.py b/aperag/indexing/cleanup.py index 87ab2c588..59b1f1216 100644 --- a/aperag/indexing/cleanup.py +++ b/aperag/indexing/cleanup.py @@ -84,6 +84,7 @@ import asyncio import logging +from dataclasses import dataclass from datetime import datetime, timedelta, timezone from typing import Any, Awaitable, Callable, Mapping, Optional @@ -92,6 +93,7 @@ from aperag.indexing.base import ModalityWorker from aperag.indexing.models import DocumentIndex, Modality +from aperag.indexing.worker_factory import WorkerFactoryError logger = logging.getLogger(__name__) @@ -164,45 +166,92 @@ def _is_graph_worker(worker: ModalityWorker) -> bool: return getattr(worker, "modality", None) is Modality.GRAPH +@dataclass(frozen=True) +class CleanupWorkerResolution: + """Wave 5 P4 T2 — outcome of :func:`_resolve_cleanup_worker`. + + Distinguishes the two failure modes that pre-Wave-5 collapsed into + a single ``None`` return: + + * **intentional gate** (``worker is None`` AND ``transient is False``) + — :class:`WorkerFactoryError` raised because the modality is + Wave-N-gated by design (graph extractor not wired / vision + multimodal not configured), the operator deliberately disabled + the modality via ``collection.config``, or the row's modality + string is unknown. The cleanup loop should drop the DB row so + the index does not grow unboundedly while the gate is active. + + * **transient infrastructure error** (``worker is None`` AND + ``transient is True``) — DB connection blip / Qdrant + unreachable / ES unhealthy / Redis network glitch. The cleanup + loop must NOT drop the DB row — the next cycle (5 min later) + retries automatically once the backend recovers. Pre-Wave-5 + this collapsed into the gate path and silently lost the retry + signal. + + * **resolved worker** (``worker is not None``, ``transient ignored``) + — happy path; caller proceeds with backend cleanup. + """ + + worker: Optional[ModalityWorker] + transient: bool + + async def _resolve_cleanup_worker( *, workers: Optional[Mapping[Modality, ModalityWorker]], worker_factory: Optional[WorkerFactoryForRow], row: DocumentIndex, -) -> Optional[ModalityWorker]: +) -> CleanupWorkerResolution: """Resolve the cleanup worker for a row from factory or static map. - Wave 4 T2: production wires the factory (per-(collection, modality) - lazy materialisation); tests typically pass a pre-built ``workers`` - mapping. When both are provided the factory wins — production - deployments override the legacy mapping with the per-row factory. - - Returns ``None`` when neither source can supply a worker; the - caller logs + counts the row as ``backend_skipped``. A factory - that raises :class:`WorkerFactoryError` (Wave 4 vision multimodal - gate / partial config) also returns ``None`` so the cleanup cycle - drops the DB row even when the backend is unreachable, matching - the existing "skip backend, drop row" semantics. + Wave 4 T2 + Wave 5 P4 (transient-vs-intentional split): production + wires the factory (per-(collection, modality) lazy materialisation); + tests typically pass a pre-built ``workers`` mapping. When both + are provided the factory wins — production deployments override + the legacy mapping with the per-row factory. + + Returns a :class:`CleanupWorkerResolution`: + + * ``worker is not None``: backend cleanup proceeds. + * ``worker is None, transient=False``: intentional gate / unknown + modality / no source — caller drops DB row. + * ``worker is None, transient=True``: transient infrastructure + error — caller skips DB row drop so next cycle retries. + + Pre-Wave-5 this returned ``None`` for both failure modes, + causing transient errors to silently lose their retry signal. """ if worker_factory is not None: try: - return await worker_factory(row) - except Exception as exc: # noqa: BLE001 — surface via log, count as skip + return CleanupWorkerResolution(worker=await worker_factory(row), transient=False) + except WorkerFactoryError as exc: logger.warning( - "cleanup worker_factory failed modality=%s row id=%d collection=%s: %s — counting as backend_skipped", + "cleanup worker_factory gate raised modality=%s row id=%d collection=%s: %s — " + "counting as backend_skipped (intentional gate, dropping DB row)", row.modality, row.id, row.collection_id, exc, ) - return None + return CleanupWorkerResolution(worker=None, transient=False) + except Exception as exc: # noqa: BLE001 — transient infra error, retry next cycle + logger.warning( + "cleanup worker_factory transient failure modality=%s row id=%d collection=%s: %s — " + "skipping DB row drop, will retry next cycle", + row.modality, + row.id, + row.collection_id, + exc, + ) + return CleanupWorkerResolution(worker=None, transient=True) if workers is None: - return None + return CleanupWorkerResolution(worker=None, transient=False) try: modality = Modality(row.modality) except ValueError: - return None - return workers.get(modality) + return CleanupWorkerResolution(worker=None, transient=False) + return CleanupWorkerResolution(worker=workers.get(modality), transient=False) # --------------------------------------------------------------------- @@ -301,6 +350,7 @@ async def cleanup_orphan_parse_versions( "rows_deleted": 0, "graph_noop": 0, "backend_skipped": 0, + "transient_deferred": 0, } delete_ids: list[int] = [] @@ -315,11 +365,18 @@ async def cleanup_orphan_parse_versions( ) continue - worker = await _resolve_cleanup_worker( + resolution = await _resolve_cleanup_worker( workers=workers, worker_factory=worker_factory, row=row, ) + if resolution.transient: + # Wave 5 P4: transient infra error — skip both backend + # delete AND DB row drop so the next cleanup cycle (5 min + # later) retries automatically once the backend recovers. + counts["transient_deferred"] += 1 + continue + worker = resolution.worker if worker is None: logger.warning( "cleanup no worker registered for modality=%s row id=%d — skipping backend delete", @@ -424,6 +481,7 @@ async def cleanup_for_deleted_documents( "graph_lineage_cleaned": 0, "rows_deleted": 0, "backend_skipped": 0, + "transient_deferred": 0, } # Per-document, per-modality dedup so graph lineage cleanup runs @@ -444,11 +502,18 @@ async def cleanup_for_deleted_documents( ) continue - worker = await _resolve_cleanup_worker( + resolution = await _resolve_cleanup_worker( workers=workers, worker_factory=worker_factory, row=row, ) + if resolution.transient: + # Wave 5 P4: transient infra error — skip backend delete + # AND DB row drop so the caller can retry on a later cycle + # / re-invocation once the backend recovers. + counts["transient_deferred"] += 1 + continue + worker = resolution.worker if worker is None: logger.warning( "cleanup no worker for modality=%s row id=%d document=%s — skipping backend", @@ -623,6 +688,7 @@ async def cleanup_for_deleted_collections( "graph_lineage_cleaned": 0, "rows_deleted": 0, "backend_skipped": 0, + "transient_deferred": 0, "collections_cleaned": 0, } if not collection_ids: @@ -641,7 +707,13 @@ async def cleanup_for_deleted_collections( worker_factory=worker_factory, document_ids=document_ids, ) - for key in ("backend_deleted", "graph_lineage_cleaned", "rows_deleted", "backend_skipped"): + for key in ( + "backend_deleted", + "graph_lineage_cleaned", + "rows_deleted", + "backend_skipped", + "transient_deferred", + ): counts[key] += sub_counts[key] # Sweep any rows that path B did not catch (no document_id match diff --git a/aperag/indexing/parser.py b/aperag/indexing/parser.py index ddfdee052..f62d8edac 100644 --- a/aperag/indexing/parser.py +++ b/aperag/indexing/parser.py @@ -329,6 +329,29 @@ def _document_md5(source_bytes: bytes) -> str: return hashlib.md5(source_bytes).hexdigest() +def _all_artifacts_present( + *, + store: _SyncObjectStore, + markdown_path: str, + outline_path: str, + chunks_path: str, +) -> bool: + """Wave 5 P4 short-circuit predicate: all three canonical + derived artifacts must exist for the cached parse to be valid. + + Uses ``ObjectStore.obj_exists`` (cheap metadata check) rather + than ``read_or_none`` so the predicate stays cost-bounded for + every parse call. ``chunks.jsonl`` is checked last because it is + the only artifact downstream modality workers actually read; if + it is missing the previous parse was interrupted mid-write and + re-parsing is required regardless. + """ + try: + return store.obj_exists(markdown_path) and store.obj_exists(outline_path) and store.obj_exists(chunks_path) + except Exception: # noqa: BLE001 — predicate fails closed (re-parse) + return False + + def _docparser_extract_markdown( *, source_bytes: bytes, @@ -398,6 +421,7 @@ def parse_document( source_filename: str | None = None, parser_config: dict[str, Any] | None = None, config: ParseConfig | None = None, + short_circuit_if_artifacts_exist: bool = True, ) -> ParseResult: """Parse the source bytes and persist the three shared artifacts. @@ -406,6 +430,16 @@ def parse_document( them atomically). The artifact paths are the canonical ``derived/parse_/`` layout per design pack §C.1. + Wave 5 P4 short-circuit: when ``short_circuit_if_artifacts_exist`` + is True (default) and all three derived artifacts (``markdown.md`` + / ``outline.json`` / ``chunks.jsonl``) already exist in the object + store under the canonical ``derived/parse_/`` path, the + parser **skips DocParser + writes entirely** and returns the + existing :class:`ParseResult`. This eliminates the ~30s OCR / Word + rerun cost when a document is re-uploaded with identical content + or a rebuild is dispatched against an already-parsed version + (per huangheng T3 chunk 2 obs B + architect Wave 5 P4 lock). + Dispatch (Wave 4 T3 chunk 1): - ``source_filename`` ends in a known text extension (``.md`` / ``.markdown`` / ``.txt`` / ``.text``) **or** is ``None`` → @@ -433,6 +467,12 @@ def parse_document( to the real parser chain. Ignored on the simulator path. config: Parsing knobs that influence the parse_version. Pass ``None`` to use simulator defaults. + short_circuit_if_artifacts_exist: When True (default), reuse + existing canonical artifacts if all three are already in + the object store under the resolved + ``derived/parse_/`` path. Pass ``False`` to force + a re-parse + re-write (used by tests pinning DocParser + invocation count). """ from aperag.mcp.tools.parse_version import compute_parse_version @@ -446,6 +486,45 @@ def parse_document( chunking_config=cfg.chunking.serialize(), ) + markdown_path = derived_artifact( + collection_id=collection_id, + document_id=document_id, + parse_version=parse_version, + filename="markdown.md", + ) + outline_path = derived_artifact( + collection_id=collection_id, + document_id=document_id, + parse_version=parse_version, + filename="outline.json", + ) + chunks_path = derived_artifact( + collection_id=collection_id, + document_id=document_id, + parse_version=parse_version, + filename="chunks.jsonl", + ) + + if short_circuit_if_artifacts_exist and _all_artifacts_present( + store=store, + markdown_path=markdown_path, + outline_path=outline_path, + chunks_path=chunks_path, + ): + logger.info( + "indexing parser short-circuit collection=%s document=%s parse_version=%s " + "(all derived artifacts already present; skipping DocParser + writes)", + collection_id, + document_id, + parse_version, + ) + return ParseResult( + parse_version=parse_version, + markdown_path=markdown_path, + outline_path=outline_path, + chunks_path=chunks_path, + ) + if extension is None or extension in _SIMULATOR_EXTENSIONS: try: markdown = source_bytes.decode("utf-8") @@ -470,25 +549,6 @@ def parse_document( outline = _build_outline(markdown) chunks = _split_chunks(markdown, cfg.chunking) - markdown_path = derived_artifact( - collection_id=collection_id, - document_id=document_id, - parse_version=parse_version, - filename="markdown.md", - ) - outline_path = derived_artifact( - collection_id=collection_id, - document_id=document_id, - parse_version=parse_version, - filename="outline.json", - ) - chunks_path = derived_artifact( - collection_id=collection_id, - document_id=document_id, - parse_version=parse_version, - filename="chunks.jsonl", - ) - write_atomic(store, markdown_path, markdown.encode("utf-8")) write_atomic( store, diff --git a/aperag/indexing/reconciler.py b/aperag/indexing/reconciler.py index affa1aebd..997abc8f0 100644 --- a/aperag/indexing/reconciler.py +++ b/aperag/indexing/reconciler.py @@ -80,6 +80,14 @@ # cycle will pick up the rest. RECONCILE_BATCH_SIZE = 100 +# Wave 5 P4: cooldown before a document with zero ``document_index`` +# rows is considered "stuck after upload" and re-enqueued onto +# ``q:parse``. Set to 5 minutes so a normal parse run (PDF ~30s, +# Office ~60s, OCR ~3min) never trips it. Operators can shorten the +# cooldown via ``stuck_parse_cooldown_seconds`` for tighter recovery +# on small-document workloads. +STUCK_PARSE_COOLDOWN_SECONDS = 300 + def _utcnow() -> datetime: return datetime.now(timezone.utc) @@ -255,11 +263,6 @@ def reconcile_running_reclaim( # canonical 3-statement TX. -# --------------------------------------------------------------------- -# Run loop — production entrypoint. -# --------------------------------------------------------------------- - - # --------------------------------------------------------------------- # Pattern B periodic hook — collection summary reconciliation. # --------------------------------------------------------------------- @@ -417,6 +420,247 @@ def _reclaim_stale_and_claim_pending() -> list[tuple[str, str, int, str]]: logger.info("collection-summary reconciler dispatched=%d", len(dispatches)) +# --------------------------------------------------------------------- +# Wave 5 P4 — stuck-document parse re-enqueue (closes T3 chunk 2 obs A +# silent-drop loop). +# --------------------------------------------------------------------- +# +# T3 chunk 2 q:parse async promotion logs + drops parse failures +# (DocParser raise / source missing) without persisting any +# ``document_index`` row. Pre-Wave-5 this surfaced as ``document.status +# == PENDING`` forever — the operator had no signal to re-trigger. +# This reconciler scan closes the loop by detecting documents that +# uploaded > N minutes ago but never sprouted a ``document_index`` +# row, then re-enqueueing a parse job. +# +# Architectural note: the scan is **at-most-once-per-cooldown**, not +# per cycle: every re-enqueue advances ``Document.gmt_updated`` so +# consecutive cycles do not multi-push. Without that throttle a +# stuck document would re-queue every 30s, drowning the parse +# worker pool. + + +async def reconcile_stuck_documents_for_parse_reenqueue( + *, + engine: Engine, + queue: WorkQueue, + cooldown_seconds: int = STUCK_PARSE_COOLDOWN_SECONDS, + batch_size: int = RECONCILE_BATCH_SIZE, +) -> int: + """Re-enqueue ``q:parse`` for documents stuck without ``document_index`` rows. + + A document is "stuck" when: + + * ``Document.gmt_deleted IS NULL`` (still active) + * ``Document.gmt_created < now - cooldown_seconds`` (uploaded long + enough ago that a normal parse should have completed and + dispatched) + * **zero** ``document_index`` rows reference its ``document_id`` + (parse never reached :func:`dispatch_indexing` — the parse worker + either died, the source was missing, or DocParser raised) + * ``Document.gmt_updated < now - cooldown_seconds`` (haven't + already been re-enqueued this cooldown window — prevents the + 30-s reconciler tick from drowning ``q:parse`` with the same + stuck doc on every cycle) + + For each stuck document the function pushes a fresh + :class:`ParseDispatchPayload` matching the upload handler's + contract (per ``document_service._create_or_update_document_indexes``). + The parse worker handles retries via its existing log-and-drop + semantics; this scan is the **producer-side recovery loop** that + closes T3 chunk 2 obs A. + + Returns the number of stuck documents re-enqueued. Failure to + reach Redis logs + bumps to next cycle (queue may be transiently + unhealthy; we never crash the loop). + """ + payloads = await asyncio.to_thread( + _select_stuck_documents_for_reenqueue, + engine, + cooldown_seconds, + batch_size, + ) + if not payloads: + return 0 + + pushed_ids: list[str] = [] + for payload_dict in payloads: + try: + await queue.push_parse(payload=payload_dict) + except Exception as exc: # noqa: BLE001 — transient Redis fault, retry next cycle + logger.warning( + "stuck-document parse re-enqueue failed for document=%s: %s — will retry next cycle", + payload_dict.get("document_id", ""), + exc, + ) + continue + pushed_ids.append(str(payload_dict["document_id"])) + + if pushed_ids: + await asyncio.to_thread(_mark_stuck_documents_reenqueued, engine, pushed_ids) + logger.info( + "reconciler stuck-parse: re-enqueued %d documents to q:parse", + len(pushed_ids), + ) + return len(pushed_ids) + + +def _select_stuck_documents_for_reenqueue( + engine: Engine, + cooldown_seconds: int, + batch_size: int, +) -> list[dict[str, object]]: + """Sync DB scan for stuck documents. Returns a list of fully-formed + parse payload dicts ready for ``queue.push_parse``. + + Runs through ``asyncio.to_thread`` from the async caller so the + SQLAlchemy / collection-config decoding cost does not block the + event loop. + """ + from aperag.domains.knowledge_base.db.models import Document + + threshold = _utcnow() - timedelta(seconds=cooldown_seconds) + stuck_dicts: list[dict[str, object]] = [] + with Session(engine) as session: + rows_for_doc = select(DocumentIndex.document_id).where(DocumentIndex.document_id == Document.id).exists() + stmt = ( + select(Document) + .where( + and_( + Document.gmt_deleted.is_(None), + Document.gmt_created < threshold, + Document.gmt_updated < threshold, + ~rows_for_doc, + ) + ) + .order_by(Document.gmt_created) + .limit(batch_size) + ) + for document in session.scalars(stmt): + payload = _build_parse_payload_for_document(session, document) + if payload is None: + continue + stuck_dicts.append(payload.to_dict()) + return stuck_dicts + + +def _build_parse_payload_for_document(session: Session, document): + """Reconstruct the parse payload the upload handler would have + pushed. Mirrors + :func:`aperag.domains.knowledge_base.service.document_service._create_or_update_document_indexes` + so the parse worker sees identical data shape regardless of which + producer enqueued the job. Returns ``None`` to signal an + unrecoverable document (missing object_path / no enabled modalities). + """ + import json as _json + + from aperag.domains.knowledge_base.db.models import Collection + from aperag.indexing.parse_orchestrator import ParseDispatchPayload + + metadata: dict[str, object] = {} + if document.doc_metadata: + try: + metadata = _json.loads(document.doc_metadata) or {} + except (TypeError, ValueError): + metadata = {} + object_path = metadata.get("object_path") + if not object_path: + logger.warning( + "stuck-document parse re-enqueue: skipping document=%s — doc_metadata.object_path missing", + document.id, + ) + return None + + collection = session.get(Collection, document.collection_id) if document.collection_id else None + parser_config = _resolve_collection_parser_config(collection) + modalities = _resolve_collection_modalities(collection) + if not modalities: + logger.warning( + "stuck-document parse re-enqueue: skipping document=%s — collection has no enabled modalities", + document.id, + ) + return None + + return ParseDispatchPayload( + document_id=document.id, + collection_id=document.collection_id or "", + object_path=str(object_path), + tenant_scope_key=f"user:{document.user}", + modalities=tuple(modalities), + parser_config=parser_config, + purge_existing_triples=True, + ) + + +def _resolve_collection_parser_config(collection) -> dict | None: + """Mirror ``document_service._resolve_parser_config_for_collection`` + in sync context. Defensive on every shape (None / non-JSON / + non-dict). + """ + import json as _json + + if collection is None or not collection.config: + return None + try: + config_dict = _json.loads(collection.config) + except (TypeError, ValueError): + return None + if not isinstance(config_dict, dict): + return None + parser_config = config_dict.get("parser_config") + if parser_config is None or not isinstance(parser_config, dict): + return None + return parser_config + + +def _resolve_collection_modalities(collection) -> list[str]: + """Walk ``collection.config.{enable_vector, enable_fulltext, + enable_knowledge_graph, enable_summary, enable_vision}`` to derive + the modality list. Default to vector + fulltext + summary (the + always-Wave-3-real subset) when the collection's config is + missing or malformed. + """ + import json as _json + + defaults = ["vector", "fulltext", "summary"] + if collection is None or not collection.config: + return defaults + try: + cfg = _json.loads(collection.config) + except (TypeError, ValueError): + return defaults + if not isinstance(cfg, dict): + return defaults + + modalities: list[str] = [] + if cfg.get("enable_vector", True): + modalities.append("vector") + if cfg.get("enable_fulltext", True): + modalities.append("fulltext") + if cfg.get("enable_knowledge_graph", False): + modalities.append("graph") + if cfg.get("enable_summary", False): + modalities.append("summary") + if cfg.get("enable_vision", False): + modalities.append("vision") + return modalities or defaults + + +def _mark_stuck_documents_reenqueued(engine: Engine, document_ids: list[str]) -> None: + """Bump ``Document.gmt_updated`` for the re-enqueued documents so + the next reconciler tick does not pick the same documents up + again until the next cooldown window. Re-using ``gmt_updated`` + keeps the scan single-table and avoids an alembic migration for + a recovery-loop bookkeeping field. + """ + if not document_ids: + return + from aperag.domains.knowledge_base.db.models import Document + + with Session(engine) as session, session.begin(): + session.execute(update(Document).where(Document.id.in_(document_ids)).values(gmt_updated=_utcnow())) + + # --------------------------------------------------------------------- # Run loop — production entrypoint. # --------------------------------------------------------------------- @@ -450,12 +694,17 @@ async def run_reconcile_loop( engine=engine, stale_seconds=stale_seconds, ) - if pushed or retried or reclaimed: + stuck_reenqueued = await reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + ) + if pushed or retried or reclaimed or stuck_reenqueued: logger.info( - "reconciler cycle: dispatched=%d retried=%d reclaimed=%d", + "reconciler cycle: dispatched=%d retried=%d reclaimed=%d stuck_reenqueued=%d", pushed, retried, reclaimed, + stuck_reenqueued, ) except Exception as exc: # noqa: BLE001 — keep the loop alive logger.exception("reconciler cycle failed: %s", exc) @@ -473,9 +722,11 @@ async def run_reconcile_loop( "HEARTBEAT_STALE_SECONDS", "RECONCILE_BATCH_SIZE", "RECONCILE_INTERVAL_SECONDS", + "STUCK_PARSE_COOLDOWN_SECONDS", "reconcile_collection_summaries_hook", "reconcile_failed_retry", "reconcile_pending_dispatch", "reconcile_running_reclaim", + "reconcile_stuck_documents_for_parse_reenqueue", "run_reconcile_loop", ] diff --git a/tests/integration/test_p4_robustness_3pack.py b/tests/integration/test_p4_robustness_3pack.py new file mode 100644 index 000000000..700b2c681 --- /dev/null +++ b/tests/integration/test_p4_robustness_3pack.py @@ -0,0 +1,577 @@ +# Copyright 2025 ApeCloud, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Wave 5 Phase 4 — production robustness 3-pack. + +Three latent issues from Wave 4 huangheng + architect ratify obs surface +silent-drop / wasted-work bugs that this phase locks down with explicit +behaviour: + +1. **T2 cleanup transient-vs-intentional split** — pre-Wave-5 + ``_resolve_cleanup_worker`` collapsed any factory exception into + "drop the DB row". This loses the retry signal when a transient + infra error (Qdrant blip, ES network glitch) makes a + ``WorkerFactoryError``-look-alike. Wave 5 P4 distinguishes the two + so transient errors keep the row for next-cycle retry; intentional + gates still drop the row to keep the index from growing. + +2. **T3 parse_orchestrator parse_version short-circuit** — pre-Wave-5 + :func:`parse_document` always re-runs DocParser even when the + resulting artifact directory is byte-identical (parse_version is + content-derived). Wave 5 P4 short-circuits when all three derived + artifacts are already present in the object store. + +3. **T2 reconciler stuck-document parse re-enqueue** — pre-Wave-5 + parse failures (DocParser raise / source missing) silently dropped + the document_id; the operator had no recovery path. Wave 5 P4 adds + a reconciler scan that re-enqueues ``q:parse`` for documents that + uploaded > N min ago without sprouting any ``document_index`` rows. + +Tests use SQLAlchemy in-memory SQLite + InMemoryWorkQueue + +InMemoryObjectStore so the suite stays fast (~ms per test) without +external dependencies. +""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timedelta, timezone + +import pytest +from sqlalchemy import Engine, create_engine, insert, select +from sqlalchemy.orm import Session +from sqlalchemy.pool import StaticPool + +from aperag.db.base import Base +from aperag.indexing import ( + InMemoryObjectStore, + InMemoryWorkQueue, + Modality, + parse_document, + reconcile_stuck_documents_for_parse_reenqueue, +) +from aperag.indexing.cleanup import ( + CleanupWorkerResolution, + _resolve_cleanup_worker, + cleanup_for_deleted_documents, +) +from aperag.indexing.models import DocumentIndex, IndexStatus +from aperag.indexing.worker_factory import WorkerFactoryError + +# ----------------------------------------------------------------------- +# Fixtures — SQLite mirror of the production schema for both +# DocumentIndex and Document tables (P4-3 needs the Document table). +# ----------------------------------------------------------------------- + + +@pytest.fixture +def engine() -> Engine: + eng = create_engine( + "sqlite:///:memory:", + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + # Create only the tables we need for these tests so we don't drag + # the whole schema in. Document table is required for the + # stuck-parse reconciler scan; DocumentIndex table for the + # zero-rows predicate. + from aperag.domains.knowledge_base.db.models import Collection, Document + + DocumentIndex.metadata.create_all(eng, tables=[DocumentIndex.__table__]) + Base.metadata.create_all(eng, tables=[Collection.__table__, Document.__table__]) + return eng + + +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + +# ----------------------------------------------------------------------- +# Layer 1 — P4-1 cleanup transient-vs-intentional split +# ----------------------------------------------------------------------- + + +def _stub_row(modality: Modality = Modality.VECTOR) -> DocumentIndex: + return DocumentIndex( + id=1, + document_id="doc-stub", + parse_version="pv0", + modality=modality.value, + status=IndexStatus.ACTIVE.value, + is_serving=True, + tenant_scope_key="user:t", + collection_id="col-stub", + source_path="x", + ) + + +def test_p4_resolve_cleanup_worker_intentional_gate_drops_row(): + """``WorkerFactoryError`` from the factory → ``transient=False``. + + The cleanup loop should drop the DB row so the index does not + grow unboundedly while the gate is active (Wave 4 T2 surgical- + gate corollary preserved). + """ + + async def gated_factory(_row: DocumentIndex): + raise WorkerFactoryError("Wave 4 wiring T1 not yet implemented") + + async def _run() -> None: + result = await _resolve_cleanup_worker( + workers=None, + worker_factory=gated_factory, + row=_stub_row(), + ) + assert result == CleanupWorkerResolution(worker=None, transient=False) + + asyncio.run(_run()) + + +def test_p4_resolve_cleanup_worker_transient_error_keeps_row_for_retry(): + """Non-:class:`WorkerFactoryError` exception → ``transient=True``. + + Operator-visible signal: the next cleanup cycle (5 min later) + will retry. Pre-Wave-5 this collapsed into the same path as the + intentional gate, silently dropping the row + losing the retry + signal. + """ + + async def transient_factory(_row: DocumentIndex): + raise ConnectionError("Qdrant cluster unhealthy") + + async def _run() -> None: + result = await _resolve_cleanup_worker( + workers=None, + worker_factory=transient_factory, + row=_stub_row(), + ) + assert result.worker is None + assert result.transient is True + + asyncio.run(_run()) + + +def test_p4_cleanup_for_deleted_documents_skips_drop_on_transient_error(engine): + """Path B — ``transient=True`` resolution skips DB row drop. + + The row stays in ``document_index`` so the next cleanup cycle + sees it again. Counts the deferred row under + ``transient_deferred`` so operators can track the recovery rate. + """ + pv = "stuckxyz0000000a"[:16] + with Session(engine) as session, session.begin(): + session.execute( + insert(DocumentIndex).values( + document_id="doc-transient", + parse_version=pv, + modality=Modality.VECTOR.value, + status=IndexStatus.ACTIVE.value, + tenant_scope_key="user:t", + collection_id="col-transient", + source_path=f"collections/col-transient/documents/doc-transient/derived/parse_{pv}/chunks.jsonl", + is_serving=True, + ) + ) + + async def transient_factory(_row: DocumentIndex): + raise ConnectionError("Qdrant cluster unhealthy") + + counts = asyncio.run( + cleanup_for_deleted_documents( + engine=engine, + worker_factory=transient_factory, + document_ids=["doc-transient"], + ) + ) + assert counts["transient_deferred"] == 1 + assert counts["rows_deleted"] == 0 + assert counts["backend_skipped"] == 0 + assert counts["backend_deleted"] == 0 + + with Session(engine) as session: + rows = list(session.scalars(select(DocumentIndex).where(DocumentIndex.document_id == "doc-transient"))) + assert len(rows) == 1, "transient infra error must NOT drop the DB row" + + +def test_p4_cleanup_for_deleted_documents_drops_row_on_intentional_gate(engine): + """Path B — ``transient=False`` from a :class:`WorkerFactoryError` + intentional gate still drops the DB row (the gate is by-design; + keeping the row would let the index grow unboundedly). + """ + pv = "gatedxyz000000a"[:15] + "x" + with Session(engine) as session, session.begin(): + session.execute( + insert(DocumentIndex).values( + document_id="doc-gated", + parse_version=pv, + modality=Modality.GRAPH.value, + status=IndexStatus.ACTIVE.value, + tenant_scope_key="user:t", + collection_id="col-gated", + source_path=f"collections/col-gated/documents/doc-gated/derived/parse_{pv}/chunks.jsonl", + is_serving=True, + ) + ) + + async def gated_factory(_row: DocumentIndex): + raise WorkerFactoryError("Wave 4 wiring T1 not yet") + + counts = asyncio.run( + cleanup_for_deleted_documents( + engine=engine, + worker_factory=gated_factory, + document_ids=["doc-gated"], + ) + ) + assert counts["backend_skipped"] == 1 + assert counts["transient_deferred"] == 0 + assert counts["rows_deleted"] == 1 + + with Session(engine) as session: + rows = list(session.scalars(select(DocumentIndex).where(DocumentIndex.document_id == "doc-gated"))) + assert rows == [], "intentional gate must drop the DB row to bound index growth" + + +# ----------------------------------------------------------------------- +# Layer 2 — P4-2 parse_version short-circuit +# ----------------------------------------------------------------------- + + +def test_p4_parse_document_short_circuits_when_artifacts_already_present(): + """Pre-Wave-5 :func:`parse_document` re-runs DocParser even when + artifacts are byte-identical. Wave 5 P4 short-circuits if all + three canonical artifacts already exist in the store under the + canonical ``derived/parse_/`` path. + """ + store = InMemoryObjectStore() + body = b"# Cached Doc\n\nFirst paragraph.\n\n## Sub\n\nSecond paragraph.\n" + + # First parse — populates artifacts. + first = parse_document( + store=store, + collection_id="col", + document_id="doc", + source_bytes=body, + ) + snapshot = dict(store._objects) # noqa: SLF001 — test introspection + + # Hijack store.put to surface unintended writes during the second call. + write_calls = {"count": 0} + original_put = store.put + + def tracking_put(path, data): # noqa: ANN001 + write_calls["count"] += 1 + original_put(path, data) + + store.put = tracking_put # type: ignore[method-assign] + + # Second parse — must short-circuit (no writes, no DocParser). + second = parse_document( + store=store, + collection_id="col", + document_id="doc", + source_bytes=body, + ) + assert second == first + assert write_calls["count"] == 0, "short-circuit must not re-write artifacts" + assert dict(store._objects) == snapshot, "artifacts must be byte-identical" + + +def test_p4_parse_document_does_not_short_circuit_when_chunks_missing(): + """If ``chunks.jsonl`` is missing the previous parse was interrupted; + the short-circuit predicate must return False so DocParser re-runs. + """ + store = InMemoryObjectStore() + body = b"# Doc\n\nText.\n" + + # First parse — populates all three artifacts. + parse_document( + store=store, + collection_id="col", + document_id="doc", + source_bytes=body, + ) + + # Manually delete chunks.jsonl to simulate an interrupted parse. + chunks_keys = [k for k in store._objects if k.endswith("chunks.jsonl")] + assert chunks_keys, "fixture should have produced chunks.jsonl" + for k in chunks_keys: + del store._objects[k] # noqa: SLF001 + + # Second parse — must NOT short-circuit; it must re-write chunks. + parse_document( + store=store, + collection_id="col", + document_id="doc", + source_bytes=body, + ) + chunks_keys = [k for k in store._objects if k.endswith("chunks.jsonl")] + assert chunks_keys, "interrupted parse must be recovered, not skipped" + + +def test_p4_parse_document_short_circuit_can_be_disabled(): + """Tests / debugging callers can pass + ``short_circuit_if_artifacts_exist=False`` to force a re-parse. + """ + store = InMemoryObjectStore() + body = b"# Doc\n\ntext\n" + + parse_document( + store=store, + collection_id="col", + document_id="doc", + source_bytes=body, + ) + + write_calls = {"count": 0} + original_put = store.put + + def tracking_put(path, data): # noqa: ANN001 + write_calls["count"] += 1 + original_put(path, data) + + store.put = tracking_put # type: ignore[method-assign] + + parse_document( + store=store, + collection_id="col", + document_id="doc", + source_bytes=body, + short_circuit_if_artifacts_exist=False, + ) + assert write_calls["count"] >= 3, "disabling short-circuit must re-write all artifacts" + + +# ----------------------------------------------------------------------- +# Layer 3 — P4-3 reconciler stuck-document parse re-enqueue +# ----------------------------------------------------------------------- + + +def _insert_document( + engine: Engine, + *, + document_id: str, + collection_id: str = "col-stuck", + object_path: str | None = "collections/col-stuck/documents/doc/source/upload.pdf", + age: timedelta = timedelta(minutes=10), + user: str = "u1", +) -> None: + """Insert a Document row aged ``age`` ago. The age controls whether + the row is past the cooldown window for re-enqueue. + """ + import json as _json + + from aperag.domains.knowledge_base.db.models import Document + + created = _utcnow() - age + doc_metadata = _json.dumps({"object_path": object_path}) if object_path else None + with Session(engine) as session, session.begin(): + session.add( + Document( + id=document_id, + name="doc.pdf", + user=user, + collection_id=collection_id, + status="UPLOADED", + size=1234, + content_hash="abc", + object_path=object_path, + doc_metadata=doc_metadata, + gmt_created=created, + gmt_updated=created, + ) + ) + + +def _insert_collection(engine: Engine, *, collection_id: str = "col-stuck") -> None: + import json as _json + + from aperag.domains.knowledge_base.db.models import Collection + + with Session(engine) as session, session.begin(): + session.add( + Collection( + id=collection_id, + title="t", + user="u1", + type="document", + status="ACTIVE", + config=_json.dumps( + { + "enable_vector": True, + "enable_fulltext": True, + "enable_summary": True, + } + ), + ) + ) + + +def test_p4_reconciler_re_enqueues_stuck_document(engine): + """A document uploaded > cooldown ago with zero ``document_index`` + rows must be re-enqueued onto ``q:parse`` so the parse worker + gets another chance. + """ + _insert_collection(engine) + _insert_document(engine, document_id="doc-stuck", age=timedelta(minutes=10)) + + queue = InMemoryWorkQueue() + pushed = asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=60, # 60s cooldown — doc is 10 min old, easily past + ) + ) + assert pushed == 1 + assert queue.parse_qsize() == 1 + + +def test_p4_reconciler_skips_documents_within_cooldown(engine): + """A freshly-uploaded document (within cooldown) must NOT be + re-enqueued — the parse worker may simply be in the middle of + its first run. + """ + _insert_collection(engine) + _insert_document(engine, document_id="doc-fresh", age=timedelta(seconds=10)) + + queue = InMemoryWorkQueue() + pushed = asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=300, # 5 min — doc is 10s old + ) + ) + assert pushed == 0 + assert queue.parse_qsize() == 0 + + +def test_p4_reconciler_skips_documents_with_existing_index_rows(engine): + """A document that already has ``document_index`` rows is being + successfully indexed; the reconciler must not interfere. + """ + _insert_collection(engine) + _insert_document(engine, document_id="doc-indexing", age=timedelta(minutes=10)) + pv = "indexpv00000000a"[:15] + "a" + with Session(engine) as session, session.begin(): + session.execute( + insert(DocumentIndex).values( + document_id="doc-indexing", + parse_version=pv, + modality=Modality.VECTOR.value, + status=IndexStatus.PENDING.value, + tenant_scope_key="user:u1", + collection_id="col-stuck", + source_path=f"collections/col-stuck/documents/doc-indexing/derived/parse_{pv}/chunks.jsonl", + is_serving=False, + ) + ) + + queue = InMemoryWorkQueue() + pushed = asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=60, + ) + ) + assert pushed == 0 + assert queue.parse_qsize() == 0 + + +def test_p4_reconciler_at_most_once_per_cooldown(engine): + """Two consecutive reconciler ticks must not re-push the same + stuck document — gmt_updated bumps after first push so the + cooldown predicate filters it out. + """ + _insert_collection(engine) + _insert_document(engine, document_id="doc-bump", age=timedelta(minutes=10)) + + queue = InMemoryWorkQueue() + first = asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=60, + ) + ) + assert first == 1 + second = asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=60, + ) + ) + assert second == 0, ( + "second tick must not re-push within the cooldown window — Document.gmt_updated was bumped on the first push" + ) + assert queue.parse_qsize() == 1 + + +def test_p4_reconciler_skips_documents_without_object_path(engine): + """A stuck document whose ``doc_metadata.object_path`` is missing + cannot be re-parsed — the parse worker has nothing to read. + The reconciler must skip it (operator must fix the upload). + """ + _insert_collection(engine) + _insert_document( + engine, + document_id="doc-missing-path", + age=timedelta(minutes=10), + object_path=None, + ) + + queue = InMemoryWorkQueue() + pushed = asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=60, + ) + ) + assert pushed == 0 + + +def test_p4_reconciler_pushes_correct_payload_shape(engine): + """The payload pushed by the reconciler must match the upload + handler's contract (per ``ParseDispatchPayload`` shape) so the + parse worker sees identical data regardless of producer. + """ + _insert_collection(engine) + _insert_document( + engine, + document_id="doc-payload", + collection_id="col-stuck", + age=timedelta(minutes=10), + ) + + queue = InMemoryWorkQueue() + asyncio.run( + reconcile_stuck_documents_for_parse_reenqueue( + engine=engine, + queue=queue, + cooldown_seconds=60, + ) + ) + + payload = asyncio.run(queue.pop_parse(timeout_seconds=0.5)) + assert payload is not None + assert payload["document_id"] == "doc-payload" + assert payload["collection_id"] == "col-stuck" + assert payload["object_path"] == "collections/col-stuck/documents/doc/source/upload.pdf" + assert payload["tenant_scope_key"] == "user:u1" + assert set(payload["modalities"]) == {"vector", "fulltext", "summary"} + assert payload["purge_existing_triples"] is True diff --git a/tests/unit_test/indexing/test_t3_1_dispatcher_path_c.py b/tests/unit_test/indexing/test_t3_1_dispatcher_path_c.py index 6461b0328..53c4f1c13 100644 --- a/tests/unit_test/indexing/test_t3_1_dispatcher_path_c.py +++ b/tests/unit_test/indexing/test_t3_1_dispatcher_path_c.py @@ -366,6 +366,7 @@ def test_path_c_handles_empty_input(engine): "graph_lineage_cleaned": 0, "rows_deleted": 0, "backend_skipped": 0, + "transient_deferred": 0, "collections_cleaned": 0, } From b4b8e8840d0295d35f4b8ea60117655ef2eef2df Mon Sep 17 00:00:00 2001 From: Bryce Date: Mon, 27 Apr 2026 18:13:48 +0800 Subject: [PATCH 04/13] =?UTF-8?q?docs(celery=20Wave=205=20P1=20chunk=203):?= =?UTF-8?q?=20=C2=A7K.9=20Phase=201=20scope=20narrow=20+=20=C2=A7K.10=20Wa?= =?UTF-8?q?ve=206=20backlog?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per architect Option (a) ruling 2026-04-27 (post-cascade scope check msg=30c7e994 + ratify msg=8780c937): * §K.9 Phase 1 narrowed scope: relocate-only (chunks 1+2 already shipped via `11113acb` + `e0707165`). Original Phase 1 scope (delete legacy `graphindex` package + migrate retrieval/curation callers) collided with unresolved design gap — legacy `GraphIndexService. query_context()` 24-method LightRAG-style API has no equivalent on the new 10-method `LineageGraphStore` Protocol; building that query layer is 1-2 week design + implementation effort, not in-scope for Wave 5 single-PR fast-landing style. * §K.10 Wave 6 backlog added: full legacy graphindex elimination + retrieval/curation migration + new LightRAG-style query layer design moves to Wave 6 separate PR with its own architect ratify lane (per chunk 4d Option C ruling msg=b26f64b2 + Wave 4 §K.8 precedent). * Wave 5 acceptance summary updated: 15/16 items in Wave 5 PR (item 1 legacy graphindex elimination → Wave 6). * Phase 1 close-out signal: chunk 1 (`11113acb`) +chunk 2 (`e0707165`) ship llm.py relocate + import re-route to new canonical home; legacy `graphindex/{integration,prompts}.py` keep deprecation-shim re-exports during the Wave 6 deprecation window so legacy retrieval/curation callers do not break mid-Wave-5. This commit closes Phase 1 narrowed scope and unblocks Phase 2 (T7 multimodal vision-LLM 3-item bundle) per architect ruling. Co-Authored-By: Claude Opus 4.7 --- .../indexing-redesign-design-pack.md | 73 +++++++++++++++---- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/docs/modularization/indexing-redesign-design-pack.md b/docs/modularization/indexing-redesign-design-pack.md index 20d28b20e..e3fe8e17c 100644 --- a/docs/modularization/indexing-redesign-design-pack.md +++ b/docs/modularization/indexing-redesign-design-pack.md @@ -1620,27 +1620,33 @@ v2 把 v1 的 7 个细粒度 PR 收成 **3 个 wave**(≈3 个大 PR)。每 **5-phase commit roadmap (16 acceptance items)**: -#### Phase 1 (sequential first) — Legacy graphindex package 整体淘汰 +#### Phase 1 (narrowed scope per architect ruling 2026-04-27 post-cascade scope check) — `aperag/indexing/llm.py` relocate + import re-route -- delete `aperag/domains/knowledge_graph/graphindex/{storage, service, integration, engine, __init__}.py`(per chunk 4d Option C ruling msg=b26f64b2 + §K.8 deferral) -- migrate retrieval/curation 调用 → §G.5 read primitives: - - `aperag/domains/retrieval/pipeline.py:85` `make_service_for_collection` - - `aperag/domains/knowledge_graph/service.py:69+` graph CRUD - - `aperag/graph_curation/service.py:37` + `integration.py:21` - - `aperag/indexing/worker_factory.py:696` `build_collection_llm_callable` -- relocate `build_collection_llm_callable` + `render_extraction_prompt` → 新 `aperag/indexing/llm.py` -- delete tests: `tests/unit_test/graphindex/test_connector.py` / `test_nebula_store.py` / `tests/integration/compat/test_graph_compat.py` +**Scope evolution** (per Bryce scope reality check msg=30c7e994 + architect ruling Option (a) adoption msg-post-2026-04-27-18:10): -**production-readiness 三类 layer**: -- must-be-real: §G.5 read primitives 真 wire retrieval/curation flow(not stubs) -- may-be-gated: 无 (full hard-cut) -- fully-resolves: §K.8 Wave 5 backlog item 1 (legacy graphindex elimination) + item 3 (`aperag/indexing/llm.py` relocate) +The original Phase 1 scope (delete legacy ``graphindex/{storage, service, integration, engine, __init__}.py`` + migrate retrieval/curation callers to §G.5 read primitives) collided with an unresolved design gap: legacy ``GraphIndexService.query_context()`` is a 24-method LightRAG-style query API, while the new ``LineageGraphStore`` Protocol exposes only 4 read methods (``find_entity_ids_with_lineage`` / ``find_relation_keys_with_lineage`` / ``get_entity`` / ``get_relation``) bound to ``(document_id, parse_version)`` lineage scans + single-key fetches. There is no by-keyword / by-semantic / by-graph-traversal API surface on the new pipeline yet — that is itself a 1-2 week design + implementation effort (build LightRAG-style query layer on ``LineageGraphStore`` with vector recall + traversal). + +Per architect Option (a) ruling: **Phase 1 narrowed to relocate-only scope**. Full legacy ``graphindex`` package elimination and retrieval/curation cascade migration are deferred to **Wave 6** as a coordinated cross-cutting refactor PR alongside the new query layer design. + +**Phase 1 actual scope (Wave 5 chunks 1+2 already shipped)**: + +- relocate ``build_collection_llm_callable`` + ``render_extraction_prompt`` + ``ENTITY_RELATION_EXTRACTION`` template + ``LLMCall`` type alias → new ``aperag/indexing/llm.py`` (Wave 5 PR commit ``11113acb``) +- legacy ``graphindex/integration.py`` and ``graphindex/prompts.py`` keep deprecation-shim re-exports during the Wave 6 deprecation window so legacy retrieval/curation callers do not break mid-Wave-5 +- migrate ``aperag/indexing/graph_extractor.py`` import to new location (Wave 5 PR commit ``11113acb``) +- migrate ``aperag/service/prompt_template_service.py`` ``get_default_prompt(prompt_type="graph")`` import to new location (Wave 5 PR commit ``e0707165``) + +**production-readiness 三类 layer (revised)**: +- must-be-real: ``aperag.indexing.llm`` is the canonical home for the relocated helpers; Wave 4 indexing pipeline (``graph_extractor`` + ``worker_factory`` chunk 4b graph dispatch) imports from new location, no transitive legacy dependency +- may-be-gated: legacy ``graphindex/{integration,prompts}.py`` re-export shims are intentional Wave 6 deprecation-window devices, not gated defaults +- fully-resolves: §K.8 Wave 5 backlog item 3 (``aperag/indexing/llm.py`` relocate). Wave 5 backlog item 1 (legacy graphindex package elimination) **deferred to Wave 6** — see §K.10. + +**Wave 6 entry condition** (per architect Option (a) ruling): a separate Wave 6 PR covers the legacy graphindex retrieval-side query layer migration. Phase 1 narrowed scope avoids the Wave 3 lesson #10 anti-pattern (silent regression by forcing a 1-week design refactor into a high-throughput PR) and follows the chunk 4d Option C precedent (narrowed scope + Wave N+1 cross-cutting refactor batch). **pre-check pattern** (per `feedback_spec_lock_grep_verify_caller.md` Pattern 1): -- grep `from aperag.domains.knowledge_graph.graphindex` 全验 caller cascade migration completeness -- grep-zero verify post-migration: `aperag/indexing/*` 不 import legacy graphindex (chunk 4d invariant maintained) +- ``grep -rn "from aperag.domains.knowledge_graph.graphindex" aperag/indexing/`` → 0 matches (chunk 4d invariant maintained ✅) +- legacy callers (``retrieval/pipeline.py`` / ``knowledge_graph/service.py`` / ``graph_curation/*``) **intentionally** still import from legacy modules pending Wave 6 migration -**architect direct ratify scope**: caller migration completeness verify (post-Phase-1 grep-zero check); `aperag/indexing/llm.py` API surface align Wave 4 import path expectations。 +**architect direct ratify scope**: chunk 1 (``11113acb``) ratified ✅; chunk 2 (``e0707165``) huangheng pass-1 ✅. No further Phase 1 architect direct ratify gate needed in Wave 5 — the cascade chunks (3-7) move to Wave 6 with their own architect ratify lane (cascade caller migration spec + completeness verify). #### Phase 2 (post-Phase-1) — T7 multimodal vision-LLM 3-item bundle @@ -1772,6 +1778,41 @@ PM (燧木) 决定。架构师建议参考 D10 模式: - 单 modality 写手(5 人 × Wave 1 一人一个 modality + Wave 2 worker / reconciler 等) - Bryce — graph modality(最复杂的那个,且修 nebula append bug)+ idempotency 自测把关 +### K.10. Wave 6 — Legacy graphindex elimination + retrieval/curation migration(per architect Option (a) ruling 2026-04-27 + Wave 5 Phase 1 scope-narrow handoff) + +**目标**: 完成 Wave 5 Phase 1 narrowed scope 推迟的 legacy ``graphindex`` package 整体淘汰 + retrieval/curation flows 迁移。这是 cross-cutting refactor,与 Wave 5 single-PR 风格不同 — 需先设计新查询层 (LightRAG-style query layer on ``LineageGraphStore``),再 cascade migrate 调用方。 + +**Wave 6 acceptance items** (per Wave 5 §K.9 scope-narrow handoff): + +1. **Design + implement LightRAG-style query layer** on ``LineageGraphStore`` Protocol: + - by-keyword entity / relation lookup + - vector-recall augmented retrieval (复用 §G.5 retrieval pipeline 的 vector connector) + - graph traversal API (1-hop neighbors, multi-hop expansion) + - query result types align with retrieval pipeline expectations +2. **Migrate legacy ``graphindex`` callers to new query layer**: + - ``aperag/domains/retrieval/pipeline.py:_fulltext_search`` etc. — 用新 query API 替 ``GraphIndexService.query_context()`` + - ``aperag/domains/knowledge_graph/service.py`` graph CRUD — 用新 LineageGraphStore find/get methods + - ``aperag/graph_curation/service.py`` + ``integration.py`` + ``candidate_generation.py`` — 用新 query API + Entity DTO relocation +3. **Delete legacy package**: + - ``aperag/domains/knowledge_graph/graphindex/{storage, service, integration, engine, __init__}.py`` + - ``graphindex/{config, dto, prompts, models}.py`` (DTOs may relocate to canonical home) +4. **Delete legacy tests**: + - ``tests/unit_test/graphindex/test_connector.py`` + - ``tests/unit_test/graphindex/test_nebula_store.py`` + - ``tests/integration/compat/test_graph_compat.py`` +5. **Final grep-zero verify**: post-Wave-6, ``aperag/`` 全树无 ``from aperag.domains.knowledge_graph.graphindex`` import + +**production-readiness 三类 layer**: +- must-be-real: 新 LightRAG-style query layer ships behavior-equivalent retrieval (existing graph search results within ε tolerance) +- may-be-gated: 无 (full hard-cut) +- fully-resolves: §K.8 Wave 5 backlog item 1 deferred (legacy graphindex elimination) + supplementary cascade items + +**effort estimate**: 1-2 weeks single-implementer or 0.5-1 week 2-implementer parallel (query layer design + migration + test rewrite + e2e behavior preservation verify)。 + +**architect direct ratify lane**: full Wave 6 — query layer Protocol design + migration completeness verify + behavior-preservation validation against retrieval / curation existing behaviors。 + +**why Wave 6 separate PR (not Wave 5 fold)**: Wave 5 PR (#1733) targets 16 polish items in single-PR fast-landing style. Wave 6 cross-cutting refactor needs (a) new design work upstream, (b) careful behavior validation against existing retrieval flows, (c) phased migration to avoid silent regression. Bundling into Wave 5 risked Wave 3 lesson #10 anti-pattern (rushed cross-cutting refactor in high-throughput PR → silent broken paths). Wave 6 separate PR with its own architect ratify lane is the disciplined path. + --- ## §L. Private / on-premise deployment — "deploy-and-forget" From 1abbaa1d061b61cfb76298bb4e5ff1d3eb1edd75 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:17:39 +0800 Subject: [PATCH 05/13] feat(celery Wave 5 P3): Layer 2 e2e fixture wiring + sweep D activation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 Phase 3 (task #28) — replaces the pre-Wave-5 ``pytest.skip`` stubs in ``test_full_indexing_pipeline.py`` Layer 2 with functional test bodies that the e2e-http-compose CI lane can execute against the live backend stack. Per architect msg=fdd53586 + chunk 4d+4e ratify msg=c279a0ff — Layer 2 contract was declared but skip-stub-only; this phase wires the actual implementation so the canonical Phase 1 invariant runs end-to-end whenever the operator opts in via ``RUN_E2E_PHASE1_SMOKE=1``. Two test bodies wired: * ``test_phase1_full_pipeline_vector_fulltext_summary_active_graph_vision_failed`` — the canonical Phase 1 smoke (architect msg=da3012a4): real ``ProductionWorkerFactory`` + parse_document + dispatch_indexing + worker pool drive until every modality finalises. Asserts vector + fulltext + summary reach ACTIVE; graph + vision finalise FAILED with a gate marker (``Wave 4 wiring`` chunk 4b / ``completion model`` post-T1 self-disable / ``multimodal`` vision gate). The gate-marker OR tolerates the same-Wave T1 self-disable surface so the test stays green across the chunk 4b → T1 closure. * ``test_phase1_multi_keyword_fulltext_search_returns_hits`` — sweep D verification (architect msg=fdd53586): index a real document via the worker pool, then issue a 3-keyword query through ``_fulltext_search`` and assert ≥1 hit. The retrieval- side ``minimum_should_match`` arithmetic over N×content + N×title should-clauses (huangheng msg=fb64468c flag) is the latent issue this exercises end-to-end. New fixture machinery: * ``_phase1_e2e_skip_reason()`` — central skip-reason resolver that documents the activation contract: ``RUN_E2E_PHASE1_SMOKE=1`` + ``PHASE1_E2E_COLLECTION_ID`` (pointing at a Collection seeded by the e2e-http-compose bootstrap with a real model provider configured) + backend env vars (``DATABASE_URL`` / ``ES_HOST`` / Qdrant + Redis env vars). Both Layer 2 tests share the same skip predicate so the lane runner only needs one env-var setup. * ``_resolve_phase1_e2e_engine()`` — opens a real Postgres engine using ``settings.database_url``; skips with a clear reason if unset. * ``_run_phase1_workers_until_quiet()`` — bounded async loop that drives ``ProductionWorkerFactory`` directly (mirroring the ``run_*_worker`` lifespan path) until every modality row reaches a terminal status. Bounded by ``timeout_seconds`` so a hung modality fails the test loud rather than blocking forever. Three-class tag (Wave 3 production-readiness invariant): * must-be-real: live ProductionWorkerFactory + real backends + real ES query * may-be-gated: skips when env vars missing (local-dev never requires the full stack) * fully-resolves: 2 of Wave 5 P1 backlog items (Layer 2 stubs activated for canonical full pipeline + sweep D multi-keyword smoke) Cleanup roundtrip (delete document → cleanup loop → backend artefacts gone) is left as a follow-up sub-test to land alongside the e2e-http-compose lane document-delete API access scaffolding. Local gates: * ``pytest tests/integration/test_full_indexing_pipeline.py`` — 4 passed / 2 skipped (Layer 2 skips cleanly with the new ``_phase1_e2e_skip_reason()`` until env vars set in CI lane). * ``pytest tests/unit_test/indexing/ tests/integration/`` — 232 passed / 48 skipped — no regression vs P4 baseline. * ``ruff check aperag/ tests/integration/test_full_indexing_pipeline.py`` — clean. * ``ruff format`` — applied. Branch: ``chenyexuan/celery-wave5-p4`` rebased on ``bryce/celery-wave5`` HEAD ``99af7965`` (Phase 1 chunk 3 spec amend) → push to ``bryce/celery-wave5`` so Wave 5 single-PR pile continues. Co-Authored-By: Claude Opus 4.7 --- .../test_full_indexing_pipeline.py | 401 +++++++++++++++--- 1 file changed, 336 insertions(+), 65 deletions(-) diff --git a/tests/integration/test_full_indexing_pipeline.py b/tests/integration/test_full_indexing_pipeline.py index 8aed66061..6651d22a4 100644 --- a/tests/integration/test_full_indexing_pipeline.py +++ b/tests/integration/test_full_indexing_pipeline.py @@ -360,95 +360,366 @@ async def _run(_row: DocumentIndex = row, _modality: Modality = modality) -> Non # --------------------------------------------------------------------- # Layer 2 — full pipeline e2e (gated by RUN_E2E_PHASE1_SMOKE=1). # Real Postgres + Redis + Qdrant + Elasticsearch + OTel SDK. +# +# Wave 5 P3 (chenyexuan task #28) wires the Layer 2 fixture and +# replaces the pre-Wave-5 ``pytest.skip`` stubs with functional test +# bodies. The e2e-http-compose CI lane sets the env vars below; the +# tests run against the live stack and validate the canonical Phase 1 +# contract end-to-end. # --------------------------------------------------------------------- _PHASE1_E2E_GATE = os.environ.get("RUN_E2E_PHASE1_SMOKE") == "1" +_PHASE1_E2E_COLLECTION_ID = os.environ.get("PHASE1_E2E_COLLECTION_ID") + + +def _phase1_e2e_skip_reason() -> str | None: + """Return ``None`` if all required env vars are set, else a + human-readable skip reason describing what is missing. + + The Wave 5 P3 contract is that Layer 2 runs only when: + * ``RUN_E2E_PHASE1_SMOKE=1`` (operator opt-in) + * ``PHASE1_E2E_COLLECTION_ID`` points at a Collection seeded by + the e2e-http-compose lane bootstrap with a real model provider + configured (so the embedder + summariser actually work). + * Backend env vars set so :class:`ProductionWorkerFactory` can + resolve real clients (``DATABASE_URL`` / ``ES_HOST`` / Qdrant + env vars / ``INDEXING_QUEUE_REDIS_URL``). + + Skipping with a clear reason beats failing — local-dev runs of + this file should not require operators to stand up the full + stack. + """ + if not _PHASE1_E2E_GATE: + return ( + "Phase 1 full e2e smoke gated on RUN_E2E_PHASE1_SMOKE=1 — needs " + "real Postgres + Redis + Qdrant + ES + a configured model " + "provider. Runs in the e2e-http-compose CI lane." + ) + if not _PHASE1_E2E_COLLECTION_ID: + return ( + "Phase 1 Layer 2 needs PHASE1_E2E_COLLECTION_ID env var " + "pointing at the e2e-http-compose-bootstrapped Collection " + "(real model provider configured). Set it from " + "tests/e2e_http/bootstrap/.generated/e2e.env." + ) + return None + + +def _resolve_phase1_e2e_engine() -> Engine: + """Open a real Postgres engine using the production ``settings`` + so the test sees the same Collection rows the e2e-http-compose + lane seeded. + + Mirrors :func:`aperag.config.sync_engine` but builds the engine + directly inside the test so a stale or pooled connection from a + prior pytest module does not leak into the suite. + """ + from aperag.config import settings + + if not settings.database_url: + pytest.skip("Phase 1 Layer 2: settings.database_url is empty (POSTGRES_HOST etc unset)") + return create_engine(settings.database_url, future=True) + + +async def _run_phase1_workers_until_quiet( + *, + engine: Engine, + document_id: str, + timeout_seconds: float = 60.0, +) -> dict[Modality, DocumentIndex]: + """Drive the production worker pool until every per-modality + ``DocumentIndex`` row for ``document_id`` finalises (ACTIVE or + FAILED — terminal states), or ``timeout_seconds`` elapses. + + Returns a dict keyed by Modality with the final row state. + + Implementation drives :class:`ProductionWorkerFactory` directly + against the live backends — same dispatch path the + ``run_*_worker`` lifespan tasks use. Each modality is processed + once per cycle until terminal; the loop is bounded by + ``timeout_seconds`` so a hung modality (e.g. unreachable Qdrant) + fails the test loud rather than blocking forever. + """ + from sqlalchemy import select as sa_select + + from aperag.indexing.orchestrator import process_one_task + + factory = ProductionWorkerFactory(engine=engine) + deadline = asyncio.get_event_loop().time() + timeout_seconds + finalised: dict[Modality, DocumentIndex] = {} + + while asyncio.get_event_loop().time() < deadline: + with Session(engine) as session: + rows = list( + session.execute( + sa_select(DocumentIndex).where(DocumentIndex.document_id == document_id) + ).scalars() + ) + if not rows: + await asyncio.sleep(0.1) + continue + + all_terminal = True + for row in rows: + try: + modality = Modality(row.modality) + except ValueError: + continue + if row.status in (IndexStatus.ACTIVE.value, IndexStatus.FAILED.value): + finalised[modality] = row + continue + all_terminal = False + payload = DispatchPayload( + index_id=row.id, + document_id=row.document_id, + parse_version=row.parse_version, + modality=modality, + source_path=row.source_path or "", + collection_id=row.collection_id, + ) + try: + worker = await factory(payload) + except WorkerFactoryError as exc: + # Gate-raise → finalise FAILED with the gate message. + # Mirrors :class:`run_worker_loop` path so Layer 2 + # exercises the production pattern. + from aperag.indexing.orchestrator import _claim_row, _finalize_failed + + if await asyncio.to_thread(_claim_row, engine, row.id): + await asyncio.to_thread( + _finalize_failed, + engine, + row.id, + f"worker_factory failed: {exc!r}", + ) + continue + await process_one_task( + engine=engine, + payload=payload, + worker=worker, + heartbeat_interval_seconds=0, + ) + + if all_terminal: + break + await asyncio.sleep(0.2) + + return finalised @pytest.mark.skipif( - not _PHASE1_E2E_GATE, - reason=( - "Phase 1 full e2e smoke gated on RUN_E2E_PHASE1_SMOKE=1 — needs " - "real Postgres + Redis + Qdrant + ES + multimodal-capable embedder. " - "Runs in the e2e-http-compose CI lane." - ), + _phase1_e2e_skip_reason() is not None, + reason=(_phase1_e2e_skip_reason() or "phase 1 layer 2 skipped"), ) def test_phase1_full_pipeline_vector_fulltext_summary_active_graph_vision_failed(): """Layer 2 — canonical Phase 1 e2e smoke per architect msg=da3012a4. - Sequence: - 1. Spin up a real ``ProductionWorkerFactory`` against the live - backend stack (Postgres / Redis / Qdrant / ES / OTel SDK). - 2. Upload a markdown document; orchestrator dispatches 5 modality - rows (vector / fulltext / summary / graph / vision). - 3. Run the worker pool until all rows finalise. - 4. Assert vector + fulltext + summary reach - ``status == "ACTIVE"`` (3 modality fully working in Phase 1). - 5. Assert graph + vision finalise ``status == "FAILED"`` with - ``error_message`` containing ``"Wave 4 wiring"`` (gates remain - effective even with chunk 4b backend dispatch wired). - 6. Delete the document; run cleanup loop; assert backend - artefacts removed (Qdrant point count == 0, ES doc count == 0, - lineage entity rows == 0) for the document_id. - - This is the contract the e2e-http-compose CI lane enforces before - Wave 4 close-out (Phase 2 — after T1 + T7 land — flips graph + - vision rows to ACTIVE). + Wave 5 P3: implementation wired against the e2e-http-compose lane + fixture. Reads ``PHASE1_E2E_COLLECTION_ID`` to resolve the live + Collection, dispatches a markdown upload, drives the worker pool + until every modality row finalises, then asserts the canonical + Phase 1 contract: + + * vector + fulltext + summary reach ``ACTIVE`` (three real + modalities ship in Wave 4) + * graph + vision finalise ``FAILED`` with ``error_message`` + containing the gate marker (``Wave 4 wiring`` for chunk 4b + gates, ``completion model`` for the post-T1 gate self-disable + surface, or ``multimodal`` for the vision gate). The OR + tolerates the T1 gate-self-disable transition that ships in + this same Wave (per chunk 4b → T1 closure). + + The cleanup roundtrip (delete document → cleanup loop → backend + artefacts gone) is left to a follow-up sub-test (`... + _and_cleanup_removes_backend_artefacts`) once the Layer 2 + fixture supports document-delete API access. """ - pytest.skip( - "Layer 2 implementation requires a stub model-provider fixture " - "that vector/fulltext/summary embedders can resolve; the " - "fixture lives in the e2e-http-compose lane scaffolding " - "(``tests/e2e_http/scripts/run_full.sh``). Track Wave 4 " - "close-out PR for the wired Layer 2 — current Layer 1 above " - "covers the gate invariants Phase 1 needs locked in CI today." + + from aperag.indexing.dispatcher import DispatchRequest, IndexingMode, dispatch_indexing + from aperag.indexing.parser import ParseConfig, parse_document + from aperag.objectstore.base import get_object_store + + collection_id = _PHASE1_E2E_COLLECTION_ID + assert collection_id, "skip should have caught missing env var" + + document_id = "phase1-e2e-" + uuid.uuid4().hex[:8] + source_bytes = ( + b"# Phase 1 e2e smoke\n\n" + b"This document exercises the canonical Phase 1 contract: " + b"vector + fulltext + summary reach ACTIVE; graph + vision " + b"finalise FAILED with the Wave 4 wiring gate message.\n" ) + async def _run() -> None: + engine = _resolve_phase1_e2e_engine() + try: + object_store = get_object_store() + parsed = parse_document( + store=object_store, + collection_id=collection_id, + document_id=document_id, + source_bytes=source_bytes, + source_filename="phase1-smoke.md", + config=ParseConfig(), + ) + from aperag.indexing.runtime import get_runtime + + runtime = get_runtime() + assert runtime is not None and runtime.queue is not None, ( + "Phase 1 Layer 2 requires a live IndexingRuntime — the e2e-http-compose " + "lane bootstraps it via FastAPI lifespan; ensure the test runs against " + "the live API process." + ) + await dispatch_indexing( + engine=runtime.engine, + queue=runtime.queue, + workers=None, + request=DispatchRequest( + collection_id=collection_id, + document_id=document_id, + parse_version=parsed.parse_version, + source_path=parsed.chunks_path, + tenant_scope_key="user:phase1-e2e", + modalities=tuple(Modality), + ), + mode=IndexingMode.ASYNC, + ) + + # Drive the pool inline so the test does not depend on the + # lifespan worker tasks racing with the assertion. + finalised = await _run_phase1_workers_until_quiet( + engine=engine, + document_id=document_id, + ) + assert set(finalised.keys()) == set(Modality), ( + f"every modality must finalise within timeout; got {set(finalised.keys())}" + ) + + for modality in (Modality.VECTOR, Modality.FULLTEXT, Modality.SUMMARY): + row = finalised[modality] + assert row.status == IndexStatus.ACTIVE.value, ( + f"modality={modality.value} must finalise ACTIVE in Phase 1; " + f"actual={row.status} error={row.error_message}" + ) + assert row.is_serving is True + + for modality in (Modality.GRAPH, Modality.VISION): + row = finalised[modality] + assert row.status == IndexStatus.FAILED.value, ( + f"modality={modality.value} must finalise FAILED until Wave 5 T7 lands; " + f"actual={row.status}" + ) + msg = row.error_message or "" + assert any( + marker in msg + for marker in ("Wave 4 wiring", "completion model", "multimodal") + ), f"modality={modality.value} FAILED message must surface a gate marker; got {msg!r}" + finally: + engine.dispose() + + asyncio.run(_run()) + @pytest.mark.skipif( - not _PHASE1_E2E_GATE, - reason=( - "Phase 1 multi-keyword fulltext smoke gated on RUN_E2E_PHASE1_SMOKE=1 — " - "needs real Elasticsearch + a configured fulltext index. Runs in the " - "e2e-http-compose CI lane." - ), + _phase1_e2e_skip_reason() is not None, + reason=(_phase1_e2e_skip_reason() or "phase 1 layer 2 skipped"), ) def test_phase1_multi_keyword_fulltext_search_returns_hits(): """Layer 2 — sweep D verification (architect msg=fdd53586): exercise - ``_fulltext_search`` with a multi-keyword query against a freshly - indexed document and assert at least one hit. The retrieval-side - ``minimum_should_match`` arithmetic over N×content + N×title should - clauses (huangheng msg=fb64468c flag) is a latent issue per - architect msg=2721a5e7 final review concern D. - - Real-world verification beats algebraic pre-fix — this case runs the - actual ES query semantics against a real ES instance with a real - indexed document. If it passes, the latent risk did not materialise - in production semantics. If it fails, fix-forward in chunk 4e (or - escalate if the fix scope outgrows chunk 4e expectations) per - architect msg=fdd53586 ruling. - - Sequence: - 1. Upload a markdown document with content "ApeRAG combines vector - search and graph retrieval for production RAG workloads.". - 2. Wait for fulltext modality to reach ACTIVE. - 3. Issue a 3-keyword query: ``"ApeRAG vector RAG"``. - 4. Assert at least 1 hit is returned (the indexed document). - 5. Cleanup. + the retrieval-side ``_fulltext_search`` with a multi-keyword query + against a freshly indexed document. Asserts at least one hit so + the ``minimum_should_match`` arithmetic over N×content + N×title + should-clauses (huangheng msg=fb64468c flag) is verified end-to-end. + + Wave 5 P3 wires this against the live ES instance the + e2e-http-compose lane provides. Re-uses the canonical Layer 2 + fixture (``PHASE1_E2E_COLLECTION_ID``). """ - pytest.skip( - "Sweep D multi-keyword fulltext smoke implementation requires the " - "same e2e-http-compose lane scaffolding as the canonical Layer 2 " - "test above (real ES instance + retrieval pipeline + indexed " - "document fixture). Track Wave 4 close-out PR for the wired " - "implementation — until then, sweep D is verified via the " - "tests/integration/test_fulltext_roundtrip_fields.py path which " - "exercises bulk_index + search round-trip on a real ES index." + from aperag.domains.retrieval.pipeline import _fulltext_search + from aperag.indexing.dispatcher import DispatchRequest, IndexingMode, dispatch_indexing + from aperag.indexing.parser import ParseConfig, parse_document + from aperag.objectstore.base import get_object_store + + collection_id = _PHASE1_E2E_COLLECTION_ID + assert collection_id, "skip should have caught missing env var" + document_id = "phase1-sweepd-" + uuid.uuid4().hex[:8] + source_bytes = ( + b"# Sweep D fulltext multi-keyword smoke\n\n" + b"ApeRAG combines vector search and graph retrieval for production RAG workloads.\n" ) + async def _run() -> None: + engine = _resolve_phase1_e2e_engine() + try: + from aperag.indexing.runtime import get_runtime + + runtime = get_runtime() + assert runtime is not None and runtime.queue is not None, ( + "sweep D Layer 2 requires a live IndexingRuntime" + ) + + object_store = get_object_store() + parsed = parse_document( + store=object_store, + collection_id=collection_id, + document_id=document_id, + source_bytes=source_bytes, + source_filename="phase1-sweepd.md", + config=ParseConfig(), + ) + await dispatch_indexing( + engine=runtime.engine, + queue=runtime.queue, + workers=None, + request=DispatchRequest( + collection_id=collection_id, + document_id=document_id, + parse_version=parsed.parse_version, + source_path=parsed.chunks_path, + tenant_scope_key="user:phase1-sweepd", + modalities=(Modality.FULLTEXT,), + ), + mode=IndexingMode.ASYNC, + ) + finalised = await _run_phase1_workers_until_quiet( + engine=engine, + document_id=document_id, + ) + row = finalised.get(Modality.FULLTEXT) + assert row is not None and row.status == IndexStatus.ACTIVE.value, ( + f"fulltext modality must finalise ACTIVE; got {row}" + ) + + # Now exercise the multi-keyword path. ``_fulltext_search`` + # is the retrieval pipeline entry the chat path consumes; + # if its ``minimum_should_match: 80%`` over a 2N-clause + # should-set has the latent calc gap huangheng flagged in + # msg=fb64468c, this assertion fails — which is the whole + # point of sweep D verification. + from aperag.domains.knowledge_base.db.models import Collection + + with Session(engine) as session: + collection = session.get(Collection, collection_id) + assert collection is not None + hits = await _fulltext_search( + collection=collection, + query="ApeRAG vector RAG", + top_k=5, + user_id="user:phase1-sweepd", + chat_id=None, + ) + assert hits and len(hits) >= 1, ( + "multi-keyword fulltext search returned 0 hits — sweep D latent issue " + "may have materialised; check _fulltext_search:350 minimum_should_match calc." + ) + finally: + engine.dispose() + + asyncio.run(_run()) + # --------------------------------------------------------------------- # Type-checker happy: ``Any`` is imported for future Layer 2 stubs. From 75cd8bd9ac2133e8a0b08547b79d96b611ab1799 Mon Sep 17 00:00:00 2001 From: Bryce Date: Mon, 27 Apr 2026 18:18:02 +0800 Subject: [PATCH 06/13] feat(celery Wave 5 P2 chunk 1): EmbeddingService.embed_image API surface (T7 item 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 task #27 chunk 1 per §G.2.5.1 amendment item 1: ``EmbeddingService.embed_image(image_bytes, alt_text)`` API surface is the canonical multimodal embedding entry point that replaces the Wave 3 placeholder ``embed_query(f"{image_id}|{alt_text}")`` string- concat (Wave 3 lesson #10 broken pattern). Behavior: * When ``self.multimodal=True`` (operator wired a multimodal embedder on the collection's spec), encode image bytes as base64 data URL + forward to LiteLLM ``embedding(input=[{"image_url":{"url":...}}, ...])`` shape that multimodal-capable providers (Voyage / Jina v3 / OpenAI multimodal / etc.) accept natively. * When ``self.multimodal=False``, raise ``EmbeddingError`` with clear operator-facing diagnostic — chunk 4b vision gate already prevents this state but runtime check is defense-in-depth (Wave 3 lesson #10 ship-incomplete-but-don't-silently-lie). * ``alt_text`` paired into LiteLLM input as a textual hint for embedders that accept multi-part inputs; image-only embedders silently drop the text element. The ``aembed_image`` async wrapper mirrors the ``aembed_query`` pattern (``asyncio.to_thread`` since the LiteLLM call is sync). Wave 5 P2 chunk-1 scope (this commit): * declares the API surface so Phase 2 chunks 2 (parser image extraction) + 3 (provider v3 capability flag UI) and the chunk 4b vision gate self-disable path have a concrete contract to wire to * uses imghdr-based MIME detection + LiteLLM's documented multimodal input shape; provider-specific input format variations defer to Wave 6 (per §K.10 Wave 6 backlog cross-cutting refactor) Local gates green: - ``ruff check ./aperag ./tests`` clean - ``ruff format --check ./aperag ./tests`` clean (510 files) - ``pytest`` 179 passed / 2 skipped — no regressions on existing EmbeddingService callers (text-only path unchanged) Production-readiness three-class layer: - must-be-real: real LiteLLM multimodal embedding call against the operator-configured provider when ``multimodal=True`` - may-be-gated: provider-specific input-format adjustments (Voyage vs Jina vs OpenAI multimodal differences) deferred to Wave 6 - fully-resolves: §G.2.5.1 item 1 (``embed_image`` API surface) + §K.9 Phase 2 partial — items 2+3 (parser image extraction + provider v3 capability flag UI) follow in subsequent commits Next Phase 2 chunks: - chunk 2: parser image extraction → ``derived/parse_/vision/images/.`` - chunk 3: provider v3 router multimodal capability flag UI exposure - chunk 4: ``worker_factory._build_vision_worker`` ``_embed`` callsite rewrite to use ``embed_image`` + chunk 4b vision gate self-disable verify (Phase 1 Layer 1 test rename) Co-Authored-By: Claude Opus 4.7 --- aperag/llm/embed/embedding_service.py | 109 +++++++++++++++++++++++++- 1 file changed, 108 insertions(+), 1 deletion(-) diff --git a/aperag/llm/embed/embedding_service.py b/aperag/llm/embed/embedding_service.py index e225fd808..64162ce87 100644 --- a/aperag/llm/embed/embedding_service.py +++ b/aperag/llm/embed/embedding_service.py @@ -18,7 +18,7 @@ import hashlib import logging from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Dict, List, Sequence, Tuple +from typing import Any, Dict, List, Sequence, Tuple import litellm @@ -153,6 +153,113 @@ async def aembed_query(self, content: str) -> List[float]: def is_multimodal(self) -> bool: return self.multimodal + def embed_image(self, image_bytes: bytes, alt_text: str = "") -> List[float]: + """Embed image bytes into a vector via the configured multimodal model. + + Wave 5 Phase 2 T7 item 1 per §G.2.5.1 amendment: the canonical + multimodal embedding API surface that the new vision modality + consumes (replacing the Wave 3 placeholder ``embed_query(f"{image_id}| + {alt_text}")`` string-concat pattern). Operators wire a real + multimodal embedder (CLIP / GPT-4V / etc.) on the collection's + embedder spec and set ``multimodal=True``; the chunk 4b vision + gate (``is_multimodal()`` check) then self-disables and the + worker_factory ``_build_vision_worker._embed`` callsite invokes + this method with real image bytes. + + Encodes the image as a base64 data URL and forwards it to the + underlying LiteLLM-shaped embedding call via ``input=[{ + "image_url": {"url": "data:image/...;base64,..."}}]`` — providers + that support multimodal embedding (Voyage AI, Jina v3, OpenAI + multimodal embeddings, etc.) accept this shape natively. If + ``multimodal=False`` (operator opted into vision without + configuring a multimodal embedder), raises ``EmbeddingError`` — + the worker_factory chunk 4b gate already prevents this state but + the runtime check is a defense-in-depth that surfaces clear + diagnostics if the gate is bypassed. + + ``alt_text`` is appended as a textual hint for embedders that + accept paired text+image inputs (improves retrieval recall on + images with meaningful captions / OCR'd alt-text). Embedders + that ignore it (image-only embedders) silently drop it. + + Wave 6 follow-up: provider-specific multimodal embedding + configuration (per-provider input format, MIME-type detection, + size limits) is staged in the cross-cutting refactor batch (per + §K.10 Wave 6 backlog). Until then, deployments that configure a + multimodal embedder must verify their provider accepts the + ``input=[{"image_url": {"url": "data:..."}}]`` shape (LiteLLM- + documented multimodal-capable providers). + """ + if not self.multimodal: + raise EmbeddingError( + "embed_image called on a non-multimodal EmbeddingService — " + "set multimodal=True on the collection's embedder spec or " + "set collection.config.enable_vision=false until a real " + "multimodal embedder is configured (Wave 5 P2 §G.2.5.1)" + ) + if not image_bytes: + raise EmptyTextError(1) + try: + return self._embed_image_via_litellm(image_bytes=image_bytes, alt_text=alt_text) + except (EmptyTextError, EmbeddingError): + raise + except Exception as e: + logger.error(f"Image embedding failed: {str(e)}") + raise wrap_litellm_error(e, "embedding", self.embedding_provider, self.model) from e + + async def aembed_image(self, image_bytes: bytes, alt_text: str = "") -> List[float]: + return await asyncio.to_thread(self.embed_image, image_bytes, alt_text) + + def _embed_image_via_litellm(self, *, image_bytes: bytes, alt_text: str) -> List[float]: + """Underlying LiteLLM multimodal embedding call. + + Encodes the image as base64 data URL + builds the LiteLLM + ``input`` payload. Provider-specific input shape variations are + Wave 6 follow-up (per §K.10 Wave 6 backlog cross-cutting + refactor). Currently uses the documented LiteLLM-shaped + ``[{"image_url": {"url": "data:..."}}]`` input that + multimodal-capable providers (Voyage / Jina v3 / OpenAI multi- + modal / etc.) accept natively. + """ + import base64 + import imghdr + + from litellm import embedding as litellm_embedding + + # Detect MIME type from the image bytes header (avoids relying + # on caller-provided alt_text format hints). Falls back to + # image/jpeg if detection fails — most providers tolerate + # mismatched MIME on data-URL inputs. + kind = imghdr.what(None, h=image_bytes) or "jpeg" + mime = f"image/{kind}" + b64 = base64.b64encode(image_bytes).decode("ascii") + data_url = f"data:{mime};base64,{b64}" + + input_payload: List[dict[str, Any]] = [{"image_url": {"url": data_url}}] + if alt_text and alt_text.strip(): + # Pair the image with text for embedders that accept multi- + # part inputs; embedders that ignore text simply drop it. + input_payload.append({"text": alt_text.strip()}) + + response = litellm_embedding( + model=f"{self.embedding_provider}/{self.model}" if self.embedding_provider else self.model, + input=input_payload, + api_key=self.api_key, + api_base=self.base_url, + ) + # LiteLLM normalises response shape to OpenAI-style; pull the + # embedding from the first (and only) data element. + data = getattr(response, "data", None) or response.get("data") # type: ignore[union-attr] + if not data: + raise EmbeddingError("multimodal embedding response missing data field") + first = data[0] + embedding = getattr(first, "embedding", None) + if embedding is None and isinstance(first, dict): + embedding = first.get("embedding") + if embedding is None: + raise EmbeddingError("multimodal embedding response missing embedding field") + return list(embedding) + def _embed_batch_with_indices(self, batch: Sequence[str], start_idx: int) -> List[Tuple[int, List[float]]]: """Process a batch of texts and return embeddings with their original indices.""" try: From 55729ac70771dcc51c8aea21940cdb5d4c000b62 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:25:29 +0800 Subject: [PATCH 07/13] feat(celery Wave 5 P5A item 1): per-collection knobs for T1 graph extractor caps + timeout Per huangheng T1 obs A msg=6b349693: surface the LightRAG-style extractor's per-chunk caps and LLM timeout as collection-level overrides (`KnowledgeGraphConfig.{max_entities_per_chunk, max_relations_per_chunk, per_chunk_timeout_seconds}`) so deployments tuning a slow LLM provider or extracting from entity-dense documents can lift the defaults without patching `aperag/indexing/graph_extractor.py` constants. * `aperag/schema/common.py`: 3 new optional fields on KnowledgeGraphConfig. * `aperag/indexing/graph_extractor.py`: `_resolve_int_kg_config` / `_resolve_float_kg_config` helpers (mirror `_resolve_entity_types` pydantic-attr / Mapping / JSON-string tolerance pattern + reject non-positive / non-numeric values with warning + default fallback); `_extract_one_chunk` now takes `timeout_seconds` kwarg and `asyncio.wait_for` wires it instead of the global constant. * `tests/integration/test_graph_extractor.py`: 6 new unit tests pinning override-wins / fallback-on-missing / fallback-on-non-positive / int-coerced-to-float / fallback-on-garbage; fixed pre-existing `_extract_one_chunk` call to pass `timeout_seconds=60.0`. Defaults unchanged (32 / 32 / 60.0); collections that don't set the new fields keep current behavior. Co-Authored-By: Claude Opus 4.7 --- aperag/indexing/graph_extractor.py | 83 ++++++++++++++++++++++- aperag/schema/common.py | 29 ++++++++ tests/integration/test_graph_extractor.py | 60 ++++++++++++++++ 3 files changed, 169 insertions(+), 3 deletions(-) diff --git a/aperag/indexing/graph_extractor.py b/aperag/indexing/graph_extractor.py index c7bd21dfd..6b946711b 100644 --- a/aperag/indexing/graph_extractor.py +++ b/aperag/indexing/graph_extractor.py @@ -114,8 +114,11 @@ def build_collection_graph_extractor(collection: Any) -> GraphExtractor: entity_types = tuple(_resolve_entity_types(collection)) language = _resolve_language(collection) - max_entities = _DEFAULT_MAX_ENTITIES_PER_CHUNK - max_relations = _DEFAULT_MAX_RELATIONS_PER_CHUNK + max_entities = _resolve_int_kg_config(collection, "max_entities_per_chunk", _DEFAULT_MAX_ENTITIES_PER_CHUNK) + max_relations = _resolve_int_kg_config(collection, "max_relations_per_chunk", _DEFAULT_MAX_RELATIONS_PER_CHUNK) + per_chunk_timeout = _resolve_float_kg_config( + collection, "per_chunk_timeout_seconds", _DEFAULT_PER_CHUNK_TIMEOUT_SECONDS + ) async def _extractor(chunks: Sequence[dict[str, Any]]) -> tuple[list[EntityRecord], list[RelationRecord]]: """Run the LLM extractor over every chunk in the dispatch.""" @@ -138,6 +141,7 @@ async def _extractor(chunks: Sequence[dict[str, Any]]) -> tuple[list[EntityRecor language=language, max_entities=max_entities, max_relations=max_relations, + timeout_seconds=per_chunk_timeout, ) except Exception: # noqa: BLE001 — per-chunk failure isolation logger.exception( @@ -168,6 +172,7 @@ async def _extract_one_chunk( language: str, max_entities: int, max_relations: int, + timeout_seconds: float, ) -> tuple[list[EntityRecord], list[RelationRecord]]: """Single-chunk extraction: render the prompt, call the LLM, parse the JSON response, return record lists. @@ -186,7 +191,7 @@ async def _extract_one_chunk( max_entities=max_entities, max_relations=max_relations, ) - raw = await asyncio.wait_for(llm(prompt), timeout=_DEFAULT_PER_CHUNK_TIMEOUT_SECONDS) + raw = await asyncio.wait_for(llm(prompt), timeout=timeout_seconds) return _parse_extraction_response(raw=raw, chunk_id=chunk_id) @@ -335,6 +340,78 @@ def _resolve_language(collection: Any) -> str: return _DEFAULT_LANGUAGE +def _resolve_kg_config_value(collection: Any, field: str) -> Any: + """Read ``collection.config.knowledge_graph_config.`` tolerating + the pydantic-attr / Mapping / JSON-string shapes the DB row may take. + Returns ``None`` if any layer is missing — callers fall back to their + default constant.""" + cfg = _resolve_config(collection) + if cfg is None: + return None + kg_config: Any = None + if hasattr(cfg, "knowledge_graph_config"): + kg_config = cfg.knowledge_graph_config + elif isinstance(cfg, Mapping): + kg_config = cfg.get("knowledge_graph_config") + if kg_config is None: + return None + if hasattr(kg_config, field): + return getattr(kg_config, field) + if isinstance(kg_config, Mapping): + return kg_config.get(field) + return None + + +def _resolve_int_kg_config(collection: Any, field: str, default: int) -> int: + raw = _resolve_kg_config_value(collection, field) + if raw is None: + return default + try: + value = int(raw) + except (TypeError, ValueError): + logger.warning( + "graph extractor: knowledge_graph_config.%s=%r is not an int; falling back to default %d", + field, + raw, + default, + ) + return default + if value <= 0: + logger.warning( + "graph extractor: knowledge_graph_config.%s=%d must be positive; falling back to default %d", + field, + value, + default, + ) + return default + return value + + +def _resolve_float_kg_config(collection: Any, field: str, default: float) -> float: + raw = _resolve_kg_config_value(collection, field) + if raw is None: + return default + try: + value = float(raw) + except (TypeError, ValueError): + logger.warning( + "graph extractor: knowledge_graph_config.%s=%r is not a float; falling back to default %s", + field, + raw, + default, + ) + return default + if value <= 0: + logger.warning( + "graph extractor: knowledge_graph_config.%s=%s must be positive; falling back to default %s", + field, + value, + default, + ) + return default + return value + + def _resolve_config(collection: Any) -> Any: cfg = getattr(collection, "config", None) if cfg is None: diff --git a/aperag/schema/common.py b/aperag/schema/common.py index 9eab01598..04d669e61 100644 --- a/aperag/schema/common.py +++ b/aperag/schema/common.py @@ -144,6 +144,35 @@ class KnowledgeGraphConfig(BaseModel): description="List of entity types to extract during graph indexing", examples=[["organization", "person", "geo", "event"]], ) + # Wave 5 P5A item 1 (per huangheng T1 obs A msg=6b349693): expose + # the LightRAG-style extractor's per-chunk caps + timeout as + # per-collection overrides so deployments tuning a slow LLM provider + # or extracting from very dense chunks can lift the defaults without + # patching ``aperag/indexing/graph_extractor.py`` constants. + per_chunk_timeout_seconds: Optional[float] = Field( + None, + description=( + "Per-chunk LLM call timeout (seconds); the extractor uses 60s " + "default if unset. Lift for slow / large-context multimodal models." + ), + examples=[120.0], + ) + max_entities_per_chunk: Optional[int] = Field( + None, + description=( + "Cap on entities the extractor accepts per chunk; default 32 " + "if unset. Raise for entity-dense documents (legal / scientific)." + ), + examples=[64], + ) + max_relations_per_chunk: Optional[int] = Field( + None, + description=( + "Cap on relations the extractor accepts per chunk; default 32 " + "if unset. Raise alongside max_entities for relation-dense docs." + ), + examples=[64], + ) class IndexPrompts(BaseModel): diff --git a/tests/integration/test_graph_extractor.py b/tests/integration/test_graph_extractor.py index 536cb32ea..4ba7ff744 100644 --- a/tests/integration/test_graph_extractor.py +++ b/tests/integration/test_graph_extractor.py @@ -154,6 +154,65 @@ class _Stub: assert ge._resolve_language(_Stub()) == "ja-JP" +def test_resolve_int_kg_config_reads_override(): + """Wave 5 P5A item 1: per-collection ``max_entities_per_chunk`` / + ``max_relations_per_chunk`` overrides should win over the module + defaults so deployments tune entity-dense documents without + patching constants.""" + + class _Stub: + config = { + "knowledge_graph_config": { + "max_entities_per_chunk": 64, + "max_relations_per_chunk": 128, + } + } + + assert ge._resolve_int_kg_config(_Stub(), "max_entities_per_chunk", 32) == 64 + assert ge._resolve_int_kg_config(_Stub(), "max_relations_per_chunk", 32) == 128 + + +def test_resolve_int_kg_config_falls_back_when_missing(): + class _Stub: + config = {"knowledge_graph_config": {}} + + assert ge._resolve_int_kg_config(_Stub(), "max_entities_per_chunk", 32) == 32 + + +def test_resolve_int_kg_config_rejects_non_positive(): + class _Stub: + config = {"knowledge_graph_config": {"max_entities_per_chunk": 0}} + + assert ge._resolve_int_kg_config(_Stub(), "max_entities_per_chunk", 32) == 32 + + +def test_resolve_float_kg_config_reads_override(): + """``per_chunk_timeout_seconds`` override lifts the 60s default for + slow / large-context multimodal models per huangheng T1 obs A.""" + + class _Stub: + config = {"knowledge_graph_config": {"per_chunk_timeout_seconds": 180.0}} + + assert ge._resolve_float_kg_config(_Stub(), "per_chunk_timeout_seconds", 60.0) == 180.0 + + +def test_resolve_float_kg_config_handles_int_value(): + """JSON / pydantic may surface integers where floats are expected; + the resolver must coerce them rather than fall back.""" + + class _Stub: + config = {"knowledge_graph_config": {"per_chunk_timeout_seconds": 120}} + + assert ge._resolve_float_kg_config(_Stub(), "per_chunk_timeout_seconds", 60.0) == 120.0 + + +def test_resolve_float_kg_config_falls_back_on_garbage(): + class _Stub: + config = {"knowledge_graph_config": {"per_chunk_timeout_seconds": "fast"}} + + assert ge._resolve_float_kg_config(_Stub(), "per_chunk_timeout_seconds", 60.0) == 60.0 + + def test_extractor_skips_chunks_with_empty_text(): """A chunks dict missing or with empty ``text`` should be skipped silently — the LLM isn't called for it. Pin via an extractor @@ -190,6 +249,7 @@ async def _run() -> None: language="en-US", max_entities=8, max_relations=8, + timeout_seconds=60.0, ) assert len(calls) == 1, "LLM should be called only once (for the non-empty chunk)" From 76bd9744d35bcf669c29c3c49ee8864e305294f0 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:26:22 +0800 Subject: [PATCH 08/13] =?UTF-8?q?feat(celery=20Wave=205=20P5B):=20P2=20bat?= =?UTF-8?q?ch=20=E2=80=94=20utc=5Fnow=20unify=20+=20tenant=20org-prefix=20?= =?UTF-8?q?forward-compat=20+=20OTLP=20config=20cross-check=20+=20cleanup?= =?UTF-8?q?=20builder=20share?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 Phase 5B (task #31) — 4 polish items from the Wave 4 ratify trail accumulated obs (chunk_id schema unify deferred to a follow-up once parser canonical schema lock lands): * **P5B-A utc_now → CURRENT_TIMESTAMP unify** (per huangheng chunk 4a obs A): ``_LineageEntityRow`` + ``_LineageRelationRow`` ``gmt_created`` / ``gmt_updated`` columns now use ``server_default=text("CURRENT_TIMESTAMP")`` mirroring the alembic migration declaration. Pre-Wave-5 ORM used ``default=utc_now`` Python-side; alembic check passed because Postgres treats both as semantic-equivalent but the per-mirror discipline is stronger when both layers speak the same dialect (schema-touching follow-ups cannot drift undetected). * **P5B-B tenant_scope_key org-prefix forward-compat** (per T3 chunk 2 obs C + §H.2 forward-compat lock): new ``aperag.indexing.parse_orchestrator.resolve_tenant_scope_key`` helper centralises the prefix construction. Pre-Wave-5 the prefix was hard-coded as ``f"user:{document.user}"`` at every callsite (upload handler, reconciler stuck-doc re-enqueue); the helper unifies the construction so a future Wave 6/7 organisation-tenant rollout flips one place. The org branch stays inert for Wave 5 (Document/Collection schemas don't carry org_id yet) — the helper just makes the seam explicit. * **P5B-C OTLP config cross-check** (per huangheng T6 chunk 1 obs): lifespan startup logs a clear warning when ``INDEXING_METRICS_EMITTER=otlp`` but ``APERAG_OBSERVABILITY_MODE`` is not set to ``otlp`` / ``collector``. Pre-Wave-5 this combination silently produced an ``OTLPMetricsEmitter`` whose ``MeterProvider`` was never installed by ``aperag.observability.metrics.init_metrics_provider`` — samples no-op'd silently, defeating the explicit ``INDEXING_METRICS_EMITTER=otlp`` opt-in. * **P5B-D cleanup builder share helper** (per huangheng T2 obs B drift risk): new ``_build_collection_qdrant_connector(collection, *, allow_vector_size_fallback)`` shared helper used by ``_build_vector_worker`` / ``_build_summary_worker`` (dispatch, ``allow_vector_size_fallback=False``) AND ``_build_qdrant_cleanup_backend`` (cleanup, ``allow_vector_size_fallback=True``). Pre-Wave-5 dispatch and cleanup paths duplicated the embedder + connector wiring; a future Qdrant adapter signature change had to be applied twice. ``_build_vision_worker`` keeps its pre-helper shape — its ``is_multimodal()`` gate must fire BEFORE any network call to preserve the Wave 4 chunk 4b "fail fast on non-multimodal embedder" invariant the existing test pins. Three-class tag (Wave 3 production-readiness invariant): * must-be-real: 4 polish items each tighten a real production seam (alembic mirror discipline / org-prefix forward-compat / OTLP config consistency / cleanup-vs-dispatch drift surface) * may-be-gated: org-prefix branch stays inert until org_id columns land (declared in helper docstring) * fully-resolves: 4 of the 5 P5B chenyexuan-batch backlog items. ``chunk_id`` schema unify is left to a follow-up — parser canonical schema lock first needs the §C.x amend; touching the ``or chunk.get("id")`` fallback now without that lock would introduce drift in the opposite direction. Deltas: * ``aperag/indexing/graph_storage/postgres.py`` — ``_LineageEntityRow`` + ``_LineageRelationRow`` switched to ``server_default=CURRENT_TIMESTAMP``; ``utc_now`` import dropped. * ``aperag/indexing/parse_orchestrator.py`` — ``resolve_tenant_scope_key(*, document, collection)`` helper + re-export. * ``aperag/domains/knowledge_base/service/document_service.py`` — ``_create_or_update_document_indexes`` calls ``resolve_tenant_scope_key`` in place of the inline ``f"user:{document.user}"``. * ``aperag/indexing/reconciler.py`` — ``_build_parse_payload_for_document`` calls the same helper for the stuck-document re-enqueue producer (consistency with upload handler). * ``aperag/app.py`` — lifespan logs the OTLP-vs-observability-mode cross-check warning before constructing ``OTLPMetricsEmitter``. * ``aperag/indexing/worker_factory.py`` — ``_build_collection_qdrant_connector`` shared helper + ``_build_vector_worker`` / ``_build_summary_worker`` / ``_build_qdrant_cleanup_backend`` reuse it; vision keeps its multimodal-gate-first shape. Local gates: * ``pytest tests/unit_test/indexing/ tests/integration/`` — 232 passed / 48 skipped (no regression). * ``ruff check aperag/ tests/integration/`` — clean. * ``ruff format`` — applied. Branch: ``chenyexuan/celery-wave5-p4`` rebased on ``bryce/celery-wave5`` HEAD ``4f0e9b0`` (P2 chunk 1 embed_image API surface) → push to ``bryce/celery-wave5`` for Wave 5 single-PR pile. Co-Authored-By: Claude Opus 4.7 --- aperag/app.py | 23 ++++++ .../service/document_service.py | 4 +- aperag/indexing/graph_storage/postgres.py | 38 +++++++-- aperag/indexing/parse_orchestrator.py | 52 ++++++++++++ aperag/indexing/reconciler.py | 4 +- aperag/indexing/worker_factory.py | 79 +++++++++++-------- 6 files changed, 159 insertions(+), 41 deletions(-) diff --git a/aperag/app.py b/aperag/app.py index 3b8e52dc6..3aa4ae5bf 100644 --- a/aperag/app.py +++ b/aperag/app.py @@ -269,6 +269,29 @@ async def combined_lifespan(app: FastAPI): # — otherwise queue-backlog / failure-rate alerts on the # collector side never receive data. if settings.indexing_metrics_emitter.lower() == "otlp": + # Wave 5 P5B: cross-check that the broader observability + # mode is also OTLP-shaped — operators that flip + # ``INDEXING_METRICS_EMITTER=otlp`` without configuring + # the parent ``APERAG_OBSERVABILITY_MODE`` end up with + # an :class:`OTLPMetricsEmitter` whose underlying + # ``MeterProvider`` was never installed by + # ``aperag.observability.metrics.init_metrics_provider``. + # The samples then no-op silently — the same operator- + # visible failure mode we explicitly avoided when + # making ``noop`` the default. + obs_mode = (settings.aperag_observability_mode or "").lower() + if obs_mode not in ("otlp", "collector"): + import logging as _logging + + _logging.getLogger(__name__).warning( + "INDEXING_METRICS_EMITTER=otlp but APERAG_OBSERVABILITY_MODE=%r " + "(expected 'otlp' or 'collector') — the OTLP MeterProvider " + "may not be installed, indexing SLI samples will silently " + "no-op. Set APERAG_OBSERVABILITY_MODE=otlp to enable real " + "OTLP export, OR revert INDEXING_METRICS_EMITTER=noop to " + "make the gap explicit.", + settings.aperag_observability_mode, + ) metrics_emitter = OTLPMetricsEmitter() else: metrics_emitter = NoopMetricsEmitter() diff --git a/aperag/domains/knowledge_base/service/document_service.py b/aperag/domains/knowledge_base/service/document_service.py index 48e730949..427cbae63 100644 --- a/aperag/domains/knowledge_base/service/document_service.py +++ b/aperag/domains/knowledge_base/service/document_service.py @@ -189,12 +189,14 @@ async def _create_or_update_document_indexes( return parser_config = await _resolve_parser_config_for_collection(document.collection_id, session) + collection = await session.get(Collection, document.collection_id) if document.collection_id else None + from aperag.indexing.parse_orchestrator import resolve_tenant_scope_key payload = ParseDispatchPayload( document_id=document.id, collection_id=document.collection_id, object_path=object_path, - tenant_scope_key=f"user:{document.user}", + tenant_scope_key=resolve_tenant_scope_key(document=document, collection=collection), modalities=tuple(m.value for m in index_types), parser_config=parser_config, purge_existing_triples=True, diff --git a/aperag/indexing/graph_storage/postgres.py b/aperag/indexing/graph_storage/postgres.py index 9a71dc7d7..c7ecf7efb 100644 --- a/aperag/indexing/graph_storage/postgres.py +++ b/aperag/indexing/graph_storage/postgres.py @@ -77,7 +77,10 @@ RelationRecord, RelationWithLineage, ) -from aperag.utils.utils import utc_now + +# utc_now is no longer needed: Wave 5 P5B switched ORM gmt_* columns +# to ``server_default=CURRENT_TIMESTAMP`` mirroring the alembic +# migration declaration. logger = logging.getLogger(__name__) @@ -116,8 +119,23 @@ class _LineageEntityRow(_LineageGraphBase): type = Column(String(64), nullable=False) source_lineage = Column(JSONB, nullable=False, server_default=text("'[]'::jsonb")) description_parts = Column(JSONB, nullable=False, server_default=text("'[]'::jsonb")) - gmt_created = Column(DateTime(timezone=True), default=utc_now, nullable=False) - gmt_updated = Column(DateTime(timezone=True), default=utc_now, nullable=False) + # Wave 5 P5B: ORM uses the same ``server_default=CURRENT_TIMESTAMP`` + # the alembic migration declares — strict ORM↔migration mirror so + # ``alembic check`` cannot drift on schema-touching follow-ups. + # Pre-Wave-5 ORM used ``default=utc_now`` Python-side; alembic + # check passed because Postgres treats both as semantic-equivalent + # but the per-mirror discipline is stronger when both layers + # speak the same dialect. + gmt_created = Column( + DateTime(timezone=True), + nullable=False, + server_default=text("CURRENT_TIMESTAMP"), + ) + gmt_updated = Column( + DateTime(timezone=True), + nullable=False, + server_default=text("CURRENT_TIMESTAMP"), + ) class _LineageRelationRow(_LineageGraphBase): @@ -140,8 +158,18 @@ class _LineageRelationRow(_LineageGraphBase): # standalone ``description`` field so the column had no consumer. evidence_lineage = Column(JSONB, nullable=False, server_default=text("'[]'::jsonb")) description_parts = Column(JSONB, nullable=False, server_default=text("'[]'::jsonb")) - gmt_created = Column(DateTime(timezone=True), default=utc_now, nullable=False) - gmt_updated = Column(DateTime(timezone=True), default=utc_now, nullable=False) + # Wave 5 P5B: same ORM↔migration mirror discipline as + # ``_LineageEntityRow`` above. + gmt_created = Column( + DateTime(timezone=True), + nullable=False, + server_default=text("CURRENT_TIMESTAMP"), + ) + gmt_updated = Column( + DateTime(timezone=True), + nullable=False, + server_default=text("CURRENT_TIMESTAMP"), + ) # --------------------------------------------------------------------- diff --git a/aperag/indexing/parse_orchestrator.py b/aperag/indexing/parse_orchestrator.py index dc908b253..ca71dddc6 100644 --- a/aperag/indexing/parse_orchestrator.py +++ b/aperag/indexing/parse_orchestrator.py @@ -442,6 +442,57 @@ async def run_parse_worker( ) +def resolve_tenant_scope_key(*, document: Any, collection: Any = None) -> str: + """Wave 5 P5B forward-compat helper for tenant_scope_key construction. + + Centralises the prefix logic so all enqueue producers (upload + handler, reconciler stuck-doc re-enqueue, future org-tenant + bootstrap) share the same source of truth. Pre-Wave-5 the prefix + was hard-coded as ``f"user:{document.user}"`` at every callsite — + this helper unifies the construction so a future Wave 6/7 + organisation-tenant rollout flips one place. + + Resolution order: + + 1. ``collection.config.tenant_scope_prefix == "org"`` AND + ``document.org_id`` (or ``collection.org_id``) populated → + ``f"org:"`` + 2. Otherwise → ``f"user:"`` (Wave 4 default, + per §H.2 forward-compat) + + The org branch stays inert for Wave 5 because ``Document`` / + ``Collection`` schemas do not yet carry ``org_id``; the helper + just *makes the seam explicit* so a future PR landing the org + column can flip behaviour without grepping for hard-coded + ``user:`` prefixes. + """ + org_prefix_active = False + org_id: str | None = None + if collection is not None: + config_obj = getattr(collection, "config", None) + if config_obj is not None: + cfg_dict: dict[str, Any] | None = None + if isinstance(config_obj, str): + try: + cfg_dict = json.loads(config_obj) + except (TypeError, ValueError): + cfg_dict = None + elif isinstance(config_obj, Mapping): + cfg_dict = dict(config_obj) + elif hasattr(config_obj, "tenant_scope_prefix"): + cfg_dict = {"tenant_scope_prefix": getattr(config_obj, "tenant_scope_prefix", None)} + if cfg_dict and cfg_dict.get("tenant_scope_prefix") == "org": + org_prefix_active = True + if not org_id: + org_id = getattr(collection, "org_id", None) + if not org_id: + org_id = getattr(document, "org_id", None) + + if org_prefix_active and org_id: + return f"org:{org_id}" + return f"user:{getattr(document, 'user', '')}" + + __all__ = [ "DEFAULT_PARSE_CONCURRENCY", "DEFAULT_POLL_TIMEOUT_SECONDS", @@ -449,6 +500,7 @@ async def run_parse_worker( "ParseDispatchPayload", "ParseOrchestratorConfig", "process_one_parse_task", + "resolve_tenant_scope_key", "run_parse_worker", "run_parse_worker_loop", ] diff --git a/aperag/indexing/reconciler.py b/aperag/indexing/reconciler.py index 997abc8f0..9727b3c92 100644 --- a/aperag/indexing/reconciler.py +++ b/aperag/indexing/reconciler.py @@ -581,11 +581,13 @@ def _build_parse_payload_for_document(session: Session, document): ) return None + from aperag.indexing.parse_orchestrator import resolve_tenant_scope_key + return ParseDispatchPayload( document_id=document.id, collection_id=document.collection_id or "", object_path=str(object_path), - tenant_scope_key=f"user:{document.user}", + tenant_scope_key=resolve_tenant_scope_key(document=document, collection=collection), modalities=tuple(modalities), parser_config=parser_config, purge_existing_triples=True, diff --git a/aperag/indexing/worker_factory.py b/aperag/indexing/worker_factory.py index 8e36446d1..c1f76f54d 100644 --- a/aperag/indexing/worker_factory.py +++ b/aperag/indexing/worker_factory.py @@ -232,14 +232,9 @@ def _build_vector_worker(*, collection: Any, object_store: Any) -> ModalityWorke """Wire :class:`VectorModality` to a real Qdrant collection + real EmbeddingService for the collection's configured model. """ - from aperag.config import get_vector_db_connector from aperag.indexing.vector import VectorModality - from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync - from aperag.utils.utils import generate_vector_db_collection_name - embedding_service, vector_size = get_collection_embedding_service_sync(collection) - qdrant_collection = generate_vector_db_collection_name(collection.id) - adaptor = get_vector_db_connector(qdrant_collection, vector_size=vector_size) + adaptor, embedding_service, _ = _build_collection_qdrant_connector(collection) backend = _QdrantPointBackend(connector=adaptor.connector) def _embed(text: str) -> list[float]: @@ -390,14 +385,9 @@ def _build_summary_worker(*, collection: Any, object_store: Any) -> ModalityWork model; the embedder is the same one vector uses (one model per collection, shared across modalities). """ - from aperag.config import get_vector_db_connector from aperag.indexing.summary import SummaryModality - from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync - from aperag.utils.utils import generate_vector_db_collection_name - embedding_service, vector_size = get_collection_embedding_service_sync(collection) - qdrant_collection = generate_vector_db_collection_name(collection.id) - adaptor = get_vector_db_connector(qdrant_collection, vector_size=vector_size) + adaptor, embedding_service, _ = _build_collection_qdrant_connector(collection) backend = _QdrantPointBackend(connector=adaptor.connector) summarizer = _build_collection_summarizer(collection) @@ -444,6 +434,11 @@ def _build_vision_worker(*, collection: Any, object_store: Any) -> ModalityWorke from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync from aperag.utils.utils import generate_vector_db_collection_name + # Vision keeps the multimodal gate ABOVE the connector wiring so a + # non-multimodal embedder fails fast without a network round-trip. + # The shared :func:`_build_collection_qdrant_connector` helper used + # by vector / summary calls the connector first; reusing it here + # would re-order the gate after the network call. embedding_service, vector_size = get_collection_embedding_service_sync(collection) if not embedding_service.is_multimodal(): raise WorkerFactoryError( @@ -452,7 +447,6 @@ def _build_vision_worker(*, collection: Any, object_store: Any) -> ModalityWorke "set collection.config.enable_vision=false until Wave 4 lands " "OR configure a multimodal embedding model on the collection's embedding spec" ) - qdrant_collection = generate_vector_db_collection_name(collection.id) adaptor = get_vector_db_connector(qdrant_collection, vector_size=vector_size) backend = _QdrantPointBackend(connector=adaptor.connector) @@ -1033,18 +1027,23 @@ async def sync( raise NotImplementedError("CleanupWorkerView.sync must not be called — cleanup-only shape") -def _build_qdrant_cleanup_backend(collection: Any) -> Any: - """Construct the Qdrant ``_backend`` adapter without any modality- - specific gate (used for cleanup of vector / summary / vision). - - The full ``_build_vector_worker`` chain calls - :func:`get_collection_embedding_service_sync` to size the Qdrant - collection; we duplicate that minimal step here so a collection - whose embedder config is broken (a Wave 3 lesson #10 case) still - deletes its points instead of silently leaking. If the embedding - service cannot be resolved we still need ``vector_size`` to - address the right collection — fall back to the connector's - introspection of the existing collection. +def _build_collection_qdrant_connector(collection: Any, *, allow_vector_size_fallback: bool = False) -> Any: + """Wave 5 P5B shared helper — resolves the per-collection Qdrant + connector for both dispatch (``_build_vector_worker`` / + ``_build_summary_worker`` / ``_build_vision_worker``) and cleanup + (``_build_qdrant_cleanup_backend``). + + Pre-Wave-5 the dispatch chain and cleanup chain duplicated this + block; a future Qdrant adapter / embedder signature change had to + be applied twice. Centralising in one helper drops the drift risk + surface huangheng flagged on T2 ratify (msg=6aa8ca88 obs B). + + ``allow_vector_size_fallback`` (cleanup-only) tolerates an + embedder resolve failure by reporting ``vector_size=0`` — the + connector still addresses the right collection by name and + ``delete_by_filter`` does not touch the vector dimension. Dispatch + callers pass ``False`` so an embedder fault surfaces as + :class:`WorkerFactoryError` immediately. """ from aperag.config import get_vector_db_connector from aperag.llm.embed.base_embedding import get_collection_embedding_service_sync @@ -1052,22 +1051,34 @@ def _build_qdrant_cleanup_backend(collection: Any) -> Any: qdrant_collection = generate_vector_db_collection_name(collection.id) try: - _, vector_size = get_collection_embedding_service_sync(collection) - except Exception as exc: # noqa: BLE001 - # The embedder is irrelevant for ``delete_by_filter``; the - # connector only needs ``vector_size`` to validate against - # the existing collection on the Qdrant side. Fall back to a - # benign size — the connector will still address the right - # collection by name and the delete-by-filter call does not - # touch the vector dimension. + embedding_service, vector_size = get_collection_embedding_service_sync(collection) + except Exception as exc: + if not allow_vector_size_fallback: + raise logger.warning( - "cleanup vector backend: embedder resolve failed for collection=%s (%s); " + "qdrant connector: embedder resolve failed for collection=%s (%s); " "falling back to size=0 for delete-only operations", getattr(collection, "id", ""), exc, ) + embedding_service = None vector_size = 0 adaptor = get_vector_db_connector(qdrant_collection, vector_size=vector_size) + return adaptor, embedding_service, vector_size + + +def _build_qdrant_cleanup_backend(collection: Any) -> Any: + """Construct the Qdrant ``_backend`` adapter for cleanup paths. + + Reuses :func:`_build_collection_qdrant_connector` with the + cleanup-only ``allow_vector_size_fallback=True`` shape so a + collection whose embedder config is broken (a Wave 3 lesson #10 + case) still has its points deleted instead of leaking. + """ + adaptor, _embedder, _vector_size = _build_collection_qdrant_connector( + collection, + allow_vector_size_fallback=True, + ) return _QdrantPointBackend(connector=adaptor.connector) From e85c77bb65afb7162703d9c49bedcff13b4029a2 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:29:40 +0800 Subject: [PATCH 09/13] feat(celery Wave 5 P5A item 4): Neo4j label namespace prefix `aperag_` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per §K.9.1 P5A item 14 + design pack §K.9 Phase 5A item 4: deployments that share a Neo4j instance with user-owned graphs would collide on the generic `LineageEntity` / `LineageRelation` labels. Bump them to `aperag_LineageEntity` / `aperag_LineageRelation` and align constraint names (`aperag_lineage_entity_collection_name_unique` / `aperag_lineage_relation_collection_triple_unique`). Hard-cut second round (no data migration): Wave 4 graph indexing defaulted `enable_knowledge_graph=False`, so no production deployment has data on the legacy unprefixed labels. Postgres / Nebula backends are unaffected (table names already namespaced via `aperag_lineage_*` schema in alembic migration `e7a3b9c2d1f6`). Co-Authored-By: Claude Opus 4.7 --- aperag/indexing/graph_storage/neo4j.py | 38 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/aperag/indexing/graph_storage/neo4j.py b/aperag/indexing/graph_storage/neo4j.py index a37cc5d26..bcc432fe6 100644 --- a/aperag/indexing/graph_storage/neo4j.py +++ b/aperag/indexing/graph_storage/neo4j.py @@ -25,13 +25,15 @@ without requiring APOC. Storage layout — one row per entity / relation, modelled as a label -``LineageEntity`` / ``LineageRelation`` node (relations are NOT modelled -as edges here because the §D.3 Protocol does not require traversal — -that is a separate retrieval concern owned by the legacy ``GraphStore``; -keeping relations as their own labelled nodes mirrors the Postgres -two-table schema exactly): - - (:LineageEntity { +``aperag_LineageEntity`` / ``aperag_LineageRelation`` node (relations +are NOT modelled as edges here because the §D.3 Protocol does not +require traversal — that is a separate retrieval concern owned by the +legacy ``GraphStore``; keeping relations as their own labelled nodes +mirrors the Postgres two-table schema exactly). The ``aperag_`` label +prefix (Wave 5 P5A item 4) prevents collisions with user-owned graphs +in shared Neo4j deployments. + + (:aperag_LineageEntity { collection_id, name, type, source_lineage, -- list, JSON-encoded LineageMember source_lineage_doc_ids, -- list, parallel index, dedup + strip-by-doc filter @@ -42,7 +44,7 @@ gmt_created, gmt_updated }) - (:LineageRelation { + (:aperag_LineageRelation { collection_id, source, target, type, evidence_lineage, evidence_lineage_doc_ids, @@ -117,11 +119,21 @@ logger = logging.getLogger(__name__) -_ENTITY_LABEL = "LineageEntity" -_RELATION_LABEL = "LineageRelation" - -_ENTITY_CONSTRAINT_NAME = "lineage_entity_collection_name_unique" -_RELATION_CONSTRAINT_NAME = "lineage_relation_collection_triple_unique" +# Wave 5 P5A item 4 (per `feedback_production_readiness_invariant.md` +# multi-tenant Neo4j hygiene): namespace the lineage labels with an +# ``aperag_`` prefix so deployments that share a Neo4j instance with +# user-owned graphs don't collide on a generic ``LineageEntity`` / +# ``LineageRelation`` label. Constraint names get the same prefix so +# ``ensure_schema`` is idempotent against fresh and prefixed-but-already- +# created clusters alike. Wave 4 graph indexing was gated behind +# ``enable_knowledge_graph=False`` by default so there is no production +# data on the legacy unprefixed labels — no data migration needed for +# the hard-cut second-round bump. +_ENTITY_LABEL = "aperag_LineageEntity" +_RELATION_LABEL = "aperag_LineageRelation" + +_ENTITY_CONSTRAINT_NAME = "aperag_lineage_entity_collection_name_unique" +_RELATION_CONSTRAINT_NAME = "aperag_lineage_relation_collection_triple_unique" # --------------------------------------------------------------------- From ba1cd0e3edef164771f3a134bbd9e6c1f8b390a6 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:31:12 +0800 Subject: [PATCH 10/13] docs(celery Wave 5 P5A close-out): items 2/3/5 deferred to Wave 6 with explicit reasoning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per Wave 5 P5A scope close-out 2026-04-27: * Item 2 (chunk 4b _no_op_extractor identity check → attribute marker) is N/A: the placeholder was deleted in Wave 4 T1 (`19d3d70f`) when `build_collection_graph_extractor` replaced `_no_op_extractor`. There is no surviving identity-check site to harden — moot in current code. * Item 3 (W5-perf-graph-lineage parallel-list O(N) cross-backend) is deferred to Wave 6: Postgres needs JSONB → text[] migration, Nebula needs tag re-model, plus alembic column-shape change. >10k docs/entity is not a Wave 5 acceptance criterion; trigger when observed in real deployments. * Item 5 (Cypher type keyword rename `n.type` → `entity_type` / `relation_type`) is deferred to Wave 6: cross-backend rename (Postgres column + alembic; Cypher property rename; Nebula tag-prop rename) plus §D.3 Protocol-surface change (`EntityRecord.type` → `EntityRecord.entity_type` etc.). Cypher `TYPE()` is technically only for relationships so current code is unambiguous — forward-compat hygiene rather than correctness fix. §K.9 Phase 5A acceptance lines amended to reflect ship status (items 1+4 shipped; 2 N/A; 3+5 → Wave 6). §K.10 Wave 6 backlog extended with items 6+7 covering the deferred cross-backend rewrites. Co-Authored-By: Claude Opus 4.7 --- .../indexing-redesign-design-pack.md | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/docs/modularization/indexing-redesign-design-pack.md b/docs/modularization/indexing-redesign-design-pack.md index e3fe8e17c..9ae4a6604 100644 --- a/docs/modularization/indexing-redesign-design-pack.md +++ b/docs/modularization/indexing-redesign-design-pack.md @@ -1701,13 +1701,13 @@ per chunk 4d+4e ratify msg=c279a0ff + sweep D Layer 2 stub (`4d36c7fb`): 5 commits in same phase batch (Bryce ownership per Wave 4 chunk 4 / T1 / T8 expertise): -1. **T1 graph extractor per-collection config tunability** (per huangheng T1 obs A): `collection.config.knowledge_graph_config.{per_chunk_timeout, max_entities_per_chunk, max_relations_per_chunk}` per-collection override -2. **chunk 4b `_no_op_extractor` identity check → attribute marker** (per huangheng chunk 4b obs B): `getattr(extractor, 'is_no_op', False)` more robust pattern -3. **W5-perf-graph-lineage parallel-list O(N) alternative encoding** for high-cardinality entities (>10k docs/entity, per architect msg=39a74026) -4. **W5-neo4j-label-namespace prefix `aperag_LineageEntity` / `aperag_LineageRelation`** to avoid user-namespace collision -5. **W5-cypher-type-keyword rename `n.type` property** (Cypher `TYPE()` keyword shadow); cross-backend rename also in Postgres/Nebula +1. **T1 graph extractor per-collection config tunability** (per huangheng T1 obs A): `collection.config.knowledge_graph_config.{per_chunk_timeout, max_entities_per_chunk, max_relations_per_chunk}` per-collection override — **shipped `f5e454ed`** +2. ~~chunk 4b `_no_op_extractor` identity check → attribute marker~~ — **N/A: resolved by Wave 4 T1 land**. The placeholder was deleted when `build_collection_graph_extractor` replaced `_no_op_extractor` (see commit `19d3d70f`); there is no surviving identity-check site to harden, so this item is moot in current code. +3. ~~W5-perf-graph-lineage parallel-list O(N) alternative encoding~~ — **deferred to Wave 6**. Cross-backend perf rewrite touches Postgres / Nebula schema (Postgres switching from JSONB-array-of-objects to parallel `text[]` columns; Nebula re-modelling tags) plus alembic migration. High-cardinality (>10k docs/entity) is not a Wave 5 acceptance criterion — defer to dedicated perf wave once observed in real deployments. +4. **W5-neo4j-label-namespace prefix `aperag_LineageEntity` / `aperag_LineageRelation`** to avoid user-namespace collision — **shipped `42b5fa6a`** (Neo4j-only constant + comment rename, no schema migration needed since Wave 4 default-gated graph indexing meant no production data on legacy labels). +5. ~~W5-cypher-type-keyword rename `n.type` property~~ — **deferred to Wave 6**. Although `TYPE(n)` is a Cypher built-in for relationships not nodes (so node-property `n.type` is technically unambiguous), a forward-compat rename across all 3 backends (Postgres column rename + alembic migration + Cypher property rename + Nebula tag-prop rename + `EntityRecord.type` / `RelationRecord.type` Protocol surface) is non-trivial. Keep current naming until Wave 6 dedicated cross-backend rename batch. -**fully-resolves**: §K.8 Wave 5 backlog items 5 + 12 (其中 12 fold 进 PR-A T7 multimodal 实施 if applicable) +**fully-resolves**: §K.8 P5A items 1 + 4 (items 2 / 3 / 5 redirected as noted) #### Phase 5B (post-Phase-4, chenyexuan batch) — P2 batch infra + cleanup polish @@ -1802,6 +1802,25 @@ PM (燧木) 决定。架构师建议参考 D10 模式: - ``tests/integration/compat/test_graph_compat.py`` 5. **Final grep-zero verify**: post-Wave-6, ``aperag/`` 全树无 ``from aperag.domains.knowledge_graph.graphindex`` import +**Cross-backend lineage polish folded from Wave 5 P5A** (per Wave 5 P5A close-out +2026-04-27): + +6. **W5-perf-graph-lineage parallel-list O(N) cross-backend** — Postgres + switches the JSONB-array-of-objects to parallel ``text[]`` columns + (matching Neo4j parallel-list encoding); Nebula re-models tags to + parallel-list semantics; alembic migration for the column-shape + change. Trigger when a deployment observes >10k docs/entity (per + architect msg=39a74026 high-cardinality threshold). +7. **W5-cypher-type-keyword cross-backend rename** — rename + ``EntityRecord.type`` / ``RelationRecord.type`` (and + ``EntityWithLineage`` / ``RelationWithLineage``) to ``entity_type`` / + ``relation_type`` across the §D.3 Protocol; Postgres column rename via + alembic; Cypher ``n.type`` / ``r.type`` rewrite; Nebula tag-prop + rename. Forward-compat hygiene (Cypher ``TYPE(r)`` is for relationships + only so current code is technically unambiguous, but Protocol-surface + rename frees future use of ``TYPE`` as a true Cypher keyword without + ambiguity). + **production-readiness 三类 layer**: - must-be-real: 新 LightRAG-style query layer ships behavior-equivalent retrieval (existing graph search results within ε tolerance) - may-be-gated: 无 (full hard-cut) From f15e504eb9ec350cceea1c9e56b1ea36922018f8 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:46:46 +0800 Subject: [PATCH 11/13] =?UTF-8?q?feat(celery=20Wave=205=20P2=20chunk=202):?= =?UTF-8?q?=20parser=20image=20extraction=20=E2=86=92=20derived/parse=5F/vision/{images,source.jsonl}?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per §G.2.5.1 spec amend item 2: when DocParser produces `AssetBinPart` payloads (PDF page images, single-image-input passthrough, data-URI extracted images), the parser writes each image blob to `derived/parse_/vision/images/.` and lands a `vision/source.jsonl` descriptor enumerating them. The vision worker (chunk 4) consumes the descriptor instead of the T1 simulator's synthetic `images.json` companion. `aperag/indexing/parser.py`: * `_docparser_extract_markdown` now returns `tuple[str, list[_VisionImageAsset]]` — markdown body unchanged plus the extracted image assets list. Non-image `AssetBinPart`s (audio / PDF data) drop. Duplicate `asset_id`s deduplicated to keep Qdrant-point-id stable. * `_VisionImageAsset` carrier holds (`image_id`, `data`, `mime_type`, `alt_text`, `page_idx`, `bbox`). * `_vision_image_extension(mime_type)` maps known image MIMEs to a filename extension; falls back to `.bin` for unknown types since vision worker uses `imghdr` on bytes at embed time. * `_write_vision_assets` persists each blob via `write_atomic` and writes the JSONL descriptor. * `ParseResult.vision_source_path` and `vision_image_count` are new optional fields (defaults `""` / `0`) so pre-Wave-5 callers see no behaviour change. Image-only inputs (no markdown emitted) still land their assets so vision modality has bytes to embed. `tests/unit_test/indexing/test_parser_image_extraction.py`: 9 unit tests covering MIME → extension mapping, descriptor schema, simulator-path no-op, image-only input asset persistence, unknown-MIME `.bin` fallback, and the new `ParseResult` fields. Tests monkeypatch `_docparser_extract_markdown` to avoid pulling MinerU / MarkItDown into the unit-test path. Production-readiness 三类: - must-be-real: real `AssetBinPart` extraction wired through DocParser - may-be-gated: descriptor + image artefacts only land when DocParser surfaces `AssetBinPart`s (image-less docs see no vision/ writes) - fully-resolves: §G.2.5.1 spec item 2 (parser image extraction) + Wave 5 P2 chunk 2 acceptance The vision worker callsite rewrite (chunk 4) and provider v3 multimodal flag UI (chunk 3) remain. Defaults preserve existing behaviour for text-only callers. Co-Authored-By: Claude Opus 4.7 --- aperag/indexing/parser.py | 198 +++++++++++- .../indexing/test_parser_image_extraction.py | 291 ++++++++++++++++++ 2 files changed, 473 insertions(+), 16 deletions(-) create mode 100644 tests/unit_test/indexing/test_parser_image_extraction.py diff --git a/aperag/indexing/parser.py b/aperag/indexing/parser.py index f62d8edac..b1648c19c 100644 --- a/aperag/indexing/parser.py +++ b/aperag/indexing/parser.py @@ -166,15 +166,28 @@ class ParseResult: """Outcome of :func:`parse_document`. ``parse_version`` pins the (parser, content, chunking) triple so - callers can persist it on the ``DocumentIndex`` row. The three - artifact paths are the canonical names downstream modalities - expect to find under ``derived/parse_/``. + callers can persist it on the ``DocumentIndex`` row. The artifact + paths are the canonical names downstream modalities expect to find + under ``derived/parse_/``. + + Wave 5 P2 chunk 2 (per §G.2.5.1 spec amend item 2): when DocParser + produces ``AssetBinPart`` payloads (PDF page images / single-image + inputs / data-URI extracted images), the parser writes each blob to + ``derived/parse_/vision/images/.`` and lands a + ``vision/source.jsonl`` descriptor enumerating them. The vision + worker consumes the descriptor (chunk 4 callsite rewrite) instead + of the T1 simulator's synthetic ``images.json`` companion. The + descriptor path is empty when the parsed document has no image + assets — vision modality short-circuits to the no-image FAILED + handling already in place. """ parse_version: str markdown_path: str outline_path: str chunks_path: str + vision_source_path: str = "" + vision_image_count: int = 0 # --------------------------------------------------------------------- @@ -352,28 +365,85 @@ def _all_artifacts_present( return False +# --------------------------------------------------------------------- +# Vision asset extraction helpers — Wave 5 P2 chunk 2 +# --------------------------------------------------------------------- + + +# MIME-type → file extension lookup. Provider-specific extras (HEIC, +# AVIF, etc.) drop through to the generic ``.bin`` fallback rather +# than rejecting the asset — the vision worker uses ``imghdr`` on the +# image bytes themselves at embed time, so the on-disk filename +# extension is informational only. +_MIME_EXTENSION_MAP: dict[str, str] = { + "image/jpeg": "jpg", + "image/jpg": "jpg", + "image/png": "png", + "image/webp": "webp", + "image/gif": "gif", + "image/bmp": "bmp", + "image/tiff": "tiff", + "image/svg+xml": "svg", +} + + +def _vision_image_extension(mime_type: str | None) -> str: + if not mime_type: + return "bin" + key = mime_type.split(";", 1)[0].strip().lower() + return _MIME_EXTENSION_MAP.get(key, "bin") + + +@dataclass(frozen=True) +class _VisionImageAsset: + """Internal carrier for a single extracted image asset. + + Holds enough to land both the blob (under ``vision/images/``) and + its row in the ``vision/source.jsonl`` descriptor consumed by the + vision worker (chunk 4). ``image_id`` is the canonical ``asset_id`` + DocParser already computes (md5 of the image bytes), so two parses + of the same content produce the same image_id and downstream + identity (``vision:::`` Qdrant point id) + stays stable across retries. + """ + + image_id: str + data: bytes + mime_type: str + alt_text: str + page_idx: int | None + bbox: list[float] | None + + def _docparser_extract_markdown( *, source_bytes: bytes, extension: str, parser_config: dict[str, Any] | None, -) -> str: - """Run :class:`DocParser` on ``source_bytes`` and return concatenated markdown. +) -> tuple[str, list[_VisionImageAsset]]: + """Run :class:`DocParser` on ``source_bytes`` and return both + concatenated markdown AND the extracted vision image assets. + + Wave 5 P2 chunk 2 (per §G.2.5.1 item 2): the assets list carries + every :class:`AssetBinPart` whose ``mime_type`` is a recognised + image type. Audio / video / PDF-data assets are dropped — only + images participate in the vision modality. The caller writes the + blobs out under ``derived/parse_/vision/images/`` plus a + descriptor JSONL line for each. Materialises the bytes into a tempfile (DocParser only accepts a real path on disk because MarkItDown / MinerU / OCR all stream from disk), runs the parser chain, then collects every - :class:`MarkdownPart` body in order. Non-markdown parts (assets / - images / pdf data) are not used here — they belong to the - vision modality's ``derive`` step (T1.4) and the cleanup loop's - asset GC (T2.1), not the shared markdown contract. + :class:`MarkdownPart` body in order. Non-image asset parts (PDF + data, audio, etc.) belong to other modalities or to the cleanup + loop's asset GC (T2.1), not the shared markdown contract. DocParser is imported lazily so the indexing package's __init__ does not pull MarkItDown / MinerU / pikepdf at import time (matches the existing T1.1 lazy-import discipline that kept the Wave 3 hard-cut circular-import-free). """ - from aperag.docparser.base import MarkdownPart + from aperag.docparser.base import AssetBinPart, MarkdownPart from aperag.docparser.doc_parser import DocParser # Use the suffix the caller already normalised so the temp filename @@ -402,14 +472,44 @@ def _docparser_extract_markdown( os.unlink(tmp_path) markdown_parts = [p.markdown for p in parts if isinstance(p, MarkdownPart) and p.markdown] + seen_image_ids: set[str] = set() + image_assets: list[_VisionImageAsset] = [] + for part in parts: + if not isinstance(part, AssetBinPart): + continue + mime = (part.mime_type or "").lower() + if not mime.startswith("image/"): + continue + if not part.data: + continue + if part.asset_id in seen_image_ids: + # Same image referenced twice in the document → keep the + # first record + drop the duplicate so the descriptor has a + # single canonical row per image_id. The Qdrant point id + # would have collided otherwise. + continue + seen_image_ids.add(part.asset_id) + metadata = part.metadata or {} + image_assets.append( + _VisionImageAsset( + image_id=part.asset_id, + data=part.data, + mime_type=mime, + alt_text=str(metadata.get("alt_text") or part.content or ""), + page_idx=metadata.get("page_idx") if isinstance(metadata.get("page_idx"), int) else None, + bbox=metadata.get("bbox") if isinstance(metadata.get("bbox"), list) else None, + ) + ) + if not markdown_parts: # Image-only / audio-only inputs (no MarkdownPart) currently # have nothing for the outline + chunks pipeline to emit; we # return an empty body so the artifacts exist but downstream - # vector / fulltext modalities see zero chunks. Vision modality - # consumes the original asset directly in its derive step. - return "" - return "\n\n".join(markdown_parts) + # vector / fulltext modalities see zero chunks. Image-only + # uploads still land their assets via the descriptor below so + # the vision modality has bytes to embed. + return ("", image_assets) + return ("\n\n".join(markdown_parts), image_assets) def parse_document( @@ -525,6 +625,7 @@ def parse_document( chunks_path=chunks_path, ) + image_assets: list[_VisionImageAsset] = [] if extension is None or extension in _SIMULATOR_EXTENSIONS: try: markdown = source_bytes.decode("utf-8") @@ -540,7 +641,7 @@ def parse_document( f"with the real extension to dispatch to DocParser" ) from exc else: - markdown = _docparser_extract_markdown( + markdown, image_assets = _docparser_extract_markdown( source_bytes=source_bytes, extension=extension, parser_config=parser_config, @@ -561,14 +662,25 @@ def parse_document( ("\n".join(json.dumps(c, ensure_ascii=False) for c in chunks) + "\n").encode("utf-8"), ) + vision_source_path = "" + if image_assets: + vision_source_path = _write_vision_assets( + store=store, + collection_id=collection_id, + document_id=document_id, + parse_version=parse_version, + assets=image_assets, + ) + logger.info( "indexing parser produced derived artifacts collection=%s document=%s " - "parse_version=%s outline_size=%d chunk_count=%d", + "parse_version=%s outline_size=%d chunk_count=%d vision_image_count=%d", collection_id, document_id, parse_version, len(outline), len(chunks), + len(image_assets), ) return ParseResult( @@ -576,7 +688,61 @@ def parse_document( markdown_path=markdown_path, outline_path=outline_path, chunks_path=chunks_path, + vision_source_path=vision_source_path, + vision_image_count=len(image_assets), + ) + + +def _write_vision_assets( + *, + store: _SyncObjectStore, + collection_id: str, + document_id: str, + parse_version: str, + assets: list[_VisionImageAsset], +) -> str: + """Persist extracted image bytes + descriptor under + ``derived/parse_/vision/``. + + Each asset is written to ``vision/images/.`` and + enumerated in a ``vision/source.jsonl`` descriptor (one record per + line, schema ``{image_id, image_path, mime_type, alt_text, + page_idx, bbox}``). Returns the descriptor path so the caller can + pin it on :class:`ParseResult` and the orchestrator can hand it to + the vision worker (chunk 4 callsite rewrite). + """ + descriptor_lines: list[str] = [] + for asset in assets: + ext = _vision_image_extension(asset.mime_type) + image_path = derived_artifact( + collection_id=collection_id, + document_id=document_id, + parse_version=parse_version, + filename=f"vision/images/{asset.image_id}.{ext}", + ) + write_atomic(store, image_path, asset.data) + descriptor_lines.append( + json.dumps( + { + "image_id": asset.image_id, + "image_path": image_path, + "mime_type": asset.mime_type, + "alt_text": asset.alt_text, + "page_idx": asset.page_idx, + "bbox": asset.bbox, + }, + ensure_ascii=False, + ) + ) + + descriptor_path = derived_artifact( + collection_id=collection_id, + document_id=document_id, + parse_version=parse_version, + filename="vision/source.jsonl", ) + write_atomic(store, descriptor_path, ("\n".join(descriptor_lines) + "\n").encode("utf-8")) + return descriptor_path def read_chunks(store: _SyncObjectStore, chunks_path: str) -> list[dict[str, Any]]: diff --git a/tests/unit_test/indexing/test_parser_image_extraction.py b/tests/unit_test/indexing/test_parser_image_extraction.py new file mode 100644 index 000000000..61d2563ae --- /dev/null +++ b/tests/unit_test/indexing/test_parser_image_extraction.py @@ -0,0 +1,291 @@ +# Copyright 2025 ApeCloud, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for Wave 5 P2 chunk 2 — parser image extraction. + +Pin the §G.2.5.1 spec item 2 contract: when DocParser produces +``AssetBinPart`` payloads, the parser writes each image blob to +``derived/parse_/vision/images/.`` and lands a +``vision/source.jsonl`` descriptor. The vision worker (chunk 4) +consumes the descriptor instead of the T1 simulator's synthetic +``images.json`` companion. + +We don't exercise the full DocParser chain here (that needs MinerU / +MarkItDown deps); instead we monkeypatch +:func:`aperag.indexing.parser._docparser_extract_markdown` to return a +controlled ``(markdown, image_assets)`` tuple, then assert +:func:`parse_document` writes the right files at the right paths and +the right :class:`ParseResult` fields. +""" + +from __future__ import annotations + +import json + +import pytest + +from aperag.indexing import parser as parser_module +from aperag.indexing.object_store import InMemoryObjectStore, derived_dir +from aperag.indexing.parser import ( + _vision_image_extension, + _VisionImageAsset, + parse_document, +) + + +def test_vision_image_extension_maps_known_mime_types(): + assert _vision_image_extension("image/jpeg") == "jpg" + assert _vision_image_extension("image/jpg") == "jpg" + assert _vision_image_extension("image/png") == "png" + assert _vision_image_extension("image/webp") == "webp" + assert _vision_image_extension("image/gif") == "gif" + assert _vision_image_extension("image/svg+xml") == "svg" + + +def test_vision_image_extension_strips_charset_param(): + """``image/jpeg; charset=binary`` and similar provider-emitted + MIME parameters should not break the extension lookup.""" + assert _vision_image_extension("image/jpeg; charset=binary") == "jpg" + assert _vision_image_extension("IMAGE/PNG") == "png" + + +def test_vision_image_extension_falls_back_to_bin_for_unknown(): + assert _vision_image_extension(None) == "bin" + assert _vision_image_extension("") == "bin" + assert _vision_image_extension("application/octet-stream") == "bin" + # Newer formats not yet in the lookup table — vision worker will + # use ``imghdr`` on the bytes themselves at embed time, so the + # filename is informational only. + assert _vision_image_extension("image/heic") == "bin" + + +def test_parse_document_persists_image_blobs_and_descriptor(monkeypatch: pytest.MonkeyPatch): + """End-to-end: parser hands DocParser a PDF-shape input, gets back + markdown + 2 image assets; parser persists each blob at + ``vision/images/.`` and a descriptor at + ``vision/source.jsonl``.""" + + img_a = _VisionImageAsset( + image_id="aaa111", + data=b"\xff\xd8\xff\xe0fake-jpeg-bytes", + mime_type="image/jpeg", + alt_text="Figure 1: architecture diagram", + page_idx=2, + bbox=[0.1, 0.2, 0.3, 0.4], + ) + img_b = _VisionImageAsset( + image_id="bbb222", + data=b"\x89PNGfake-png-bytes", + mime_type="image/png", + alt_text="", + page_idx=None, + bbox=None, + ) + + def _fake_docparser(*, source_bytes: bytes, extension: str, parser_config): + return ("# Title\n\nbody", [img_a, img_b]) + + monkeypatch.setattr(parser_module, "_docparser_extract_markdown", _fake_docparser) + + store = InMemoryObjectStore() + result = parse_document( + store=store, + collection_id="col-1", + document_id="doc-1", + source_bytes=b"%PDF-1.4 fake", + source_filename="report.pdf", + ) + + assert result.vision_image_count == 2 + expected_dir = derived_dir("col-1", "doc-1", result.parse_version) + expected_descriptor = f"{expected_dir}/vision/source.jsonl" + assert result.vision_source_path == expected_descriptor + assert store.obj_exists(expected_descriptor) + + image_a_path = f"{expected_dir}/vision/images/aaa111.jpg" + image_b_path = f"{expected_dir}/vision/images/bbb222.png" + assert store.obj_exists(image_a_path) + assert store.obj_exists(image_b_path) + assert store.get(image_a_path).read() == img_a.data + assert store.get(image_b_path).read() == img_b.data + + descriptor_body = store.get(expected_descriptor).read().decode("utf-8") + rows = [json.loads(line) for line in descriptor_body.splitlines() if line.strip()] + assert len(rows) == 2 + assert rows[0] == { + "image_id": "aaa111", + "image_path": image_a_path, + "mime_type": "image/jpeg", + "alt_text": "Figure 1: architecture diagram", + "page_idx": 2, + "bbox": [0.1, 0.2, 0.3, 0.4], + } + assert rows[1] == { + "image_id": "bbb222", + "image_path": image_b_path, + "mime_type": "image/png", + "alt_text": "", + "page_idx": None, + "bbox": None, + } + + +def test_parse_document_skips_vision_artefacts_when_no_images(monkeypatch: pytest.MonkeyPatch): + """A parsed document with no image assets should not land any + ``vision/`` artefacts and ``vision_source_path`` must stay empty + so the orchestrator's vision dispatch can short-circuit cleanly. + """ + + def _fake_docparser(*, source_bytes: bytes, extension: str, parser_config): + return ("# Title\n\nbody", []) + + monkeypatch.setattr(parser_module, "_docparser_extract_markdown", _fake_docparser) + + store = InMemoryObjectStore() + result = parse_document( + store=store, + collection_id="col-1", + document_id="doc-no-images", + source_bytes=b"fake doc", + source_filename="doc.docx", + ) + + assert result.vision_image_count == 0 + assert result.vision_source_path == "" + expected_dir = derived_dir("col-1", "doc-no-images", result.parse_version) + # No vision assets should have been written. + assert not store.obj_exists(f"{expected_dir}/vision/source.jsonl") + + +def test_parse_document_simulator_path_emits_no_vision_artefacts(): + """The text-only simulator path (markdown / txt) does not invoke + DocParser so no ``AssetBinPart`` extraction happens — the + ``ParseResult`` defaults must keep vision fields empty so existing + pre-Wave-5 callers see no behaviour change.""" + + store = InMemoryObjectStore() + result = parse_document( + store=store, + collection_id="col-1", + document_id="doc-md", + source_bytes=b"# Header\n\nBody.", + source_filename="notes.md", + ) + + assert result.vision_image_count == 0 + assert result.vision_source_path == "" + expected_dir = derived_dir("col-1", "doc-md", result.parse_version) + assert not store.obj_exists(f"{expected_dir}/vision/source.jsonl") + + +def test_parse_document_image_paths_use_mime_extension(monkeypatch: pytest.MonkeyPatch): + """The on-disk filename extension follows the asset's ``mime_type`` + so operators can spot-check the bytes directly without renaming.""" + + img = _VisionImageAsset( + image_id="ccc333", + data=b"GIF89a-fake", + mime_type="image/gif", + alt_text="", + page_idx=None, + bbox=None, + ) + + def _fake_docparser(*, source_bytes: bytes, extension: str, parser_config): + return ("# t", [img]) + + monkeypatch.setattr(parser_module, "_docparser_extract_markdown", _fake_docparser) + + store = InMemoryObjectStore() + result = parse_document( + store=store, + collection_id="col-1", + document_id="doc-gif", + source_bytes=b"fake", + source_filename="anim.gif", + ) + + expected_dir = derived_dir("col-1", "doc-gif", result.parse_version) + assert store.obj_exists(f"{expected_dir}/vision/images/ccc333.gif") + + +def test_parse_document_falls_back_to_bin_for_unknown_mime(monkeypatch: pytest.MonkeyPatch): + """Unknown MIMEs (e.g., ``image/heic``) fall back to ``.bin`` so + the asset is still persisted — vision worker uses ``imghdr`` on + the bytes at embed time and tolerates the generic extension.""" + + img = _VisionImageAsset( + image_id="ddd444", + data=b"heic-fake", + mime_type="image/heic", + alt_text="", + page_idx=None, + bbox=None, + ) + + def _fake_docparser(*, source_bytes: bytes, extension: str, parser_config): + return ("# t", [img]) + + monkeypatch.setattr(parser_module, "_docparser_extract_markdown", _fake_docparser) + + store = InMemoryObjectStore() + result = parse_document( + store=store, + collection_id="col-1", + document_id="doc-heic", + source_bytes=b"fake", + source_filename="phone.heic", + ) + + expected_dir = derived_dir("col-1", "doc-heic", result.parse_version) + assert store.obj_exists(f"{expected_dir}/vision/images/ddd444.bin") + + +def test_parse_document_image_only_input_still_lands_descriptor(monkeypatch: pytest.MonkeyPatch): + """An image-only input (e.g., a single PNG upload) produces no + markdown but should still land its image asset + descriptor so + the vision modality has bytes to embed. ``markdown.md`` is empty + on this path; ``vision_source_path`` is populated.""" + + img = _VisionImageAsset( + image_id="eee555", + data=b"single-png-bytes", + mime_type="image/png", + alt_text="cover photo", + page_idx=None, + bbox=None, + ) + + def _fake_docparser(*, source_bytes: bytes, extension: str, parser_config): + # No markdown emitted (image-only input) but one asset landed. + return ("", [img]) + + monkeypatch.setattr(parser_module, "_docparser_extract_markdown", _fake_docparser) + + store = InMemoryObjectStore() + result = parse_document( + store=store, + collection_id="col-1", + document_id="doc-single-img", + source_bytes=b"png-bytes", + source_filename="cover.png", + ) + + assert result.vision_image_count == 1 + expected_dir = derived_dir("col-1", "doc-single-img", result.parse_version) + assert store.obj_exists(f"{expected_dir}/vision/images/eee555.png") + assert store.obj_exists(f"{expected_dir}/vision/source.jsonl") + # Markdown still written (empty) so the artifact contract holds. + assert store.obj_exists(result.markdown_path) + assert store.get(result.markdown_path).read() == b"" From 7fe3f25d457b0d986de75be63d4a69e0b6baef80 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 18:52:41 +0800 Subject: [PATCH 12/13] feat(celery Wave 5 P2 chunk 3): Model.supports_multimodal_embedding capability flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per §G.2.5.1 spec amend item 3: surface a typed capability flag for embedding models that accept image bytes (CLIP / Voyage Multimodal / Jina v3 / OpenAI multimodal embeddings) so operators can register them via the v3 model platform UI. The flag drives the chunk 4b vision gate's `EmbeddingService.is_multimodal()` runtime check — flip it on the collection's embedder spec model and the gate self-disables, enabling vision modality. Distinct from existing `supports_vision` (chat/completion models that accept image input) — `supports_multimodal_embedding` describes embedding models that produce vectors from images. Changes: * `aperag/domains/model_platform/schemas.py`: new optional `supports_multimodal_embedding: bool = False` on `Model` / `ModelCreate` / `ModelUpdate` (default False, no behaviour change for existing rows). * `aperag/domains/model_platform/db/models.py`: new `supports_multimodal_embedding` Boolean column with default False. * `aperag/migration/versions/...f1c8d2a5b6e3...`: alembic migration adding the column with `server_default=false`. Forward-only cutover safe because pre-Wave-5 rows default to False (matches prior implicit behaviour). * `aperag/domains/model_platform/service/model_service.py`: `_model_to_schema` maps the new column. * `aperag/db/repositories/llm_provider.py`: `create_model` accepts the new field so the v3 `POST /models` flow persists it. * `aperag/llm/embed/base_embedding.py`: `get_embedding_service` prefers the typed `supports_multimodal_embedding` column; falls back to the legacy `runner_config["multimodal"]` JSON entry so pre-Wave-5 operators who edited the JSON keep working without re-saving the row. Tests: 3 new unit tests in `test_model_platform_v3_contract.py` covering default-False behaviour, opt-in True path, and v3 OpenAPI schema exposure on Model / ModelCreate / ModelUpdate. Production-readiness 三类: - must-be-real: real DB column + alembic migration + v3 API surface - may-be-gated: legacy `runner_config["multimodal"]` fallback for rows created before the typed column landed - fully-resolves: §G.2.5.1 spec item 3 (provider v3 multimodal capability flag UI) + Wave 5 P2 chunk 3 acceptance Phase 2 chunk 4 (callsite rewrite + chunk 4b gate self-disable verify + Layer 1/2 test rename) remains as the final Wave 5 blocker. Co-Authored-By: Claude Opus 4.7 --- aperag/db/repositories/llm_provider.py | 2 + aperag/domains/model_platform/db/models.py | 5 +++ aperag/domains/model_platform/schemas.py | 11 +++++ .../model_platform/service/model_service.py | 1 + aperag/llm/embed/base_embedding.py | 11 ++++- ...6e3_model_supports_multimodal_embedding.py | 41 ++++++++++++++++++ .../test_model_platform_v3_contract.py | 43 +++++++++++++++++++ 7 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 aperag/migration/versions/20260427030000-f1c8d2a5b6e3_model_supports_multimodal_embedding.py diff --git a/aperag/db/repositories/llm_provider.py b/aperag/db/repositories/llm_provider.py index dbe2a4a7e..00c15c871 100644 --- a/aperag/db/repositories/llm_provider.py +++ b/aperag/db/repositories/llm_provider.py @@ -286,6 +286,7 @@ async def create_model( embedding_dimensions: int | None = None, supports_vision: bool = False, supports_tool_calling: bool = False, + supports_multimodal_embedding: bool = False, extra: dict | None = None, ) -> Model: async def _operation(session): @@ -302,6 +303,7 @@ async def _operation(session): embedding_dimensions=embedding_dimensions, supports_vision=supports_vision, supports_tool_calling=supports_tool_calling, + supports_multimodal_embedding=supports_multimodal_embedding, extra=extra or {}, ) session.add(model) diff --git a/aperag/domains/model_platform/db/models.py b/aperag/domains/model_platform/db/models.py index 450c4c259..6aea57651 100644 --- a/aperag/domains/model_platform/db/models.py +++ b/aperag/domains/model_platform/db/models.py @@ -112,6 +112,11 @@ class Model(Base): embedding_dimensions = Column(Integer, nullable=True) supports_vision = Column(Boolean, default=False, nullable=False) supports_tool_calling = Column(Boolean, default=False, nullable=False) + # Wave 5 P2 chunk 3: typed capability flag for embedding models + # that accept image bytes; surfaced through the v3 model API + + # consumed by ``base_embedding.get_embedding_service`` to set + # ``EmbeddingService.multimodal=True``. + supports_multimodal_embedding = Column(Boolean, default=False, nullable=False) status = Column(_enum_column(ModelStatus), default=ModelStatus.ACTIVE.value, nullable=False, index=True) extra = Column(JSON, default=lambda: {}, nullable=False) gmt_created = Column(DateTime(timezone=True), default=utc_now, nullable=False) diff --git a/aperag/domains/model_platform/schemas.py b/aperag/domains/model_platform/schemas.py index 62251717c..1a11aecb0 100644 --- a/aperag/domains/model_platform/schemas.py +++ b/aperag/domains/model_platform/schemas.py @@ -106,6 +106,15 @@ class Model(BaseModel): embedding_dimensions: Optional[int] = None supports_vision: bool = False supports_tool_calling: bool = False + # Wave 5 P2 chunk 3 (per §G.2.5.1 spec amend item 3): a typed + # capability flag for embedding models that accept image bytes + # (CLIP / Voyage Multimodal / Jina v3 / OpenAI multimodal + # embeddings / etc.) and produce a single vector. Distinct from + # ``supports_vision`` which describes chat/completion models that + # accept images as input. Drives the chunk 4b vision gate's + # ``EmbeddingService.is_multimodal()`` runtime check — flip on the + # collection's embedder spec model and the gate self-disables. + supports_multimodal_embedding: bool = False status: Literal["ACTIVE", "INACTIVE"] = "ACTIVE" extra: dict[str, Any] = Field(default_factory=dict) created: Optional[datetime] = None @@ -125,6 +134,7 @@ class ModelCreate(BaseModel): embedding_dimensions: Optional[int] = None supports_vision: bool = False supports_tool_calling: bool = False + supports_multimodal_embedding: bool = False extra: dict[str, Any] = Field(default_factory=dict) @@ -139,6 +149,7 @@ class ModelUpdate(BaseModel): embedding_dimensions: Optional[int] = None supports_vision: Optional[bool] = None supports_tool_calling: Optional[bool] = None + supports_multimodal_embedding: Optional[bool] = None status: Optional[Literal["ACTIVE", "INACTIVE"]] = None extra: Optional[dict[str, Any]] = None diff --git a/aperag/domains/model_platform/service/model_service.py b/aperag/domains/model_platform/service/model_service.py index 89d55bf9e..036ddd5ac 100644 --- a/aperag/domains/model_platform/service/model_service.py +++ b/aperag/domains/model_platform/service/model_service.py @@ -103,6 +103,7 @@ def _model_to_schema(model) -> Model: embedding_dimensions=model.embedding_dimensions, supports_vision=model.supports_vision, supports_tool_calling=model.supports_tool_calling, + supports_multimodal_embedding=getattr(model, "supports_multimodal_embedding", False) or False, status=model.status, extra=model.extra or {}, created=model.gmt_created, diff --git a/aperag/llm/embed/base_embedding.py b/aperag/llm/embed/base_embedding.py index 72ae5efdd..644ee9382 100644 --- a/aperag/llm/embed/base_embedding.py +++ b/aperag/llm/embed/base_embedding.py @@ -50,13 +50,22 @@ def get_embedding_service(model_id: str, user_id: str) -> tuple[EmbeddingService provider = invocation.runner_config.get("provider") if not provider: provider = "openai" if invocation.runner_type == "openai_compatible" else invocation.provider_type + # Wave 5 P2 chunk 3 (per §G.2.5.1 spec amend item 3): prefer the + # typed ``Model.supports_multimodal_embedding`` column when present, + # fall back to the legacy ``runner_config["multimodal"]`` JSON dict + # entry so collections / models created before the typed column + # landed keep working (operators who edited the JSON keep the same + # behaviour without re-saving the model row). + multimodal = bool(getattr(model, "supports_multimodal_embedding", False)) or bool( + invocation.runner_config.get("multimodal", False) + ) embedding_svc = EmbeddingService( embedding_provider=provider, embedding_model=invocation.provider_model_id, embedding_service_url=invocation.base_url, embedding_service_api_key=invocation.api_key, embedding_max_chunks_in_batch=settings.embedding_max_chunks_in_batch, - multimodal=invocation.runner_config.get("multimodal", False), + multimodal=multimodal, ) return embedding_svc, _get_embedding_dimension(embedding_svc, model_id, invocation.embedding_dimensions) diff --git a/aperag/migration/versions/20260427030000-f1c8d2a5b6e3_model_supports_multimodal_embedding.py b/aperag/migration/versions/20260427030000-f1c8d2a5b6e3_model_supports_multimodal_embedding.py new file mode 100644 index 000000000..c71773310 --- /dev/null +++ b/aperag/migration/versions/20260427030000-f1c8d2a5b6e3_model_supports_multimodal_embedding.py @@ -0,0 +1,41 @@ +"""add ``supports_multimodal_embedding`` column to ``model`` + +Wave 5 P2 chunk 3 (per §G.2.5.1 spec amend item 3): a typed +capability flag on ``Model`` for embedding models that accept image +bytes (CLIP / Voyage Multimodal / Jina v3 / OpenAI multimodal +embeddings / etc.) and produce a single vector. Distinct from +``supports_vision`` which describes chat/completion models that +accept images as input. Drives the chunk 4b vision gate's +``EmbeddingService.is_multimodal()`` runtime check — flip on the +collection's embedder spec model and the gate self-disables. + +Revision ID: f1c8d2a5b6e3 +Revises: e7a3b9c2d1f6 +Create Date: 2026-04-27 03:00:00.000000 +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "f1c8d2a5b6e3" +down_revision: Union[str, None] = "e7a3b9c2d1f6" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column( + "model", + sa.Column( + "supports_multimodal_embedding", + sa.Boolean(), + nullable=False, + server_default=sa.text("false"), + ), + ) + + +def downgrade() -> None: + op.drop_column("model", "supports_multimodal_embedding") diff --git a/tests/unit_test/test_model_platform_v3_contract.py b/tests/unit_test/test_model_platform_v3_contract.py index ff1e906f2..294481e5d 100644 --- a/tests/unit_test/test_model_platform_v3_contract.py +++ b/tests/unit_test/test_model_platform_v3_contract.py @@ -76,3 +76,46 @@ def test_model_provider_and_model_use_are_explicit(): assert provider.supported_capabilities == [ModelCapability.CHAT, ModelCapability.EMBEDDING] assert ModelUseScenario.AGENT_CHAT.value == "agent_chat" + + +def test_model_create_supports_multimodal_embedding_flag_defaults_false(): + """Wave 5 P2 chunk 3 (per §G.2.5.1 spec amend item 3): the + ``supports_multimodal_embedding`` capability flag defaults to + False so existing model rows / pre-Wave-5 callers see no + behaviour change.""" + model = ModelCreate( + account_id="acct_1", + provider_model_id="text-embedding-3-large", + display_name="OpenAI text-embedding-3-large", + capability=ModelCapability.EMBEDDING, + ) + assert model.supports_multimodal_embedding is False + + +def test_model_create_supports_multimodal_embedding_flag_can_be_set(): + """Operators register a real multimodal embedder (Voyage Multimodal / + CLIP / Jina v3 / etc.) by setting this flag — it surfaces through + the v3 ``/models`` route so a UI can render a checkbox.""" + model = ModelCreate( + account_id="acct_1", + provider_model_id="voyage-multimodal-3", + display_name="Voyage Multimodal 3", + capability=ModelCapability.EMBEDDING, + supports_multimodal_embedding=True, + ) + assert model.supports_multimodal_embedding is True + + +def test_model_v3_openapi_exposes_supports_multimodal_embedding(): + """The v3 routes' OpenAPI schema must surface the new capability + flag so the API contract stays discoverable for UI clients.""" + spec = _provider_v3_spec() + schemas = spec["components"]["schemas"] + # All three model schemas (read / create / update) must carry + # the field so a UI can read + modify it consistently. + for schema_name in ("Model", "ModelCreate", "ModelUpdate"): + assert schema_name in schemas, f"{schema_name} schema missing from v3 OpenAPI" + properties = schemas[schema_name].get("properties", {}) + assert "supports_multimodal_embedding" in properties, ( + f"{schema_name} missing supports_multimodal_embedding capability flag" + ) From 8e7c6ecd21f466c949a2cff0444cff4ccb97e42f Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 19:00:16 +0800 Subject: [PATCH 13/13] feat(celery Wave 5 P2 chunk 4): vision callsite rewrite + chunk 4b gate self-disable verify MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per §G.2.5.1 spec amend final piece: rewire `_build_vision_worker._embed` to call `EmbeddingService.embed_image(image_bytes, alt_text)` (chunk 1) with the actual image bytes the parser persisted (chunk 2 wrote `derived/parse_/vision/images/.` + a JSONL descriptor at `vision/source.jsonl`). The chunk 4b vision gate self-disables when the operator flips `Model.supports_multimodal_embedding=True` (chunk 3). `aperag/indexing/worker_factory.py`: * `_embed(image_id, alt_text, image_bytes=None)` — when image_bytes is provided, route to `embedding_service.embed_image(image_bytes, alt_text)`. None falls back to the legacy text-concat path so the T1 simulator + tests that hand the worker synthetic JSON keep working. * Gate-raise message reframed: drops "Wave 4 wiring" phrasing (now Wave 5 wiring is land), names the typed `Model.supports_multimodal_embedding` flag so an operator can fix the config directly. `aperag/indexing/vision.py`: * `VisionModality.derive` accepts both source-path formats: the legacy single-JSON-array shape (T1 simulator / pre-Wave-5 tests) AND the new JSONL-with-image-path shape (parser chunk 2 output). Format detection is by first non-whitespace byte (`[` → JSON array, else → JSONL). * `_load_image_bytes(record)` reads the descriptor's `image_path` through the object store; missing blob logs a warning and returns None (embedder still runs on the alt-text/id placeholder digest) so a partial parser write doesn't block the whole derive cycle. * `_placeholder_embedding(..., image_bytes=None)` mirrors the new embedder signature; placeholder ignores bytes. * Embedder Protocol is widened: `(image_id, alt_text, image_bytes=None)`. `tests/integration/test_full_indexing_pipeline.py`: * Renamed Layer 1 `test_phase1_vision_modality_raises_wave4_wiring_gate` → `..._gate_raises_when_embedder_not_multimodal`. Asserts the reframed message names `multimodal embedding model` + `supports_multimodal_embedding` flag. * New positive-path Layer 1 `test_phase1_vision_modality_gate_self_disables_when_embedder_multimodal`: with `is_multimodal()=True`, the factory builds a vision worker without raising — pins the chunk 4 gate-self-disable contract. * Layer 2 e2e assertion: vision modality may be ACTIVE (when CI fixture has multimodal embedder configured) OR FAILED with a gate marker including `supports_multimodal_embedding`. OR-on-marker tolerance kept for transition state. `tests/unit_test/indexing/test_t1_4_summary_vision.py`: 3 new tests covering JSONL descriptor with image_path (bytes loaded + forwarded), missing-blob graceful fallback, and legacy JSON-array backward compat. Production-readiness 三类: - must-be-real: real `embed_image` callsite + real bytes load from parser's descriptor - may-be-gated: legacy text-concat fallback + simulator JSON format preserved for tests - fully-resolves: §G.2.5.1 spec items 1+2+3 all wired end-to-end + chunk 4b vision gate self-disable verify (Wave 5 P2 closure) Wave 5 P2 (T7 multimodal vision-LLM 3-item bundle) closed. The chunk 4b vision gate self-disables when an operator configures a multimodal embedder; default-off behaviour preserved for text-only collections. Co-Authored-By: Claude Opus 4.7 --- aperag/indexing/vision.py | 100 +++++++++++-- aperag/indexing/worker_factory.py | 29 ++-- .../test_full_indexing_pipeline.py | 121 +++++++++++++--- .../indexing/test_t1_4_summary_vision.py | 131 ++++++++++++++++++ 4 files changed, 334 insertions(+), 47 deletions(-) diff --git a/aperag/indexing/vision.py b/aperag/indexing/vision.py index 3d17598ca..783339a4c 100644 --- a/aperag/indexing/vision.py +++ b/aperag/indexing/vision.py @@ -112,7 +112,21 @@ def points_for_document(self, document_id: str, parse_version: str | None = None return sorted(out, key=lambda r: r["point_id"]) -def _placeholder_embedding(image_id: str, alt_text: str, dim: int = SIMULATOR_VISION_EMBEDDING_DIM) -> list[float]: +def _placeholder_embedding( + image_id: str, + alt_text: str, + image_bytes: bytes | None = None, + dim: int = SIMULATOR_VISION_EMBEDDING_DIM, +) -> list[float]: + """Deterministic synthetic embedding for tests + the simulator. + + Wave 5 P2 chunk 4: the optional ``image_bytes`` parameter mirrors + the production embedder signature (real multimodal embedders consume + bytes). The placeholder ignores it — the digest is over + ``image_id|alt_text`` so re-running the simulator produces the same + vector for the same record. + """ + del image_bytes # placeholder ignores actual bytes digest = hashlib.sha256(f"{image_id}|{alt_text}".encode("utf-8")).digest() repeat = (dim + len(digest) - 1) // len(digest) expanded = (digest * repeat)[:dim] @@ -144,11 +158,21 @@ async def derive( ) -> DeriveResult: """Extract images, run vision-LLM, persist manifest atomically. - T1 simulator contract: ``source_path`` is a JSON file in the - object store containing ``[{"image_id": ..., "alt_text": ..., - "page_idx": int|None, "bbox": [...]|None}, ...]``. The real - T2.x pipeline replaces this with PDF extraction; the - ``manifest.jsonl`` schema is the contract that must not change. + Two source-path formats are supported (the production parser + wrote the second one starting Wave 5 P2 chunk 2): + + * **Legacy simulator** (single JSON array): a JSON list at + ``source_path`` holding ``[{"image_id": ..., "alt_text": ..., + "page_idx": int|None, "bbox": [...]|None}, ...]``. No image + bytes — the embedder gets ``image_bytes=None`` and falls back + to the alt-text/id placeholder digest. + * **Wave 5 production descriptor** (JSONL with ``image_path``): + one record per line at ``derived/parse_/vision/source.jsonl`` + with ``{"image_id", "image_path", "mime_type", "alt_text", + "page_idx", "bbox"}``. The worker loads each image's bytes + from ``image_path`` and hands them to the embedder so a real + multimodal embedding model can produce a real visual vector + (Wave 5 P2 chunk 4 callsite rewrite). """ body = read_or_none(self._store, source_path) if body is None: @@ -158,12 +182,7 @@ async def derive( ) return DeriveResult(derived_artifact_path="") - try: - image_records = json.loads(body) - except json.JSONDecodeError as exc: - raise ValueError(f"vision.derive expected JSON list at {source_path}, got non-JSON: {exc}") from exc - if not isinstance(image_records, list): - raise ValueError(f"vision.derive expected JSON list of image records, got {type(image_records).__name__}") + image_records = self._parse_descriptor(body, source_path=source_path) parts = source_path.split("/") try: @@ -179,7 +198,8 @@ async def derive( for record in image_records: image_id = record["image_id"] alt_text = record.get("alt_text", "") - embedding = self._embedder(image_id, alt_text) + image_bytes = self._load_image_bytes(record) + embedding = self._embedder(image_id, alt_text, image_bytes=image_bytes) entry = { "image_id": image_id, "alt_text": alt_text, @@ -251,6 +271,60 @@ async def sync( payload=payload, ) + def _parse_descriptor(self, body: bytes, *, source_path: str) -> list[dict]: + """Decode the source-image descriptor file. + + Tolerates both the legacy single-JSON-array shape and the + Wave 5 P2 chunk 2 JSONL-with-image-path shape (parser writes + the latter). The first byte of body picks the format: ``[`` + means a JSON array, anything else is JSONL. + """ + text = body.decode("utf-8") + stripped = text.lstrip() + if stripped.startswith("["): + try: + records = json.loads(text) + except json.JSONDecodeError as exc: + raise ValueError(f"vision.derive expected JSON list at {source_path}, got non-JSON: {exc}") from exc + if not isinstance(records, list): + raise ValueError(f"vision.derive expected JSON list of image records, got {type(records).__name__}") + return [record for record in records if isinstance(record, dict)] + records: list[dict] = [] + for line in text.splitlines(): + if not line.strip(): + continue + try: + record = json.loads(line) + except json.JSONDecodeError as exc: + raise ValueError(f"vision.derive expected JSONL at {source_path}, malformed line: {exc}") from exc + if not isinstance(record, dict): + raise ValueError(f"vision.derive expected JSONL records to be objects, got {type(record).__name__}") + records.append(record) + return records + + def _load_image_bytes(self, record: dict) -> bytes | None: + """Load image bytes from the descriptor's ``image_path`` if + present. + + Returns ``None`` for legacy simulator records (no ``image_path`` + field) so the embedder falls back to its non-bytes path. A + missing-blob error logs but doesn't raise — the embedder still + runs with ``image_bytes=None`` so a partial parser write doesn't + block the whole derive cycle. + """ + image_path = record.get("image_path") + if not image_path: + return None + body = read_or_none(self._store, str(image_path)) + if body is None: + logger.warning( + "vision.derive: image_path %s referenced in descriptor but blob missing; " + "embedder falls back to text-only path", + image_path, + ) + return None + return body + __all__ = [ "VisionModality", diff --git a/aperag/indexing/worker_factory.py b/aperag/indexing/worker_factory.py index c1f76f54d..b7378f44c 100644 --- a/aperag/indexing/worker_factory.py +++ b/aperag/indexing/worker_factory.py @@ -442,23 +442,28 @@ def _build_vision_worker(*, collection: Any, object_store: Any) -> ModalityWorke embedding_service, vector_size = get_collection_embedding_service_sync(collection) if not embedding_service.is_multimodal(): raise WorkerFactoryError( - "vision modality requires a real multimodal vision-LLM (Wave 4 wiring); " - "current text-only embedder produces fake string-concat vision vectors — " - "set collection.config.enable_vision=false until Wave 4 lands " - "OR configure a multimodal embedding model on the collection's embedding spec" + "vision modality requires a multimodal embedding model " + "(set Model.supports_multimodal_embedding=True on the collection's " + "embedder spec — Voyage Multimodal / Jina v3 / OpenAI multimodal / etc.) " + "OR set collection.config.enable_vision=false to keep the modality off" ) qdrant_collection = generate_vector_db_collection_name(collection.id) adaptor = get_vector_db_connector(qdrant_collection, vector_size=vector_size) backend = _QdrantPointBackend(connector=adaptor.connector) - def _embed(image_id: str, alt_text: str) -> list[float]: - # ``is_multimodal()`` gate above only verifies that operators - # explicitly opted into a multimodal embedder. The body is - # still the Wave 3 string-concat placeholder until T7 replaces - # it with a real image-bytes path (load image from object - # store → multimodal embed); ``embed_query`` of an alt-text - # surrogate is not actual visual indexing. - return embedding_service.embed_query(f"{image_id}|{alt_text}") + def _embed(image_id: str, alt_text: str, image_bytes: bytes | None = None) -> list[float]: + # Wave 5 P2 chunk 4: real callsite rewrite — load the actual + # image bytes from the parser's descriptor (chunk 2 writes + # ``vision/source.jsonl`` with ``image_path``) and call the + # multimodal-aware ``embed_image`` API surface (chunk 1). + # ``image_bytes=None`` only happens on the legacy simulator + # path used by tests — fall back to the text-only embedding so + # those fixtures keep working without standing up real image + # bytes. The chunk 4b gate above already prevents a non-multi- + # modal embedder from reaching this body. + if image_bytes is None: + return embedding_service.embed_query(f"{image_id}|{alt_text}") + return embedding_service.embed_image(image_bytes=image_bytes, alt_text=alt_text) return VisionModality(backend=backend, store=object_store, embedder=_embed) diff --git a/tests/integration/test_full_indexing_pipeline.py b/tests/integration/test_full_indexing_pipeline.py index 6651d22a4..d95db1343 100644 --- a/tests/integration/test_full_indexing_pipeline.py +++ b/tests/integration/test_full_indexing_pipeline.py @@ -211,18 +211,19 @@ async def _run() -> None: engine.dispose() -def test_phase1_vision_modality_raises_wave4_wiring_gate(monkeypatch: pytest.MonkeyPatch): - """Layer 1 gate invariant: vision modality requires a real - multimodal embedder. The Wave 3 vision gate (Wave 4 backlog #7) - raises ``WorkerFactoryError`` with ``"Wave 4 wiring"`` until T7 - lands a multimodal model. Phase 1 smoke pins this — Phase 2 (after - T7) flips it to ACTIVE assertion. +def test_phase1_vision_modality_gate_raises_when_embedder_not_multimodal(monkeypatch: pytest.MonkeyPatch): + """Layer 1 gate invariant: vision modality requires a multimodal + embedding model. When ``EmbeddingService.is_multimodal()`` is False + the gate raises ``WorkerFactoryError`` with an operator-actionable + message naming the typed `Model.supports_multimodal_embedding` + capability flag (Wave 5 P2 chunk 3) so the operator can fix the + config. + + Wave 5 P2 chunk 4 reframed the message — it no longer claims + "Wave 4 wiring" since the multimodal pieces are landed; the gate + now flags an operator-config gap, not a code gap. """ - # Stub the embedder so the gate reachability is decoupled from - # the model-provider config. The gate compares - # ``embedding_service.is_multimodal()`` — a non-multimodal stub - # exercises the gate; a multimodal stub flips it (Phase 2). class _StubEmbeddingService: def is_multimodal(self) -> bool: return False @@ -256,7 +257,76 @@ async def _run() -> None: with pytest.raises(WorkerFactoryError) as exc: await factory(payload) msg = str(exc.value) - assert "Wave 4 wiring" in msg + assert "multimodal embedding model" in msg + assert "supports_multimodal_embedding" in msg + + asyncio.run(_run()) + finally: + engine.dispose() + + +def test_phase1_vision_modality_gate_self_disables_when_embedder_multimodal(monkeypatch: pytest.MonkeyPatch): + """Layer 1 positive-path invariant: when the collection's embedder + is configured multimodal (``Model.supports_multimodal_embedding=True`` + via Wave 5 P2 chunk 3 → ``EmbeddingService.is_multimodal()=True``), + the chunk 4b vision gate self-disables and ``ProductionWorkerFactory`` + builds a vision worker without raising. + + Wave 5 P2 chunk 4 acceptance: the gate must self-disable end-to-end + once chunks 1+2+3 land. Chunk 4 wires the callsite; this test pins + that the gate no longer holds back vision when the multimodal + capability is honestly present. + """ + + class _StubEmbeddingService: + def is_multimodal(self) -> bool: + return True + + def embed_query(self, text: str) -> list[float]: + return [0.0] + + def embed_image(self, *, image_bytes: bytes, alt_text: str = "") -> list[float]: + return [0.0] + + def _stub_get_embedding_service(_collection: Any) -> tuple[Any, int]: + return _StubEmbeddingService(), 1 + + monkeypatch.setattr( + "aperag.llm.embed.base_embedding.get_collection_embedding_service_sync", + _stub_get_embedding_service, + ) + + # Vision builder calls into ``get_vector_db_connector`` to wire a + # Qdrant adaptor. Stub it out so the gate-self-disable invariant + # is decoupled from Qdrant being reachable. + def _stub_connector(*_args: Any, **_kwargs: Any) -> Any: + class _A: + connector = object() + + return _A() + + monkeypatch.setattr( + "aperag.config.get_vector_db_connector", + _stub_connector, + ) + + engine = _make_engine() + try: + cid = _seed_collection(engine, enable_vision=True) + row_id = _seed_pending_row(engine, modality=Modality.VISION, collection_id=cid) + payload = DispatchPayload( + index_id=row_id, + document_id=f"doc-{Modality.VISION.value}-phase1-active", + parse_version="parse-v1", + modality=Modality.VISION, + source_path="source/path", + collection_id=cid, + ) + + async def _run() -> None: + factory = ProductionWorkerFactory(engine=engine, object_store=object()) + worker = await factory(payload) + assert worker is not None, "vision worker must build when embedder is multimodal" asyncio.run(_run()) finally: @@ -452,9 +522,7 @@ async def _run_phase1_workers_until_quiet( while asyncio.get_event_loop().time() < deadline: with Session(engine) as session: rows = list( - session.execute( - sa_select(DocumentIndex).where(DocumentIndex.document_id == document_id) - ).scalars() + session.execute(sa_select(DocumentIndex).where(DocumentIndex.document_id == document_id)).scalars() ) if not rows: await asyncio.sleep(0.1) @@ -536,7 +604,6 @@ def test_phase1_full_pipeline_vector_fulltext_summary_active_graph_vision_failed fixture supports document-delete API access. """ - from aperag.indexing.dispatcher import DispatchRequest, IndexingMode, dispatch_indexing from aperag.indexing.parser import ParseConfig, parse_document from aperag.objectstore.base import get_object_store @@ -549,7 +616,7 @@ def test_phase1_full_pipeline_vector_fulltext_summary_active_graph_vision_failed b"# Phase 1 e2e smoke\n\n" b"This document exercises the canonical Phase 1 contract: " b"vector + fulltext + summary reach ACTIVE; graph + vision " - b"finalise FAILED with the Wave 4 wiring gate message.\n" + b"finalise per the collection's gate state.\n" ) async def _run() -> None: @@ -605,16 +672,28 @@ async def _run() -> None: ) assert row.is_serving is True + # Wave 5 P2 chunk 4: vision modality may be ACTIVE or + # FAILED depending on whether the e2e fixture's collection + # was bootstrapped with a multimodal embedder. Either is + # acceptable as long as the FAILED case surfaces a gate + # marker (so an operator can fix the config). Graph stays + # gated on a configured completion model — same OR-on- + # marker tolerance as before. for modality in (Modality.GRAPH, Modality.VISION): row = finalised[modality] + if modality is Modality.VISION and row.status == IndexStatus.ACTIVE.value: + # Multimodal embedder configured + vision pipeline + # produced a real point set — Wave 5 closure path. + assert row.is_serving is True + continue assert row.status == IndexStatus.FAILED.value, ( - f"modality={modality.value} must finalise FAILED until Wave 5 T7 lands; " - f"actual={row.status}" + f"modality={modality.value} must finalise ACTIVE (when prerequisites met) " + f"or FAILED with a gate marker; actual={row.status}" ) msg = row.error_message or "" assert any( marker in msg - for marker in ("Wave 4 wiring", "completion model", "multimodal") + for marker in ("multimodal", "completion model", "supports_multimodal_embedding", "Wave 4 wiring") ), f"modality={modality.value} FAILED message must surface a gate marker; got {msg!r}" finally: engine.dispose() @@ -657,9 +736,7 @@ async def _run() -> None: from aperag.indexing.runtime import get_runtime runtime = get_runtime() - assert runtime is not None and runtime.queue is not None, ( - "sweep D Layer 2 requires a live IndexingRuntime" - ) + assert runtime is not None and runtime.queue is not None, "sweep D Layer 2 requires a live IndexingRuntime" object_store = get_object_store() parsed = parse_document( diff --git a/tests/unit_test/indexing/test_t1_4_summary_vision.py b/tests/unit_test/indexing/test_t1_4_summary_vision.py index 8d7d9bb66..dae76d7ea 100644 --- a/tests/unit_test/indexing/test_t1_4_summary_vision.py +++ b/tests/unit_test/indexing/test_t1_4_summary_vision.py @@ -244,6 +244,137 @@ def test_vision_modality_enum_is_vision(): assert VisionModality.modality is Modality.VISION +def test_vision_derive_consumes_jsonl_descriptor_with_image_path(): + """Wave 5 P2 chunk 4: when the parser writes the new descriptor + (`vision/source.jsonl`) with an ``image_path`` per record, vision + derive must (a) load each image's bytes from the path, (b) hand + them to the embedder via ``image_bytes=``, (c) still emit one + manifest line per record.""" + + store = InMemoryObjectStore() + + # Stage two image blobs at canonical paths. + image_a_path = "collections/col-1/documents/doc-vision-jsonl/derived/parse_v1/vision/images/img-a.jpg" + image_b_path = "collections/col-1/documents/doc-vision-jsonl/derived/parse_v1/vision/images/img-b.png" + write_atomic(store, image_a_path, b"\xff\xd8\xff\xe0fake-jpeg") + write_atomic(store, image_b_path, b"\x89PNGfake-png") + + # Stage the JSONL descriptor pointing at the blobs. + descriptor_path = "collections/col-1/documents/doc-vision-jsonl/derived/parse_v1/vision/source.jsonl" + descriptor_lines = [ + json.dumps( + { + "image_id": "img-a", + "image_path": image_a_path, + "mime_type": "image/jpeg", + "alt_text": "banner", + "page_idx": 0, + "bbox": [0, 0, 100, 100], + } + ), + json.dumps( + { + "image_id": "img-b", + "image_path": image_b_path, + "mime_type": "image/png", + "alt_text": "", + "page_idx": None, + "bbox": None, + } + ), + ] + write_atomic(store, descriptor_path, ("\n".join(descriptor_lines) + "\n").encode("utf-8")) + + # Capture what the embedder sees so we can assert image bytes were + # actually loaded and forwarded. + seen: list[tuple[str, str, bytes | None]] = [] + + def _capturing_embedder(image_id: str, alt_text: str, image_bytes: bytes | None = None) -> list[float]: + seen.append((image_id, alt_text, image_bytes)) + return [0.1, 0.2, 0.3] + + modality = VisionModality(backend=InMemoryVisionBackend(), store=store, embedder=_capturing_embedder) + result = asyncio.run( + modality.derive( + document_id="doc-vision-jsonl", + parse_version="v1", + source_path=descriptor_path, + ) + ) + + assert result.derived_artifact_path.endswith("vision/manifest.jsonl") + assert len(seen) == 2 + a_seen = next(rec for rec in seen if rec[0] == "img-a") + b_seen = next(rec for rec in seen if rec[0] == "img-b") + assert a_seen[2] == b"\xff\xd8\xff\xe0fake-jpeg" + assert b_seen[2] == b"\x89PNGfake-png" + + +def test_vision_derive_falls_back_when_image_blob_missing(): + """If the descriptor references an ``image_path`` that does not + exist (parser write was interrupted / object store eviction), the + embedder still runs with ``image_bytes=None`` so the partial-derive + state surfaces a manifest the operator can inspect rather than the + whole derive cycle exploding.""" + + store = InMemoryObjectStore() + descriptor_path = "collections/col-1/documents/doc-broken/derived/parse_v1/vision/source.jsonl" + record = { + "image_id": "img-missing", + "image_path": "collections/col-1/documents/doc-broken/derived/parse_v1/vision/images/img-missing.jpg", + "mime_type": "image/jpeg", + "alt_text": "stale", + "page_idx": None, + "bbox": None, + } + write_atomic(store, descriptor_path, (json.dumps(record) + "\n").encode("utf-8")) + + seen: list[bytes | None] = [] + + def _capturing_embedder(image_id: str, alt_text: str, image_bytes: bytes | None = None) -> list[float]: + seen.append(image_bytes) + return [0.0] + + modality = VisionModality(backend=InMemoryVisionBackend(), store=store, embedder=_capturing_embedder) + asyncio.run( + modality.derive( + document_id="doc-broken", + parse_version="v1", + source_path=descriptor_path, + ) + ) + + assert seen == [None], "missing image blob → embedder receives image_bytes=None fallback" + + +def test_vision_derive_legacy_simulator_format_still_works(): + """Backward-compat: a legacy single-JSON-array source file (the + pre-Wave-5 simulator shape) should keep working. ``image_bytes`` + is None for every record (no ``image_path`` field) and the + placeholder embedder produces a deterministic vector.""" + + store = InMemoryObjectStore() + images = [ + {"image_id": "img-1", "alt_text": "x", "page_idx": 0, "bbox": None}, + {"image_id": "img-2", "alt_text": "y", "page_idx": 1, "bbox": None}, + ] + source_path = _seed_vision_source(store, document_id="doc-legacy", payload=images) + modality = VisionModality(backend=InMemoryVisionBackend(), store=store) + + result = asyncio.run( + modality.derive( + document_id="doc-legacy", + parse_version="v1", + source_path=source_path, + ) + ) + body = store.get(result.derived_artifact_path).read().decode("utf-8") + lines = [json.loads(line) for line in body.splitlines() if line.strip()] + assert {entry["image_id"] for entry in lines} == {"img-1", "img-2"} + for entry in lines: + assert entry["embedding"] + + def test_vision_payload_carries_modality_discriminator_and_image_id(): store = InMemoryObjectStore() images = [{"image_id": "img-only", "alt_text": "only", "page_idx": None, "bbox": None}]