apecloud
diff --git a/‎aperag/indexing/__init__.py‎
Lines changed: 171 additions & 0 deletions b/‎aperag/indexing/__init__.py‎
Lines changed: 171 additions & 0 deletions
diff --git a/‎aperag/indexing/base.py‎
Lines changed: 114 additions & 0 deletions b/‎aperag/indexing/base.py‎
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,171 @@
+# Copyright 2025 ApeCloud, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Phase celery indexing redesign — public surface.
+
+Per ``docs/modularization/indexing-redesign-design-pack.md``. T1.1
+Foundation lands the schema (``DocumentIndex`` + Modality / IndexStatus
+enums), the ``ModalityWorker`` ABC that downstream T1.2-T1.5 modality
+files implement, the atomic-write object-store helpers (§C.7), and a
+deterministic parser entry point that produces the shared
+``markdown.md`` / ``outline.json`` / ``chunks.jsonl`` artifacts
+(§C.1 / §C.6).
+"""
+
+from aperag.indexing.base import DeriveResult, ModalityWorker
+from aperag.indexing.fulltext import (
+    FulltextBackend,
+    FulltextModality,
+    InMemoryFulltextBackend,
+)
+from aperag.indexing.graph import (
+    KG_ARTIFACT_FILENAME,
+    DescriptionPart,
+    EntityLock,
+    EntityRecord,
+    EntityWithLineage,
+    GraphExtractor,
+    GraphModalityWorker,
+    InMemoryEntityLock,
+    InMemoryLineageGraphStore,
+    LineageGraphStore,
+    LineageMember,
+    RedisEntityLock,
+    RelationRecord,
+    RelationWithLineage,
+    parse_kg_jsonl,
+    serialize_kg_jsonl,
+)
+from aperag.indexing.models import DocumentIndex, IndexStatus, Modality
+from aperag.indexing.object_store import (
+    InMemoryObjectStore,
+    derived_artifact,
+    derived_dir,
+    read_or_none,
+    read_or_none_async,
+    source_artifact,
+    write_atomic,
+    write_atomic_async,
+)
+from aperag.indexing.observability import (
+    INDEX_FAILURE_METRIC,
+    INDEX_LAG_METRIC,
+    INDEX_SUCCESS_METRIC,
+    QUEUE_DEPTH_METRIC,
+    WORKER_UTILIZATION_METRIC,
+    InMemoryMetricsEmitter,
+    MetricsEmitter,
+    NoopMetricsEmitter,
+    emit_index_failure,
+    emit_index_lag,
+    emit_index_success,
+    emit_queue_depth,
+    emit_worker_utilization,
+)
+from aperag.indexing.parser import (
+    DEFAULT_CHUNK_OVERLAP,
+    DEFAULT_CHUNK_SIZE,
+    DEFAULT_PARSER_PIPELINE,
+    ChunkingConfig,
+    ParseConfig,
+    ParseResult,
+    parse_document,
+    read_chunks,
+)
+from aperag.indexing.summary import (
+    InMemorySummaryBackend,
+    SummaryBackend,
+    SummaryModality,
+)
+from aperag.indexing.vector import (
+    InMemoryVectorBackend,
+    VectorBackend,
+    VectorModality,
+)
+from aperag.indexing.vision import (
+    InMemoryVisionBackend,
+    VisionBackend,
+    VisionModality,
+)
+
+__all__ = [
+    # Schema (document-index model plus its Modality / IndexStatus enums)
+    "DocumentIndex",
+    "Modality",
+    "IndexStatus",
+    # Worker ABC and the result type returned by its derive()
+    "ModalityWorker",
+    "DeriveResult",
+    # Object store helpers (atomic-write contract, §C.7)
+    "derived_dir",
+    "derived_artifact",
+    "source_artifact",
+    "write_atomic",
+    "write_atomic_async",
+    "read_or_none",
+    "read_or_none_async",
+    "InMemoryObjectStore",
+    # Parser (shared markdown / outline / chunks artifacts, §C.1 / §C.6)
+    "ChunkingConfig",
+    "ParseConfig",
+    "ParseResult",
+    "parse_document",
+    "read_chunks",
+    "DEFAULT_PARSER_PIPELINE",
+    "DEFAULT_CHUNK_SIZE",
+    "DEFAULT_CHUNK_OVERLAP",
+    # Modality workers and their backends (T1.2 graph / T1.3 / T1.4)
+    "VectorModality",
+    "VectorBackend",
+    "InMemoryVectorBackend",
+    "FulltextModality",
+    "FulltextBackend",
+    "InMemoryFulltextBackend",
+    "GraphModalityWorker",
+    "LineageGraphStore",
+    "InMemoryLineageGraphStore",
+    "EntityLock",
+    "InMemoryEntityLock",
+    "RedisEntityLock",
+    "LineageMember",
+    "DescriptionPart",
+    "EntityRecord",
+    "RelationRecord",
+    "EntityWithLineage",
+    "RelationWithLineage",
+    "GraphExtractor",
+    "KG_ARTIFACT_FILENAME",
+    "serialize_kg_jsonl",
+    "parse_kg_jsonl",
+    "SummaryModality",
+    "SummaryBackend",
+    "InMemorySummaryBackend",
+    "VisionModality",
+    "VisionBackend",
+    "InMemoryVisionBackend",
+    # Observability (T1.5): emitters, metric-name constants, emit helpers
+    "MetricsEmitter",
+    "NoopMetricsEmitter",
+    "InMemoryMetricsEmitter",
+    "INDEX_LAG_METRIC",
+    "INDEX_FAILURE_METRIC",
+    "INDEX_SUCCESS_METRIC",
+    "QUEUE_DEPTH_METRIC",
+    "WORKER_UTILIZATION_METRIC",
+    "emit_index_lag",
+    "emit_index_failure",
+    "emit_index_success",
+    "emit_queue_depth",
+    "emit_worker_utilization",
+]
@@ -0,0 +1,114 @@
+# Copyright 2025 ApeCloud, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""``ModalityWorker`` abstract base — celery T1.1 Foundation.
+
+Per ``docs/modularization/indexing-redesign-design-pack.md`` §C/§D, every
+modality (``vector`` / ``fulltext`` / ``graph`` / ``summary`` / ``vision``)
+implements two operations:
+
+- ``derive(document_id, parse_version, source_path) -> DeriveResult``
+  Reads the source / parser output, calls any expensive LLM /
+  embedding / vision pipelines, writes the canonical artifact under
+  ``derived/parse_<version>/<modality_file>`` using the
+  ``ObjectStore`` write-then-rename / multipart-then-complete contract
+  (§C.7) so partial writes are never visible.
+
+- ``sync(document_id, parse_version, derived_artifact_path) -> None``
+  Reads the derived artifact and applies the §D.1 two-phase
+  replace-idempotent contract: DELETE all backend entries WHERE
+  (document_id=X, parse_version=Y), then INSERT all entries from the
+  artifact. Cheap to retry; never re-runs ``derive``.
+
+The graph modality is the one exception to the simple
+DELETE-by-(doc, parse_version) shape — see §D.3 lineage model. The
+ABC accepts that variation: ``sync`` is just "make backend
+byte-equivalent to the artifact for this (doc, version) slot",
+however that has to be done.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+from aperag.indexing.models import Modality
+
+
+@dataclass(frozen=True)
+class DeriveResult:
+    """Immutable outcome of ``ModalityWorker.derive``.
+
+    ``derived_artifact_path`` is what ``ModalityWorker.sync`` reads on
+    this and any subsequent retry. It is opaque to callers — only the
+    matching worker's ``sync`` knows how to interpret it.
+    """
+
+    derived_artifact_path: str
+
+
+class ModalityWorker(ABC):
+    """Abstract base for the 5 per-modality workers.
+
+    Implementations live in ``aperag/indexing/{vector,fulltext,graph,summary,
+    vision}.py`` and are instantiated by the orchestrator (T2.1). The ABC
+    enforces the (derive, sync) split so the orchestrator can route retries
+    to ``sync`` only — never re-charging the LLM / embedding cost.
+    """
+
+    #: The modality this worker owns. Subclasses MUST override: this is a
+    #: bare annotation with no default, so nothing here assigns a value.
+    modality: Modality
+
+    @abstractmethod
+    async def derive(
+        self,
+        *,
+        document_id: str,
+        parse_version: str,
+        source_path: str,
+    ) -> DeriveResult:
+        """Produce the canonical derived artifact for this modality.
+
+        Arguments are keyword-only. Implementations must use the
+        ``ObjectStore`` write-then-rename / multipart-then-complete contract
+        (§C.7) so a partial write is never visible. Implementations MUST be
+        idempotent at the artifact level: re-running ``derive`` for the same
+        ``(document_id, parse_version)`` produces a byte-equivalent artifact
+        (modulo non-deterministic LLM noise, which is the whole reason the
+        artifact gets persisted in the first place).
+        """
+
+    @abstractmethod
+    async def sync(
+        self,
+        *,
+        document_id: str,
+        parse_version: str,
+        derived_artifact_path: str,
+    ) -> None:
+        """Apply the §D.1 replace-idempotent contract to the backend.
+
+        DELETE all backend entries WHERE ``(document_id=X, parse_version=Y)``
+        then INSERT the entries encoded in ``derived_artifact_path`` (the
+        path carried by ``DeriveResult``). Re-running this method produces a
+        backend state byte-equivalent to a fresh sync (§D.4).
+
+        Graph modality reinterprets DELETE as the §D.3 lineage-level
+        DELETE+INSERT — ``sync`` keeps the same signature; the §D.3
+        algorithm is internal to the graph implementation.
+        """
+
+
+__all__ = ["ModalityWorker", "DeriveResult"]