fairagro
diff --git a/‎middleware/api/src/middleware/api/business_logic/arc_manager.py‎
Lines changed: 7 additions & 0 deletions b/‎middleware/api/src/middleware/api/business_logic/arc_manager.py‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎middleware/api/src/middleware/api/business_logic/harvest_manager.py‎
Lines changed: 3 additions & 7 deletions b/‎middleware/api/src/middleware/api/business_logic/harvest_manager.py‎
Lines changed: 3 additions & 7 deletions
diff --git a/‎middleware/api/src/middleware/api/document_store/__init__.py‎
Lines changed: 20 additions & 0 deletions b/‎middleware/api/src/middleware/api/document_store/__init__.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎middleware/api/src/middleware/api/document_store/config.py‎
Lines changed: 9 additions & 0 deletions b/‎middleware/api/src/middleware/api/document_store/config.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎middleware/api/src/middleware/api/document_store/couchdb.py‎
Lines changed: 57 additions & 5 deletions b/‎middleware/api/src/middleware/api/document_store/couchdb.py‎
Lines changed: 57 additions & 5 deletions
diff --git a/‎middleware/api/src/middleware/api/document_store/couchdb_client.py‎
Lines changed: 117 additions & 10 deletions b/‎middleware/api/src/middleware/api/document_store/couchdb_client.py‎
Lines changed: 117 additions & 10 deletions
diff --git a/‎middleware/api/tests/system_external/conftest.py‎
Lines changed: 6 additions & 0 deletions b/‎middleware/api/tests/system_external/conftest.py‎
Lines changed: 6 additions & 0 deletions
@@ -101,6 +101,13 @@ async def create_or_update_arc(
                 has_changes = doc_result.has_changes
                 should_trigger_git = is_new or has_changes
 
+                if harvest_id:
+                    await self._doc_store.increment_harvest_statistics(
+                        harvest_id,
+                        is_new=is_new,
+                        has_changes=has_changes,
+                    )
+
                 logger.info(
                     "[%s] Stored ARC %s in CouchDB: is_new=%s, has_changes=%s, trigger_git=%s",
                     client_id,
 
@@ -6,7 +6,7 @@
 from middleware.api.business_logic.config import HarvestConfig
 from middleware.api.business_logic.exceptions import AccessDeniedError, ResourceNotFoundError
 from middleware.api.document_store import DocumentStore
-from middleware.api.document_store.harvest_document import HarvestDocument
+from middleware.api.document_store.harvest_document import HarvestDocument, HarvestStatistics
 from middleware.shared.api_models.common.models import HarvestStatus
 
 logger = logging.getLogger(__name__)
@@ -85,12 +85,8 @@ async def complete_harvest(
             )
             raise AccessDeniedError(f"Harvest {harvest_id} does not belong to client {client_id}")
 
-        # Calculate statistics server-side from stored ARCs
-        statistics = await self._doc_store.get_harvest_statistics(harvest_id)
-
-        # Preserve expected_datasets if already set
-        if harvest.statistics and harvest.statistics.expected_datasets is not None:
-            statistics.expected_datasets = harvest.statistics.expected_datasets
+        # Statistics are maintained incrementally during ARC submission.
+        statistics = harvest.statistics or HarvestStatistics()
 
         updates: dict[str, Any] = {
             "status": HarvestStatus.COMPLETED,
 
@@ -157,6 +157,26 @@ async def update_harvest(self, harvest_id: str, updates: dict[str, Any]) -> Harv
         """
         raise NotImplementedError
 
+    @abstractmethod
+    async def increment_harvest_statistics(
+        self,
+        harvest_id: str,
+        *,
+        is_new: bool,
+        has_changes: bool,
+    ) -> None:
+        """Record the outcome of one ARC submission in a harvest's counters.
+
+        Concrete stores must apply the increment so that concurrent callers
+        do not lose updates, for example by using optimistic concurrency and
+        retrying when a revision conflict is reported.
+
+        Args:
+            harvest_id: Identifier of the harvest whose statistics are updated.
+            is_new: True when the submitted ARC was created by this submission.
+            has_changes: True when a previously stored ARC was modified by
+                this submission.
+        """
+        raise NotImplementedError
+
     @abstractmethod
     async def list_harvests(
         self,
 
@@ -22,6 +22,15 @@ class CouchDBConfig(BaseModel):
             description="Default maximum number of documents returned by a Mango query",
         ),
     ] = 100
+    harvest_stats_max_retries: Annotated[
+        int,
+        Field(
+            default=5,
+            ge=1,
+            le=100,
+            description="Maximum retry attempts for atomic harvest statistics updates on CouchDB revision conflicts",
+        ),
+    ] = 5
 
     @field_validator("url")
     @classmethod
 
@@ -1,5 +1,6 @@
 """CouchDB implementation of DocumentStore."""
 
+import asyncio
 import hashlib
 import json
 import logging
@@ -8,7 +9,7 @@
 from typing import Any
 
 from middleware.api.document_store.config import CouchDBConfig
-from middleware.api.document_store.couchdb_client import CouchDBClient
+from middleware.api.document_store.couchdb_client import CouchDBClient, DocumentConflictError
 from middleware.api.utils import calculate_arc_id, extract_identifier
 from middleware.shared.api_models.common.models import ArcEventType, ArcLifecycleStatus, HarvestStatus
 
@@ -265,6 +266,60 @@ async def update_harvest(self, harvest_id: str, updates: dict[str, Any]) -> Harv
         await self._client.save_document(harvest_id, doc_data)
         return doc
 
+    async def increment_harvest_statistics(
+        self,
+        harvest_id: str,
+        *,
+        is_new: bool,
+        has_changes: bool,
+    ) -> None:
+        """Increment harvest counters using an optimistic-concurrency retry loop.
+
+        Each attempt reads the harvest document, bumps ``arcs_submitted`` plus
+        exactly one of ``arcs_new`` / ``arcs_updated`` / ``arcs_unchanged``, and
+        writes the document back conditioned on the revision that was read.
+        On a revision conflict the whole read-modify-write cycle is repeated,
+        up to ``harvest_stats_max_retries`` attempts.
+
+        If the harvest document does not exist, a warning is logged and the
+        call returns without raising.
+
+        Args:
+            harvest_id: Harvest identifier (also the document ID).
+            is_new: Whether the ARC was newly created; takes precedence over
+                ``has_changes``.
+            has_changes: Whether an existing ARC changed.
+
+        Raises:
+            RuntimeError: If the stored document carries no ``_rev``, or if
+                every attempt ended in a revision conflict.
+        """
+        # Local import: jitter is only needed here and is not security-sensitive.
+        import random
+
+        max_retries = self._config.harvest_stats_max_retries
+        last_conflict: DocumentConflictError | None = None
+
+        for attempt in range(1, max_retries + 1):
+            doc_dict = await self._client.get_document(harvest_id)
+            if not doc_dict:
+                logger.warning("Harvest %s not found while incrementing statistics", harvest_id)
+                return
+
+            harvest_doc = HarvestDocument.model_validate(doc_dict)
+            if not harvest_doc.doc_rev:
+                raise RuntimeError(f"Harvest {harvest_id} has no _rev; cannot apply atomic update")
+
+            stats = harvest_doc.statistics or HarvestStatistics()
+            stats.arcs_submitted += 1
+
+            if is_new:
+                stats.arcs_new += 1
+            elif has_changes:
+                stats.arcs_updated += 1
+            else:
+                stats.arcs_unchanged += 1
+
+            harvest_doc.statistics = stats
+            payload = harvest_doc.model_dump(mode="json", by_alias=True, exclude_none=True)
+
+            try:
+                await self._client.save_document_if_revision_matches(
+                    harvest_id,
+                    payload,
+                    expected_rev=harvest_doc.doc_rev,
+                )
+            except DocumentConflictError as exc:
+                last_conflict = exc
+                logger.debug(
+                    "Conflict incrementing harvest statistics for %s (attempt %d/%d)",
+                    harvest_id,
+                    attempt,
+                    max_retries,
+                )
+            else:
+                return
+
+            # Only wait when another attempt follows; sleeping after the final
+            # conflict would merely delay the error below.
+            if attempt < max_retries:
+                # Exponential backoff with full jitter: writers that collided
+                # on this revision should not all retry at the same instant.
+                backoff_cap = min(0.05 * 2 ** (attempt - 1), 1.0)
+                await asyncio.sleep(random.uniform(0, backoff_cap))
+
+        raise RuntimeError(
+            f"Failed to increment harvest statistics for {harvest_id} after "
+            f"{max_retries} retries due to revision conflicts"
+        ) from last_conflict
+
     async def list_harvests(
         self,
         rdi: str | None = None,
@@ -281,11 +336,8 @@ async def list_harvests(
 
     async def get_harvest_statistics(self, harvest_id: str) -> HarvestStatistics:
         """Calculate and return statistics for a specific harvest run."""
-        # Fetch only the fields we need: event log and document type.
-        # Excluding arc_content avoids loading potentially large RO-Crate JSON.
-        projection_fields = ["_id", "type", "metadata.events", "metadata.last_harvest_id"]
         selector = {"type": "arc", "metadata.last_harvest_id": harvest_id}
-        docs = await self._client.find(selector, fields=projection_fields)
+        docs = await self._client.find_projected(selector, fields=["metadata.events"])
 
         stats = HarvestStatistics()
         stats.arcs_submitted = len(docs)
 
@@ -6,6 +6,7 @@
 import logging
 from http import HTTPStatus
 from typing import Any, Self
+from urllib.parse import quote
 
 import aiohttp
 from aiocouch import CouchDB, Database
@@ -16,6 +17,10 @@
 logger = logging.getLogger(__name__)
 
 
+class DocumentConflictError(RuntimeError):
+    """Signal that CouchDB rejected a document write with a revision conflict (HTTP 409)."""
+
+
 class CouchDBClient:
     """Async CouchDB client wrapper."""
 
@@ -222,7 +227,6 @@ async def find(
         selector: dict[str, Any],
         limit: int | None = None,
         skip: int = 0,
-        fields: list[str] | None = None,
     ) -> list[dict[str, Any]]:
         """Find documents using a Mango query selector.
 
@@ -231,9 +235,6 @@ async def find(
             limit: Maximum number of results to return per call.
                    Defaults to the instance's ``default_query_limit``.
             skip: Number of results to skip (for pagination)
-            fields: Optional list of fields to include in results (projection).
-                    When set, ``arc_content`` and other large fields can be
-                    excluded to reduce memory and network usage.
 
         Returns:
             List of matching documents
@@ -242,12 +243,7 @@ async def find(
             raise RuntimeError("Not connected to CouchDB")
 
         effective_limit = limit if limit is not None else self._default_query_limit
-        # aiocouch's find passes extra kwargs through to the _find body.
-        kwargs: dict[str, Any] = {"limit": effective_limit, "skip": skip}
-        if fields is not None:
-            kwargs["fields"] = fields
-
-        result = self._db.find(selector, **kwargs)
+        result = self._db.find(selector, limit=effective_limit, skip=skip)
         docs = [dict(doc) async for doc in result]
 
         if len(docs) == effective_limit:
@@ -260,6 +256,117 @@ async def find(
 
         return docs
 
+    async def find_projected(
+        self,
+        selector: dict[str, Any],
+        fields: list[str],
+        limit: int | None = None,
+        skip: int = 0,
+    ) -> list[dict[str, Any]]:
+        """Run a Mango ``_find`` query that returns only the requested fields.
+
+        The query is posted directly to the database's ``_find`` HTTP endpoint
+        rather than going through aiocouch's ``Database.find``, which yields
+        full ``Document`` objects and so offers no ``fields`` projection.
+
+        Args:
+            selector: Mango query selector.
+            fields: Field names to include in each returned document.
+            limit: Upper bound on the number of documents returned by this
+                call; falls back to the instance's ``default_query_limit``
+                when omitted.
+            skip: Number of leading results to skip (for pagination).
+
+        Returns:
+            The projected documents as plain dictionaries.
+
+        Raises:
+            RuntimeError: If the client is not connected, the database name is
+                unset, or CouchDB answers with a non-200 status.
+        """
+        if not self._db:
+            raise RuntimeError("Not connected to CouchDB")
+        if not self._db_name:
+            raise RuntimeError("Database name is not set")
+
+        max_docs = self._default_query_limit if limit is None else limit
+        query_body: dict[str, Any] = {
+            "selector": selector,
+            "fields": fields,
+            "limit": max_docs,
+            "skip": skip,
+        }
+        endpoint = f"{self._url}/{self._db_name}/_find"
+
+        async with self._get_session().post(endpoint, json=query_body) as resp:
+            if resp.status != HTTPStatus.OK:
+                error_body = await resp.text()
+                logger.error("CouchDB _find with projection failed: %s", error_body)
+                raise RuntimeError(f"CouchDB _find failed with status {resp.status}: {error_body}")
+
+            result = await resp.json()
+
+        projected: list[dict[str, Any]] = [dict(row) for row in result.get("docs", [])]
+
+        # A completely full page may mean more matches exist beyond the limit.
+        if len(projected) == max_docs:
+            logger.warning(
+                "CouchDB find_projected() returned exactly %d documents for selector %s — "
+                "results may be silently truncated. Use skip/limit for pagination.",
+                max_docs,
+                selector,
+            )
+
+        return projected
+
+    async def save_document_if_revision_matches(
+        self,
+        doc_id: str,
+        data: dict[str, Any],
+        *,
+        expected_rev: str,
+    ) -> dict[str, Any]:
+        """Write a document conditioned on the revision the caller last saw.
+
+        Issues a raw ``PUT /{db}/{docid}`` carrying ``expected_rev`` as ``_rev``
+        so that callers can implement optimistic concurrency (retry on 409).
+        The caller's ``data`` mapping is copied, not mutated.
+
+        Args:
+            doc_id: Document ID; percent-encoded before being placed in the URL.
+            data: Complete document payload to save.
+            expected_rev: Revision the caller expects the stored document to have.
+
+        Returns:
+            A copy of the payload with ``_id`` set and ``_rev`` replaced by the
+            revision reported in the response, when the response provides one.
+
+        Raises:
+            DocumentConflictError: If CouchDB answers 409 Conflict.
+            RuntimeError: If the client is not connected, the database name is
+                unset, or CouchDB answers with any other non-success status.
+        """
+        if not self._db:
+            raise RuntimeError("Not connected to CouchDB")
+        if not self._db_name:
+            raise RuntimeError("Database name is not set")
+
+        document = {**data, "_id": doc_id, "_rev": expected_rev}
+        target = f"{self._url}/{self._db_name}/{quote(doc_id, safe='')}"
+        success_statuses = (HTTPStatus.CREATED, HTTPStatus.ACCEPTED, HTTPStatus.OK)
+
+        async with self._get_session().put(target, json=document) as resp:
+            status = resp.status
+
+            if status == HTTPStatus.CONFLICT:
+                raise DocumentConflictError(f"Conflict updating document {doc_id}")
+
+            if status not in success_statuses:
+                text = await resp.text()
+                raise RuntimeError(f"Failed to update document {doc_id}: {status} {text}")
+
+            result = await resp.json()
+            fresh_rev = result.get("rev")
+            if isinstance(fresh_rev, str):
+                document["_rev"] = fresh_rev
+            return document
+
     def _get_session(self) -> aiohttp.ClientSession:
         """Return the shared aiohttp session, creating it on first call."""
         if self._session is None:
 
@@ -92,6 +92,9 @@ def config(
         "log_level": "DEBUG",
         "known_rdis": list(known_rdis),
         "client_auth_oid": oid.dotted_string,
+        "otel": {
+            "endpoint": None,
+        },
         "require_client_cert": False,
         "gitlab_api": {
             "url": "https://datahub-dev.ipk-gatersleben.de",
@@ -186,6 +189,9 @@ def worker_process(
     """
     gitlab_token = os.getenv("GITLAB_API_TOKEN", "")
     worker_cfg: dict[str, Any] = {
+        "otel": {
+            "endpoint": None,
+        },
         "couchdb": {
             "url": external_services["couchdb_url"],
             "user": external_services["couchdb_user"],