From 2b75ca8de0ffb8e0ee80f2b5f540fef9c585ea2d Mon Sep 17 00:00:00 2001 From: "rudimar.ronsoni" Date: Tue, 19 May 2026 18:14:12 +0200 Subject: [PATCH 01/15] Add local daemon Git layered indexing --- AGENTS.md | 48 +++ README.md | 171 +++++++- docker/Dockerfile | 9 +- docker/docker-compose.yml | 28 +- docker/entrypoint.sh | 15 +- docs/README.md | 4 + docs/docker-layered-indexing.md | 143 +++++++ docs/layered-indexing.md | 197 +++++++++ pyproject.toml | 1 + src/cocoindex_code/_daemon_paths.py | 17 + src/cocoindex_code/cli.py | 149 ++++++- src/cocoindex_code/client.py | 33 +- src/cocoindex_code/daemon.py | 137 +++++- src/cocoindex_code/git_context.py | 13 + src/cocoindex_code/indexer.py | 8 +- src/cocoindex_code/layer_store.py | 8 + src/cocoindex_code/layered_project.py | 211 ++++++++++ src/cocoindex_code/layers/__init__.py | 18 + src/cocoindex_code/layers/layer.py | 47 +++ src/cocoindex_code/layers/layer_kind.py | 9 + src/cocoindex_code/layers/layer_manifest.py | 11 + src/cocoindex_code/layers/layer_paths.py | 27 ++ src/cocoindex_code/layers/layer_runtime.py | 57 +++ src/cocoindex_code/layers/layer_stack.py | 359 ++++++++++++++++ src/cocoindex_code/layers/layer_store.py | 398 ++++++++++++++++++ src/cocoindex_code/project.py | 58 ++- src/cocoindex_code/protocol.py | 43 ++ src/cocoindex_code/query.py | 29 +- src/cocoindex_code/server.py | 10 +- src/cocoindex_code/shared.py | 1 + .../version_control/__init__.py | 16 + src/cocoindex_code/version_control/branch.py | 12 + .../version_control/change_set.py | 22 + src/cocoindex_code/version_control/git.py | 303 +++++++++++++ .../version_control/repository.py | 16 + .../version_control/worktree.py | 67 +++ tests/e2e_docker/test_docker_workspace.py | 58 +++ tests/test_docker_setup.py | 94 +++++ tests/test_git_layers.py | 172 ++++++++ tests/test_protocol.py | 16 +- uv.lock | 68 +++ 41 files changed, 3040 insertions(+), 63 deletions(-) create mode 100644 AGENTS.md create mode 100644 docs/README.md create mode 100644 
docs/docker-layered-indexing.md create mode 100644 docs/layered-indexing.md create mode 100644 src/cocoindex_code/git_context.py create mode 100644 src/cocoindex_code/layer_store.py create mode 100644 src/cocoindex_code/layered_project.py create mode 100644 src/cocoindex_code/layers/__init__.py create mode 100644 src/cocoindex_code/layers/layer.py create mode 100644 src/cocoindex_code/layers/layer_kind.py create mode 100644 src/cocoindex_code/layers/layer_manifest.py create mode 100644 src/cocoindex_code/layers/layer_paths.py create mode 100644 src/cocoindex_code/layers/layer_runtime.py create mode 100644 src/cocoindex_code/layers/layer_stack.py create mode 100644 src/cocoindex_code/layers/layer_store.py create mode 100644 src/cocoindex_code/version_control/__init__.py create mode 100644 src/cocoindex_code/version_control/branch.py create mode 100644 src/cocoindex_code/version_control/change_set.py create mode 100644 src/cocoindex_code/version_control/git.py create mode 100644 src/cocoindex_code/version_control/repository.py create mode 100644 src/cocoindex_code/version_control/worktree.py create mode 100644 tests/test_docker_setup.py create mode 100644 tests/test_git_layers.py diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..8b74612 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,48 @@ +This is built on top of [CocoIndex v1](https://cocoindex.io/docs-v1/llms.txt). + + +## Build and Test Commands + +This project uses [uv](https://docs.astral.sh/uv/) for project management. + +```bash +uv run mypy . # Type check Python code +uv run pytest tests/ # Run Python tests +``` + +## Code Conventions + +### Internal vs External Modules + +We distinguish between **internal modules** (under packages with `_` prefix, e.g. `_internal.*` or `connectors.*._source`) and **external modules** (which users can directly import). + +**External modules** (user-facing, e.g. 
`cocoindex/ops/sentence_transformers.py`): + +* Be strict about not leaking implementation details +* Use `__all__` to explicitly list public exports +* Prefix ALL non-public symbols with `_`, including: + * Standard library imports: `import threading as _threading`, `import typing as _typing` + * Third-party imports: `import numpy as _np`, `from numpy.typing import NDArray as _NDArray` + * Internal package imports: `from cocoindex.resources import schema as _schema` +* Exception: `TYPE_CHECKING` imports for type hints don't need prefixing + +**Internal modules** (e.g. `cocoindex/_internal/component_ctx.py`): + +* Less strict since users shouldn't import these directly +* Standard library and internal imports don't need underscore prefix +* Only prefix symbols that are truly private to the module itself (e.g. `_context_var` for a module-private ContextVar) + +### General principles (also covered by `/review-changes`) + +- **Top-level imports.** Defer to in-function only for a real circular dependency or a heavy import that isn't always needed. +- **Specific types over `Any`.** When a value enters as a weaker form (`str`, `Any`), convert to the strong type at the earliest point. Don't propagate the weak form. +- **`NamedTuple`/small dataclass for multi-value returns.** Access fields by name at call sites. +- **Single source of truth.** When the same value or logic appears in multiple places, consolidate it. +- **Delete dead code and dead config.** When a change makes something unreachable, the code, the tests, and the knobs all go. +- **Honest names.** The name describes what the code does today. + +### Testing Guidelines + +We prefer end-to-end tests on user-facing APIs, over unit tests on smaller internal functions. That said, there are cases where unit tests are necessary, e.g. for internal logic with various situations and edge cases, in which case it's usually easier to cover various scenarios with unit tests. + +When tests fail, fix the underlying issue. 
Don't skip, ignore, or exclude to get a green result. diff --git a/README.md b/README.md index c488421..590caf1 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,10 @@ Two install styles — they mirror the Docker image variants of the same names: Next, set up your [coding agent integration](#coding-agent-integration) — or jump to [Manual CLI Usage](#manual-cli-usage) if you prefer direct control. +Docs: +- [Git Layered Indexing](./docs/layered-indexing.md): configure reusable `base > branch > dirty` Git layers for root clones and linked worktrees. +- [Docker Layered Indexing](./docs/docker-layered-indexing.md): run the layered daemon in Docker with persistent native state. + ## Coding Agent Integration ### Skill (Recommended) @@ -162,6 +166,16 @@ The background daemon starts automatically on first use. > **Tip:** `ccc index` auto-initializes if you haven't run `ccc init` yet, so you can skip straight to indexing. +For Git repositories, you can configure layered indexing once from the root clone: + +```bash +ccc init --base main # share a base layer across linked worktrees +ccc index # builds base + branch + dirty layers as needed +ccc overlay status # inspect the current layer stack +``` + +Linked worktrees reuse the same daemon-owned base layer and only index branch and dirty deltas. See [Git Layered Indexing](./docs/layered-indexing.md) for the full configuration model. + ### CLI Reference | Command | Description | @@ -170,6 +184,8 @@ The background daemon starts automatically on first use. | `ccc index` | Build or update the index (auto-inits if needed). Shows streaming progress. 
| | `ccc search ` | Semantic search across the codebase | | `ccc status` | Show index stats (chunk count, file count, language breakdown) | +| `ccc overlay status` | Inspect Git layered indexing state for the current worktree | +| `ccc overlay prune` | Prune expired branch and dirty layers | | `ccc mcp` | Run as MCP server in stdio mode | | `ccc doctor` | Run diagnostics — checks settings, daemon, model, file matching, and index health | | `ccc reset` | Delete index databases. `--all` also removes settings. `-f` skips confirmation. | @@ -185,6 +201,7 @@ ccc search --lang python --lang markdown schema # filter by language ccc search --path 'src/utils/*' query handler # filter by path ccc search --offset 10 --limit 5 database schema # pagination ccc search --refresh database schema # update index first, then search +ccc index --base release/1.2 # override Git overlay base ref once ``` By default, `ccc search` scopes results to your current working directory (relative to the project root). Use `--path` to override. @@ -231,11 +248,12 @@ PUID=$(id -u) PGID=$(id -g) docker compose -f <(curl -L https://raw.githubuserco Or grab [`docker/docker-compose.yml`](./docker/docker-compose.yml) and run `docker compose up -d` next to it (works on any shell, including Windows cmd / PowerShell). -By default your home directory is mounted into the container (set -`COCOINDEX_HOST_WORKSPACE` to narrow this to a specific code folder). Index -data and the embedding model cache persist in a Docker volume across -restarts. Your global settings file at `$HOME/.cocoindex_code/global_settings.yml` -is visible and editable on the host; edits take effect on your next `ccc` command. +By default your home directory is mounted into the container. For team setups, +prefer a narrower mount such as `COCOINDEX_HOST_WORKSPACE=$HOME/src` or one +repo path. Index data, daemon Git-layer state, and the embedding model cache +persist in the `cocoindex-data` Docker volume under `/var/cocoindex`. 
Your +global settings file at `$HOME/.cocoindex_code/global_settings.yml` is visible +and editable on the host; edits take effect on your next `ccc` command. > **Pick a different image:** set `COCOINDEX_CODE_IMAGE` to override the > default. For example, the `:full` variant or GHCR: @@ -254,6 +272,9 @@ docker run -d --name cocoindex-code \ --volume "$HOME:/workspace" \ --volume cocoindex-data:/var/cocoindex \ -e COCOINDEX_CODE_HOST_PATH_MAPPING="/workspace=$HOME" \ + -e COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state \ + -e COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code \ + -e COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db \ cocoindex/cocoindex-code:latest ``` @@ -267,18 +288,35 @@ docker run -d --name cocoindex-code \ --volume "$HOME:/workspace" \ --volume cocoindex-data:/var/cocoindex \ -e COCOINDEX_CODE_HOST_PATH_MAPPING="/workspace=$HOME" \ + -e COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state \ + -e COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code \ + -e COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db \ cocoindex/cocoindex-code:latest ``` ### Shell wrapper for `ccc` commands -Paste this into `~/.bashrc` / `~/.zshrc` so `ccc` feels native on the host -and picks up the right project based on your current directory: +Paste this into `~/.bashrc` / `~/.zshrc` so `ccc` feels native on the host, +picks up the right project based on your current directory, and uses the right +TTY mode for interactive commands vs. MCP or piped stdin: ```bash ccc() { - docker exec -it -e COCOINDEX_CODE_HOST_CWD="$PWD" cocoindex-code ccc "$@" + local container="${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code}" + if [ "$(docker inspect -f '{{.State.Running}}' "$container" 2>/dev/null)" != "true" ]; then + echo "cocoindex-code container is not running. 
Start it with: docker compose -f docker/docker-compose.yml up -d" >&2 + return 1 + fi + + local flags=(-i) + if [ "${1:-}" != "mcp" ] && [ -t 0 ] && [ -t 1 ]; then + flags=(-it) + fi + + docker exec "${flags[@]}" \ + -e COCOINDEX_CODE_HOST_CWD="$PWD" \ + "$container" ccc "$@" } ``` @@ -346,6 +384,86 @@ docker rm -f cocoindex-code docker volume rm cocoindex-db cocoindex-model-cache ``` +For regular upgrades, keep the volume and recreate the container: + +```bash +docker compose -f docker/docker-compose.yml pull +docker compose -f docker/docker-compose.yml up -d +``` + +Switch between the slim and full images by changing `COCOINDEX_CODE_IMAGE`: + +```bash +COCOINDEX_CODE_IMAGE=cocoindex/cocoindex-code:latest docker compose -f docker/docker-compose.yml up -d +COCOINDEX_CODE_IMAGE=cocoindex/cocoindex-code:full docker compose -f docker/docker-compose.yml up -d +``` + +### Docker debugging + +Useful commands: + +```bash +# Logs from the daemon supervisor and daemon process +docker logs -f cocoindex-code + +# Shell inside the container +docker exec -it cocoindex-code sh + +# Daemon readiness/status +docker exec cocoindex-code ccc daemon status +docker exec cocoindex-code test -S /var/run/cocoindex_code/daemon.sock + +# Restart the container +docker restart cocoindex-code + +# Stop and remove the container, preserving index/state/cache volume +docker rm -f cocoindex-code + +# Reset all Docker-managed index/state/cache data +docker compose -f docker/docker-compose.yml down -v +``` + +Docker paths: + +| Data | Default path | +|---|---| +| Host workspace mount | `/workspace` | +| Settings on the mounted workspace | `/workspace/.cocoindex_code/global_settings.yml` | +| DB/index files | `/var/cocoindex/db` via `COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db` | +| Durable daemon Git-layer state | `/var/cocoindex/state` via `COCOINDEX_CODE_STATE_DIR` | +| Runtime socket, PID, and daemon log | `/var/run/cocoindex_code` via `COCOINDEX_CODE_RUNTIME_DIR` | +| 
Embedding/cache data | `/var/cocoindex/cache` | + +Local Git worktrees that use the same Docker container should share the Docker +daemon state in `/var/cocoindex/state`. That lets the local daemon Git-layer +feature reuse daemon-owned layer metadata and materialized layer sources across +projects while keeping transient sockets under `/var/run/cocoindex_code`. + +For layered indexing in Docker, initialize the base ref from the root clone and +then use linked worktrees through the same wrapper/container: + +```bash +cd $HOME/src/github/cocoindex-io/cocoindex-code +ccc init --base main +ccc index + +git worktree add ../cocoindex-code.worktrees/feature-1 -b feature-1 main +cd ../cocoindex-code.worktrees/feature-1 +ccc index +ccc overlay status +``` + +Mount a workspace parent that contains both the root clone and linked +worktrees. For example: + +```bash +COCOINDEX_HOST_WORKSPACE=$HOME/src/github/cocoindex-io \ + docker compose -f docker/docker-compose.yml up -d +``` + +See [Docker Layered Indexing](./docs/docker-layered-indexing.md) for the full +Docker setup and troubleshooting guide. + ### Configuration via environment variables Pass configuration to `docker run` / compose with `-e`: @@ -365,6 +483,25 @@ Pass configuration to `docker run` / compose with `-e`: > to everything under it. If that's too broad, bind-mount a narrower > directory instead (`COCOINDEX_HOST_WORKSPACE=/path/to/code`). +Supported Docker environment variables: + +| Variable | Purpose | +|---|---| +| `COCOINDEX_CODE_IMAGE` | Compose image, e.g. `cocoindex/cocoindex-code:full`. | +| `COCOINDEX_CODE_CONTAINER_NAME` | Compose container name, default `cocoindex-code`. | +| `COCOINDEX_HOST_WORKSPACE` | Host directory mounted at `/workspace`, default `${HOME}`. | +| `COCOINDEX_CODE_HOST_PATH_MAPPING` | Container-to-host path mapping for displayed paths. | +| `COCOINDEX_CODE_HOST_CWD` | Host current directory forwarded by `docker exec` wrappers. 
| +| `COCOINDEX_CODE_STATE_DIR` | Durable daemon state directory, default `/var/cocoindex/state`. | +| `COCOINDEX_CODE_RUNTIME_DIR` | Runtime socket/PID/log directory, default `/var/run/cocoindex_code`. | +| `COCOINDEX_CODE_DB_PATH_MAPPING` | DB/index storage remapping, default `/workspace=/var/cocoindex/db`. | +| `PUID`, `PGID` | Linux UID/GID used to chown Docker-managed paths and write host-owned workspace files. | + +`COCOINDEX_CODE_STATE_DIR` is where repository/worktree metadata, overlay +policy, layer manifests, and materialized layer sources are stored. Keep it on +the persistent Docker volume if you want base layers to survive container +recreation. + ### Build the image locally ```bash @@ -373,6 +510,7 @@ docker build -t cocoindex-code:local -f docker/Dockerfile . ## Features - **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately. +- **Git Layered Indexing**: Reuse a shared base index across root clones and linked worktrees, then index only branch and dirty deltas. Configure it with `ccc init --base main`; see [Git Layered Indexing](./docs/layered-indexing.md). - **Ultra Performant**: ⚡ Built on top of ultra performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex). Only re-indexes changed files for fast updates. - **Multi-Language Support**: Python, JavaScript/TypeScript, Rust, Go, Java, C/C++, C#, SQL, Shell, and more. - **Embedded**: Portable and just works, no database setup required! @@ -493,6 +631,23 @@ def my_chunker(path: Path, content: str) -> tuple[str | None, list[Chunk]]: See [`src/cocoindex_code/chunking.py`](./src/cocoindex_code/chunking.py) for the public types and [`tests/example_toml_chunker.py`](./tests/example_toml_chunker.py) for a complete example. +### Git Layered Indexing Configuration + +For Git repositories, `ccc init --base <ref>` stores a repository-level overlay +policy in daemon state. 
The checkout-local `settings.yml` still controls file +matching and chunking, while daemon state controls the shared base ref used by +root clones and linked worktrees. + +```bash +ccc init --base main +ccc index +ccc overlay status +``` + +The daemon stores durable layer metadata under `COCOINDEX_CODE_STATE_DIR` and +uses stable hash IDs, so moving a repository or linked worktree does not +invalidate reusable base and branch layers. See [Git Layered Indexing](./docs/layered-indexing.md) for details. + ## Embedding Models With the `[full]` extra installed, `ccc init` defaults to a local SentenceTransformers model ([Snowflake/snowflake-arctic-embed-xs](https://huggingface.co/Snowflake/snowflake-arctic-embed-xs)) — no API key required. To use a different model, edit `~/.cocoindex_code/global_settings.yml`. diff --git a/docker/Dockerfile b/docker/Dockerfile index efadb41..a5596c2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -3,7 +3,7 @@ # # Stable layers (reuse across releases — digest reproducible from the RUN # command string + base image, so users keep them in local cache): -# 1. apt install gosu + create coco user +# 1. apt install gosu + git + create coco user # 2. install uv # 3. (full only) `uv pip install sentence-transformers` — ~1 GB of torch + # transformers. This is the heavy, slow-changing layer we're optimizing @@ -11,7 +11,7 @@ # 4. (full only) pre-bake the default embedding model under # /var/cocoindex/cache/... so the named volume's copy-up populates it # on first start without a network fetch. -# 5. writable-path setup (mkdir /var/cocoindex/db + /var/run/cocoindex_code, +# 5. writable-path setup (mkdir /var/cocoindex/state + /var/cocoindex/db + /var/run/cocoindex_code, # chown to coco) + env vars + entrypoint copy. 
# # Per-release layers (invalidate when the source tree changes): @@ -36,7 +36,7 @@ FROM python:3.12-slim RUN apt-get update \ - && apt-get install -y --no-install-recommends gosu \ + && apt-get install -y --no-install-recommends git gosu \ && rm -rf /var/lib/apt/lists/* \ && groupadd -g 1000 coco \ && useradd -u 1000 -g 1000 -m coco @@ -65,7 +65,7 @@ RUN mkdir -p /var/cocoindex/cache/huggingface /var/cocoindex/cache/sentence-tran # entrypoint re-chowns to the host user; under root (Docker Desktop # default) coco-ownership is harmless since processes run as root and can # write anywhere. -RUN mkdir -p /var/cocoindex/db /var/run/cocoindex_code \ +RUN mkdir -p /var/cocoindex/state /var/cocoindex/db /var/cocoindex/cache/huggingface /var/cocoindex/cache/sentence-transformers /var/run/cocoindex_code \ && chown -R coco:coco /var/cocoindex /var/run/cocoindex_code WORKDIR /workspace @@ -73,6 +73,7 @@ WORKDIR /workspace # Runtime defaults — see the spec for what each does. All overridable at # `docker run -e ...` time. 
ENV COCOINDEX_CODE_DIR=/workspace/.cocoindex_code \ + COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state \ COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code \ COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db \ COCOINDEX_CODE_DAEMON_SUPERVISED=1 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 6d0804f..5788984 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -13,23 +13,47 @@ # Override the image via COCOINDEX_CODE_IMAGE — for example: # COCOINDEX_CODE_IMAGE=cocoindex/cocoindex-code:full docker compose up -d # COCOINDEX_CODE_IMAGE=ghcr.io/cocoindex-io/cocoindex-code:latest docker compose up -d +# +# Optional knobs: +# COCOINDEX_CODE_CONTAINER_NAME=my-ccc +# COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state +# COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code +# COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db +# +# For Git layered indexing, mount a workspace parent that contains both the +# root clone and linked worktrees. Keep COCOINDEX_CODE_STATE_DIR on the +# persistent cocoindex-data volume so base/branch layers survive container +# recreation. See docs/docker-layered-indexing.md. services: cocoindex-code: image: ${COCOINDEX_CODE_IMAGE:-cocoindex/cocoindex-code:latest} - container_name: cocoindex-code + container_name: ${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code} volumes: - ${COCOINDEX_HOST_WORKSPACE:-${HOME}}:/workspace - cocoindex-data:/var/cocoindex environment: + COCOINDEX_CODE_STATE_DIR: ${COCOINDEX_CODE_STATE_DIR:-/var/cocoindex/state} + COCOINDEX_CODE_RUNTIME_DIR: ${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code} + COCOINDEX_CODE_DB_PATH_MAPPING: ${COCOINDEX_CODE_DB_PATH_MAPPING:-/workspace=/var/cocoindex/db} # Makes CLI and MCP output show your real paths # (e.g. `/Users/you/myproject/...`) instead of container paths # (e.g. `/workspace/myproject/...`). 
- COCOINDEX_CODE_HOST_PATH_MAPPING: /workspace=${COCOINDEX_HOST_WORKSPACE:-${HOME}} + COCOINDEX_CODE_HOST_PATH_MAPPING: ${COCOINDEX_CODE_HOST_PATH_MAPPING:-/workspace=${COCOINDEX_HOST_WORKSPACE:-${HOME}}} # Linux only: set these so files written to your workspace are owned by # you rather than root. Not needed on macOS / Windows — leave empty. PUID: ${PUID:-} PGID: ${PGID:-} + healthcheck: + test: + [ + "CMD-SHELL", + "ccc daemon status >/dev/null 2>&1 || test -S /var/run/cocoindex_code/daemon.sock", + ] + interval: 10s + timeout: 5s + retries: 12 + start_period: 10s volumes: cocoindex-data: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 9a87e22..d95e8a9 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -14,10 +14,23 @@ # graceful shutdown still flows through the normal cleanup path. set -e +COCOINDEX_CODE_STATE_DIR=${COCOINDEX_CODE_STATE_DIR:-/var/cocoindex/state} +COCOINDEX_CODE_RUNTIME_DIR=${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code} +HF_HOME=${HF_HOME:-/var/cocoindex/cache/huggingface} +SENTENCE_TRANSFORMERS_HOME=${SENTENCE_TRANSFORMERS_HOME:-/var/cocoindex/cache/sentence-transformers} +export COCOINDEX_CODE_STATE_DIR COCOINDEX_CODE_RUNTIME_DIR HF_HOME SENTENCE_TRANSFORMERS_HOME + +mkdir -p \ + "$COCOINDEX_CODE_STATE_DIR" \ + /var/cocoindex/db \ + "$HF_HOME" \ + "$SENTENCE_TRANSFORMERS_HOME" \ + "$COCOINDEX_CODE_RUNTIME_DIR" + if [ -n "$PUID" ] && [ -n "$PGID" ]; then groupmod -o -g "$PGID" coco usermod -o -u "$PUID" coco - chown -R coco:coco /var/cocoindex /var/run/cocoindex_code + chown -R coco:coco /var/cocoindex "$COCOINDEX_CODE_RUNTIME_DIR" if [ -d /workspace/.cocoindex_code ]; then chown coco:coco /workspace/.cocoindex_code 2>/dev/null || true fi diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..18a81de --- /dev/null +++ b/docs/README.md @@ -0,0 +1,4 @@ +# CocoIndex Code Docs + +- [Git Layered Indexing](./layered-indexing.md): configuration model, stable IDs, layer stack behavior, and 
commands. +- [Docker Layered Indexing](./docker-layered-indexing.md): Docker-specific state layout, wrapper, and linked worktree setup. diff --git a/docs/docker-layered-indexing.md b/docs/docker-layered-indexing.md new file mode 100644 index 0000000..6af166e --- /dev/null +++ b/docs/docker-layered-indexing.md @@ -0,0 +1,143 @@ +# Docker Layered Indexing + +This guide covers the Docker-specific configuration for Git layered indexing. For the core model, see [Git Layered Indexing](./layered-indexing.md). + +## Recommended Compose Setup + +Use the repository compose file: + +```bash +docker compose -f docker/docker-compose.yml up -d +``` + +The compose defaults are designed for layered indexing: + +```yaml +COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state +COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code +COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db +COCOINDEX_CODE_HOST_PATH_MAPPING: /workspace=$HOME +``` + +The important split is: + +- source code and settings live on the bind mount under `/workspace` +- durable daemon layer metadata lives under `/var/cocoindex/state` +- per-project non-layer DB paths are remapped to `/var/cocoindex/db` +- sockets, PID files, and logs stay under `/var/run/cocoindex_code` + +## Mount the Right Workspace + +The default compose file mounts your home directory: + +```bash +COCOINDEX_HOST_WORKSPACE=$HOME docker compose -f docker/docker-compose.yml up -d +``` + +For a narrower mount, point it at the parent containing both the root clone and linked worktrees: + +```bash +COCOINDEX_HOST_WORKSPACE=$HOME/src/github/cocoindex-io \ + docker compose -f docker/docker-compose.yml up -d +``` + +Example host layout: + +```text +$HOME/src/github/cocoindex-io/ + cocoindex-code/ + cocoindex-code.worktrees/ + feature-1/ +``` + +Both paths must be visible inside the same container mount for the daemon to reuse repository and layer state across them. 
+ +## Host Wrapper + +Use this wrapper so Docker commands resolve the host current directory correctly: + +```bash +ccc() { + local container="${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code}" + if [ "$(docker inspect -f '{{.State.Running}}' "$container" 2>/dev/null)" != "true" ]; then + echo "cocoindex-code container is not running. Start it with: docker compose -f docker/docker-compose.yml up -d" >&2 + return 1 + fi + + local flags=(-i) + if [ "${1:-}" != "mcp" ] && [ -t 0 ] && [ -t 1 ]; then + flags=(-it) + fi + + docker exec "${flags[@]}" \ + -e COCOINDEX_CODE_HOST_CWD="$PWD" \ + "$container" ccc "$@" +} +``` + +`COCOINDEX_CODE_HOST_CWD` is required for linked worktrees. It tells the container-side CLI which host directory you are actually in, then the path mapping translates it to `/workspace/...`. + +## Layered Workflow in Docker + +Root clone: + +```bash +cd $HOME/src/github/cocoindex-io/cocoindex-code +ccc init --base main +ccc index +``` + +Linked worktree: + +```bash +git worktree add ../cocoindex-code.worktrees/feature-1 -b feature-1 main +cd ../cocoindex-code.worktrees/feature-1 +ccc index +ccc search "query planner" +ccc overlay status +``` + +The base layer is stored once under `/var/cocoindex/state` and reused by the linked worktree. + +## Environment Variables + +| Variable | Purpose | +|---|---| +| `COCOINDEX_CODE_IMAGE` | Image used by compose, e.g. `cocoindex/cocoindex-code:full`. | +| `COCOINDEX_CODE_CONTAINER_NAME` | Container name used by compose and the wrapper. | +| `COCOINDEX_HOST_WORKSPACE` | Host directory mounted at `/workspace`. Mount a parent that contains all worktrees you want to share. | +| `COCOINDEX_CODE_HOST_PATH_MAPPING` | Container-to-host path mapping for display and host CWD translation. | +| `COCOINDEX_CODE_HOST_CWD` | Host current directory passed per `docker exec` invocation. | +| `COCOINDEX_CODE_STATE_DIR` | Durable daemon layer state. Default: `/var/cocoindex/state`. 
| +| `COCOINDEX_CODE_RUNTIME_DIR` | Runtime socket/PID/log directory. Default: `/var/run/cocoindex_code`. | +| `COCOINDEX_CODE_DB_PATH_MAPPING` | Non-layer project DB remapping. Default: `/workspace=/var/cocoindex/db`. | +| `PUID`, `PGID` | Linux-only ownership mapping for bind-mounted files and Docker-managed state. | + +## Debugging + +Check daemon status: + +```bash +docker exec cocoindex-code ccc daemon status +``` + +Inspect overlay status from the current host directory: + +```bash +ccc overlay status +``` + +Inspect state in the container: + +```bash +docker exec -it cocoindex-code sh +ls -R /var/cocoindex/state +``` + +Reset all Docker-managed index, layer, and cache state: + +```bash +docker compose -f docker/docker-compose.yml down -v +``` + +This preserves your source tree because it is bind-mounted from the host. diff --git a/docs/layered-indexing.md b/docs/layered-indexing.md new file mode 100644 index 0000000..f85613b --- /dev/null +++ b/docs/layered-indexing.md @@ -0,0 +1,197 @@ +# Git Layered Indexing + +Git layered indexing lets one local daemon reuse work across a root clone and its linked worktrees. Instead of rebuilding a full index for every branch, `ccc` composes three layers: + +```text +dirty > branch > base +``` + +- `base`: an immutable snapshot of the configured base ref, usually `main` or `master`. +- `branch`: files changed between the branch merge base and branch `HEAD`. +- `dirty`: uncommitted working tree changes. + +Search results are merged from highest to lowest layer. A file in a higher layer shadows the same file in lower layers, and deleted files are tombstoned so stale base results do not appear. 
+ +## Quick Start + +Initialize once from the root clone and choose the shared base ref: + +```bash +cd ~/src/github/cocoindex-io/cocoindex-code +ccc init --base main +ccc index +``` + +Then use linked worktrees normally: + +```bash +git worktree add ../cocoindex-code.worktrees/feature-1 -b feature-1 main +cd ../cocoindex-code.worktrees/feature-1 +ccc index +ccc search "query planner" +ccc overlay status +``` + +The linked worktree reuses the base layer and only indexes the branch and dirty deltas. + +## Configuration Model + +Layered indexing has two kinds of configuration. + +Project settings stay checkout-local: + +```text +<project-root>/.cocoindex_code/settings.yml +``` + +They control include/exclude patterns, language overrides, and chunkers. These settings are part of the index configuration hash, so changing them creates new layer IDs and causes affected layers to rebuild. + +Repository overlay policy is stored in daemon state: + +```text +$COCOINDEX_CODE_STATE_DIR/daemon.db +``` + +`ccc init --base <ref>` registers the repository policy. Linked worktrees use the same policy automatically when they resolve to the same logical repository. + +Current policy fields are: + +```yaml +layers: + enabled: true + base_ref: main + dirty: true + environment_strategy: per-layer + branch_ttl: 14d + dirty_ttl: 24h +``` + +The current implementation persists the base ref and uses the conservative `per-layer` CocoIndex environment strategy. TTLs are applied to branch and dirty layer manifests so stale layers can be pruned. + +## Stable IDs + +Layered indexing uses names for display and hashes for storage. Physical paths are mutable metadata only. 
+ +Repository ID: + +```text +hash(normalized_remote_url, repo_relative_root, index_config_hash) +``` + +Base layer ID: + +```text +hash(repo_id, base_ref_name, base_commit_hash, index_config_hash) +``` + +Branch layer ID: + +```text +hash(repo_id, branch_name, head_commit_hash, merge_base_commit_hash, base_layer_id, index_config_hash) +``` + +Worktree ID: + +```text +hash(repo_id, worktree_name, branch_name) +``` + +Dirty layer ID: + +```text +hash(repo_id, worktree_id, branch_name, head_commit_hash, dirty_snapshot_hash, index_config_hash) +``` + +This means: + +- moving a repository does not change its repository ID +- moving a linked worktree does not change its worktree ID if the worktree name and branch stay the same +- advancing `main` or `master` creates a new base layer because the base commit hash changes +- rebasing or merging a feature branch creates a new branch layer because the head or merge-base hash changes +- editing uncommitted files creates a new dirty layer because the dirty snapshot hash changes + +## State Layout + +The default native layout is: + +```text +$COCOINDEX_CODE_STATE_DIR/ + daemon.db + repos/ + <repo_id>/ + layers/ + <layer_id>/ + src/ + db/ + cocoindex.db + target_sqlite.db +``` + +`daemon.db` stores repository metadata, worktree metadata, layer metadata, manifests, and overlay policy. CocoIndex owns the per-layer indexing state under each layer's `db/` directory. 
+ +## Commands + +Initialize or update the repository base policy: + +```bash +ccc init --base main +``` + +Build or refresh the current layer stack: + +```bash +ccc index +``` + +Override the base ref for a specific command: + +```bash +ccc index --base release/1.2 +ccc search --base release/1.2 "migration logic" +ccc overlay status --base release/1.2 +``` + +Inspect layer state: + +```bash +ccc overlay status +``` + +Prune expired branch and dirty layers: + +```bash +ccc overlay prune +``` + +## Linked Worktree Example + +```bash +cd ~/src/github/cocoindex-io/cocoindex-code +ccc init --base main +ccc index + +git worktree add ../cocoindex-code.worktrees/feature-1 -b feature-1 main +cd ../cocoindex-code.worktrees/feature-1 +ccc index +ccc search "daemon socket lifecycle" +``` + +Expected layer stack in the feature worktree: + +```text +dirty: uncommitted changes in feature-1, if any +branch: diff from merge-base(main, feature-1) to feature-1 HEAD +base: shared main layer +``` + +## Docker Notes + +In Docker, keep `COCOINDEX_CODE_STATE_DIR` on the container-native persistent volume: + +```text +/var/cocoindex/state +``` + +Keep source code mounted under `/workspace`, and use `COCOINDEX_CODE_HOST_CWD` in `docker exec` wrappers so the daemon resolves the correct root clone or linked worktree. + +See [Docker Layered Indexing](./docker-layered-indexing.md) for a complete Docker setup. 
diff --git a/pyproject.toml b/pyproject.toml index 0e9b2d5..e429ea2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "typer>=0.9.0", "msgspec>=0.19.0", "pathspec>=0.12.1", + "pygit2>=1.19.0", "pyyaml>=6.0", "questionary>=2.0.0", ] diff --git a/src/cocoindex_code/_daemon_paths.py b/src/cocoindex_code/_daemon_paths.py index 7099083..75d8c79 100644 --- a/src/cocoindex_code/_daemon_paths.py +++ b/src/cocoindex_code/_daemon_paths.py @@ -31,6 +31,23 @@ def daemon_runtime_dir() -> Path: return user_settings_dir() +def daemon_state_dir() -> Path: + """Return the durable daemon-owned state directory. + + This is separate from both project checkout state and daemon runtime files: + it stores shared layer metadata, materialized layer sources, and layer + databases. ``COCOINDEX_CODE_STATE_DIR`` exists mostly for tests and + advanced users; otherwise we follow XDG data-home on Unix-like systems. + """ + override = os.environ.get("COCOINDEX_CODE_STATE_DIR") + if override: + return Path(override) + xdg_data_home = os.environ.get("XDG_DATA_HOME") + if xdg_data_home: + return Path(xdg_data_home) / "cocoindex-code" + return Path.home() / ".local" / "share" / "cocoindex-code" + + def connection_family() -> str: """Return the multiprocessing connection family for this platform.""" return "AF_PIPE" if sys.platform == "win32" else "AF_UNIX" diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 71ebab9..6f326f0 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -44,6 +44,8 @@ daemon_app = _typer.Typer(name="daemon", help="Manage the daemon process.") app.add_typer(daemon_app, name="daemon") +overlay_app = _typer.Typer(name="overlay", help="Inspect and prune Git index overlays.") +app.add_typer(overlay_app, name="overlay") @app.callback() @@ -99,6 +101,28 @@ def require_project_root() -> Path: return root +def require_project_root_from(cwd: Path | None) -> Path: + """Find the initialized project root for *cwd* or 
the process CWD.""" + if cwd is None: + return require_project_root() + gs_path = user_settings_path() + if not gs_path.is_file(): + _typer.echo( + f"Error: Global settings not found: {format_path_for_display(gs_path)}\n" + "Run `ccc init` to create it with default settings.", + err=True, + ) + raise _typer.Exit(code=1) + root = find_project_root(cwd) + if root is None: + _typer.echo( + f"Error: Not in an initialized project directory: {format_path_for_display(cwd)}", + err=True, + ) + raise _typer.Exit(code=1) + return root + + _F = TypeVar("_F", bound=Callable[..., object]) @@ -181,7 +205,12 @@ def print_search_results(response: SearchResponse) -> None: _typer.echo(r.content) -def _run_index_with_progress(project_root: str) -> None: +def _run_index_with_progress( + project_root: str, + *, + cwd: str | None = None, + base_ref: str | None = None, +) -> None: """Run indexing with streaming progress display. Exits on failure.""" from rich.console import Console as _Console from rich.live import Live as _Live @@ -208,7 +237,13 @@ def _on_progress(progress: IndexingProgress) -> None: live.update(_Spinner("dots", last_progress_line)) try: - resp = _client.index(project_root, on_progress=_on_progress, on_waiting=_on_waiting) + resp = _client.index( + project_root, + cwd=cwd, + base_ref=base_ref, + on_progress=_on_progress, + on_waiting=_on_waiting, + ) except RuntimeError as e: live.stop() # Let DaemonStartError propagate to the decorator for consistent handling. 
@@ -229,6 +264,8 @@ def _on_progress(progress: IndexingProgress) -> None: def _search_with_wait_spinner( project_root: str, query: str, + cwd: str | None = None, + base_ref: str | None = None, languages: list[str] | None = None, paths: list[str] | None = None, limit: int = 10, @@ -254,6 +291,8 @@ def _on_waiting() -> None: resp = _client.search( project_root=project_root, query=query, + cwd=cwd, + base_ref=base_ref, languages=languages, paths=paths, limit=limit, @@ -477,6 +516,42 @@ def _setup_user_settings_interactive(litellm_model_flag: str | None) -> None: _typer.echo() +def _register_overlay_policy(project_root: Path, base_ref: str) -> None: + from ._daemon_paths import daemon_state_dir + from .embedder_params import resolve_embedder_params + from .layered_project import build_index_config_hash + from .layers import LayerStore + from .settings import load_user_settings + from .version_control import GitContextError, resolve_worktree + + user_settings = load_user_settings() + params = resolve_embedder_params(user_settings.embedding) + config_hash = build_index_config_hash( + project_root, + indexing_params=params.indexing, + query_params=params.query, + ) + try: + worktree = resolve_worktree( + project_root, base_ref=base_ref, index_config_hash=config_hash + ) + except GitContextError as e: + _typer.echo(f"Warning: could not register Git overlay policy: {e}", err=True) + return + + store = LayerStore(daemon_state_dir() / "daemon.db") + store.upsert_repository( + repo_id=worktree.repository.id, + repo_name=worktree.repository.repo_name, + remote_url=worktree.repository.remote_url, + normalized_remote_url=worktree.repository.normalized_remote_url, + repo_relative_root=worktree.repository.repo_relative_root, + last_seen_root=worktree.repository.last_seen_root, + ) + store.upsert_overlay_policy(repo_id=worktree.repository.id, base_ref=worktree.branch.base_ref) + _typer.echo(f"Registered Git overlay base: {worktree.branch.base_ref}") + + @app.command() def init( 
litellm_model: str | None = _typer.Option( @@ -485,6 +560,7 @@ def init( help="Use the given LiteLLM model and skip provider/model prompts.", ), force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"), + base_ref: str | None = _typer.Option(None, "--base", help="Git base ref for overlays"), ) -> None: """Initialize a project for cocoindex-code.""" cwd = Path.cwd().resolve() @@ -506,6 +582,8 @@ def init( # Check if already initialized if settings_file.is_file(): _typer.echo("Project already initialized.") + if base_ref is not None: + _register_overlay_policy(cwd, base_ref) return # Check parent directories for markers @@ -524,6 +602,9 @@ def init( save_project_settings(cwd, default_project_settings()) _typer.echo(f"Created project settings: {format_path_for_display(settings_file)}") + if base_ref is not None: + _register_overlay_policy(cwd, base_ref) + # Add to .gitignore add_to_gitignore(cwd) @@ -533,13 +614,18 @@ def init( @app.command() @_catch_daemon_start_error -def index() -> None: +def index( + cwd: Path | None = _typer.Option(None, "--cwd", help="Workspace path to index"), + base_ref: str | None = _typer.Option(None, "--base", help="Git base ref"), +) -> None: """Create/update index for the codebase.""" from . 
import client as _client - project_root = str(require_project_root()) + project_root_path = require_project_root_from(cwd.resolve() if cwd is not None else None) + project_root = str(project_root_path) + request_cwd = str(cwd.resolve()) if cwd is not None else None print_project_header(project_root) - _run_index_with_progress(project_root) + _run_index_with_progress(project_root, cwd=request_cwd, base_ref=base_ref) print_index_stats(_client.project_status(project_root)) @@ -552,13 +638,17 @@ def search( offset: int = _typer.Option(0, "--offset", help="Number of results to skip"), limit: int = _typer.Option(10, "--limit", help="Maximum results to return"), refresh: bool = _typer.Option(False, "--refresh", help="Refresh index before searching"), + cwd: Path | None = _typer.Option(None, "--cwd", help="Workspace path to search"), + base_ref: str | None = _typer.Option(None, "--base", help="Git base ref"), ) -> None: """Semantic search across the codebase.""" - project_root = str(require_project_root()) + project_root_path = require_project_root_from(cwd.resolve() if cwd is not None else None) + project_root = str(project_root_path) + request_cwd = str(cwd.resolve()) if cwd is not None else None query_str = " ".join(query) if refresh: - _run_index_with_progress(project_root) + _run_index_with_progress(project_root, cwd=request_cwd, base_ref=base_ref) # Default path filter from CWD paths: list[str] | None = None @@ -572,6 +662,8 @@ def search( resp = _search_with_wait_spinner( project_root=project_root, query=query_str, + cwd=request_cwd, + base_ref=base_ref, languages=lang or None, paths=paths, limit=limit, @@ -825,7 +917,7 @@ def mcp() -> None: async def _run_mcp() -> None: from .server import create_mcp_server - mcp_server = create_mcp_server(project_root) + mcp_server = create_mcp_server(project_root, cwd=str(Path.cwd().resolve())) asyncio.create_task(_bg_index(project_root)) await mcp_server.run_stdio_async() @@ -845,6 +937,47 @@ async def _bg_index(project_root: 
str) -> None: pass +# --- Overlay subcommands --- + + +@overlay_app.command("status") +@_catch_daemon_start_error +def overlay_status( + cwd: Path | None = _typer.Option(None, "--cwd", help="Workspace path to inspect"), + base_ref: str | None = _typer.Option(None, "--base", help="Git base ref"), +) -> None: + """Show daemon layer metadata for the current Git repo.""" + from . import client as _client + + project_root_path = require_project_root_from(cwd.resolve() if cwd is not None else None) + request_cwd = str(cwd.resolve()) if cwd is not None else None + resp = _client.overlay_status(str(project_root_path), cwd=request_cwd, base_ref=base_ref) + if resp.repo_id is not None: + _typer.echo(f"Repo: {resp.repo_id}") + if not resp.layers: + _typer.echo("No layers.") + return + for layer in resp.layers: + _typer.echo( + f"{layer.kind:6} {layer.status:8} {layer.layer_id} " + f"ref={layer.ref_name or '-'} affected={layer.affected_count} " + f"tombstones={layer.tombstoned_count}" + ) + + +@overlay_app.command("prune") +@_catch_daemon_start_error +def overlay_prune() -> None: + """Prune expired dirty and branch overlays.""" + from . 
import client as _client + + resp = _client.overlay_prune() + if not resp.pruned_layer_ids: + _typer.echo("No expired layers pruned.") + return + _typer.echo(f"Pruned {len(resp.pruned_layer_ids)} layer(s).") + + # --- Daemon subcommands --- diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index 262af87..7fbfa70 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -40,6 +40,10 @@ IndexRequest, IndexResponse, IndexWaitingNotice, + OverlayPruneRequest, + OverlayPruneResponse, + OverlayStatusRequest, + OverlayStatusResponse, ProjectStatusRequest, ProjectStatusResponse, RemoveProjectRequest, @@ -242,14 +246,20 @@ def _send(req: Request) -> Response: def index( project_root: str, + cwd: str | None = None, + base_ref: str | None = None, on_progress: Callable[[IndexingProgress], None] | None = None, on_waiting: Callable[[], None] | None = None, ) -> IndexResponse: """Request indexing with streaming progress. Blocks until complete.""" project_root = normalize_input_path(project_root) + if cwd is not None: + cwd = normalize_input_path(cwd) conn = _connect_and_handshake() try: - conn.send_bytes(encode_request(IndexRequest(project_root=project_root))) + conn.send_bytes( + encode_request(IndexRequest(project_root=project_root, cwd=cwd, base_ref=base_ref)) + ) while True: try: data = conn.recv_bytes() @@ -276,6 +286,8 @@ def index( def search( project_root: str, query: str, + cwd: str | None = None, + base_ref: str | None = None, languages: list[str] | None = None, paths: list[str] | None = None, limit: int = 5, @@ -289,6 +301,8 @@ def search( until the final ``SearchResponse``. 
""" project_root = normalize_input_path(project_root) + if cwd is not None: + cwd = normalize_input_path(cwd) conn = _connect_and_handshake() try: conn.send_bytes( @@ -296,6 +310,8 @@ def search( SearchRequest( project_root=project_root, query=query, + cwd=cwd, + base_ref=base_ref, languages=languages, paths=paths, limit=limit, @@ -345,6 +361,21 @@ def daemon_env() -> DaemonEnvResponse: return _send(DaemonEnvRequest()) # type: ignore[return-value] +def overlay_status( + project_root: str, + cwd: str | None = None, + base_ref: str | None = None, +) -> OverlayStatusResponse: + project_root = normalize_input_path(project_root) + if cwd is not None: + cwd = normalize_input_path(cwd) + return _send(OverlayStatusRequest(project_root=project_root, cwd=cwd, base_ref=base_ref)) # type: ignore[return-value] + + +def overlay_prune() -> OverlayPruneResponse: + return _send(OverlayPruneRequest()) # type: ignore[return-value] + + def doctor( project_root: str | None = None, on_result: Callable[[DoctorCheckResult], None] | None = None, diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index 41334bc..9b31d84 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -21,10 +21,14 @@ daemon_pid_path, daemon_runtime_dir, daemon_socket_path, + daemon_state_dir, ) from ._version import __version__ from .chunking import ChunkerFn as _ChunkerFn from .embedder_params import resolve_embedder_params +from .git_context import GitContextError, resolve_worktree_context +from .layer_store import LayerStore +from .layered_project import LayeredProject, build_index_config_hash from .project import Project from .protocol import ( DaemonEnvRequest, @@ -42,6 +46,11 @@ IndexRequest, IndexStreamResponse, IndexWaitingNotice, + OverlayLayerInfo, + OverlayPruneRequest, + OverlayPruneResponse, + OverlayStatusRequest, + OverlayStatusResponse, ProjectStatusRequest, RemoveProjectRequest, RemoveProjectResponse, @@ -126,7 +135,8 @@ class ProjectRegistry: mismatch 
once the file is created and trigger a supervisor respawn. """ - _projects: dict[str, Project] + _projects: dict[str, Project | LayeredProject] + _layer_project_cache: dict[str, Project] _embedder: Embedder | None indexing_params: dict[str, Any] query_params: dict[str, Any] @@ -138,35 +148,74 @@ def __init__( query_params: dict[str, Any] | None = None, ) -> None: self._projects = {} + self._layer_project_cache = {} self._embedder = embedder self.indexing_params = dict(indexing_params) if indexing_params else {} self.query_params = dict(query_params) if query_params else {} + self.state_dir = daemon_state_dir() + self.state_dir.mkdir(parents=True, exist_ok=True) + self.layer_store = LayerStore(self.state_dir / "daemon.db") - async def get_project(self, project_root: str) -> Project: + async def get_project( + self, + project_root: str, + *, + cwd: str | None = None, + base_ref: str | None = None, + ) -> Project | LayeredProject: """Get or create a Project for the given root. Lazy initialization.""" if self._embedder is None: raise RuntimeError( "Daemon has no global settings loaded. Run `ccc init` to set up cocoindex-code." 
) - if project_root not in self._projects: - root = Path(project_root) + root = Path(project_root) + request_cwd = Path(cwd) if cwd is not None else root + cache_key = f"{root.resolve()}\0{request_cwd.resolve()}\0{base_ref or ''}" + if cache_key not in self._projects: project_settings = load_project_settings(root) chunker_registry = _resolve_chunker_registry(project_settings.chunkers) - project = await Project.create( - root, - self._embedder, - indexing_params=self.indexing_params, - query_params=self.query_params, - chunker_registry=chunker_registry, - ) - self._projects[project_root] = project - return self._projects[project_root] + try: + config_hash = build_index_config_hash( + root, + indexing_params=self.indexing_params, + query_params=self.query_params, + ) + resolve_worktree_context( + request_cwd, base_ref=base_ref, index_config_hash=config_hash + ) + except GitContextError: + project: Project | LayeredProject = await Project.create( + root, + self._embedder, + indexing_params=self.indexing_params, + query_params=self.query_params, + chunker_registry=chunker_registry, + ) + else: + project = LayeredProject( + project_root=root, + cwd=request_cwd, + base_ref=base_ref, + state_dir=self.state_dir, + store=self.layer_store, + embedder=self._embedder, + indexing_params=self.indexing_params, + query_params=self.query_params, + chunker_registry=chunker_registry, + project_cache=self._layer_project_cache, + ) + self._projects[cache_key] = project + return self._projects[cache_key] def remove_project(self, project_root: str) -> bool: """Remove a project from the registry. 
Returns True if it was loaded.""" import gc - project = self._projects.pop(project_root, None) + prefix = f"{Path(project_root).resolve()}\0" + keys = [key for key in self._projects if key.startswith(prefix) or key == project_root] + project = None + for key in keys: + project = self._projects.pop(key, None) if project is not None: project.close() del project @@ -180,14 +229,17 @@ def close_all(self) -> None: for project in self._projects.values(): project.close() + for project in self._layer_project_cache.values(): + project.close() self._projects.clear() + self._layer_project_cache.clear() gc.collect() def list_projects(self) -> list[DaemonProjectInfo]: """List all loaded projects with their indexing state.""" return [ DaemonProjectInfo( - project_root=root, + project_root=root.split("\0", 1)[0], indexing=project._index_lock.locked(), ) for root, project in self._projects.items() @@ -265,7 +317,7 @@ async def handle_connection( async def _search_with_wait( - project: Project, req: SearchRequest + project: Any, req: SearchRequest ) -> AsyncIterator[SearchStreamResponse]: """Stream search response, waiting for ongoing indexing first.""" yield IndexWaitingNotice() @@ -474,11 +526,15 @@ async def _dispatch( """ try: if isinstance(req, IndexRequest): - project = await registry.get_project(req.project_root) + project = await registry.get_project( + req.project_root, cwd=req.cwd, base_ref=req.base_ref + ) return project.stream_index() if isinstance(req, SearchRequest): - project = await registry.get_project(req.project_root) + project = await registry.get_project( + req.project_root, cwd=req.cwd, base_ref=req.base_ref + ) await project.ensure_indexing_started() if project.should_wait_for_indexing: @@ -535,6 +591,51 @@ async def _dispatch( ], ) + if isinstance(req, OverlayStatusRequest): + try: + config_hash = build_index_config_hash( + Path(req.project_root), + indexing_params=registry.indexing_params, + query_params=registry.query_params, + ) + ctx = 
resolve_worktree_context( + Path(req.cwd) if req.cwd is not None else Path(req.project_root), + base_ref=req.base_ref, + index_config_hash=config_hash, + ) + layers = registry.layer_store.list_layers(repo_id=ctx.repo_id) + repo_id: str | None = ctx.repo_id + except Exception: + layers = registry.layer_store.list_layers() + repo_id = None + layer_infos: list[OverlayLayerInfo] = [] + for layer in layers: + manifest = registry.layer_store.get_manifest(layer.layer_id) + layer_infos.append( + OverlayLayerInfo( + layer_id=layer.layer_id, + repo_id=layer.repo_id, + kind=layer.kind.value, + ref_name=layer.ref_name, + commit=layer.commit, + status=layer.status, + affected_count=len(manifest.affected_paths) if manifest else 0, + tombstoned_count=len(manifest.tombstoned_paths) if manifest else 0, + ) + ) + return OverlayStatusResponse( + repo_id=repo_id, + layers=layer_infos, + ) + + if isinstance(req, OverlayPruneRequest): + pruned = registry.layer_store.prune_expired() + for layer in pruned: + import shutil + + shutil.rmtree(layer.source_dir.parent, ignore_errors=True) + return OverlayPruneResponse(pruned_layer_ids=[layer.layer_id for layer in pruned]) + if isinstance(req, DoctorRequest): return _handle_doctor(req, registry) diff --git a/src/cocoindex_code/git_context.py b/src/cocoindex_code/git_context.py new file mode 100644 index 0000000..0fe68d9 --- /dev/null +++ b/src/cocoindex_code/git_context.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from .version_control import GitContextError, GitStatusEntry, normalize_remote_url +from .version_control import Worktree as WorktreeContext +from .version_control import resolve_worktree as resolve_worktree_context + +__all__ = [ + "GitContextError", + "GitStatusEntry", + "WorktreeContext", + "normalize_remote_url", + "resolve_worktree_context", +] diff --git a/src/cocoindex_code/indexer.py b/src/cocoindex_code/indexer.py index e028103..2d6ae98 100644 --- a/src/cocoindex_code/indexer.py +++ 
b/src/cocoindex_code/indexer.py @@ -20,6 +20,7 @@ CODEBASE_DIR, EMBEDDER, INDEXING_EMBED_PARAMS, + PROJECT_ROOT, SQLITE_DB, CodeChunk, ) @@ -152,7 +153,7 @@ async def process_file( return suffix = file.file_path.path.suffix - project_root = coco.use_context(CODEBASE_DIR) + project_root = coco.use_context(PROJECT_ROOT) ps = load_project_settings(project_root) ext_lang_map = {f".{lo.ext}": lo.lang for lo in ps.language_overrides} language = ( @@ -197,7 +198,8 @@ async def process(chunk: Chunk) -> None: @coco.fn async def indexer_main() -> None: """Main indexing function - walks files and processes each.""" - project_root = coco.use_context(CODEBASE_DIR) + project_root = coco.use_context(PROJECT_ROOT) + codebase_dir = coco.use_context(CODEBASE_DIR) ps = load_project_settings(project_root) gitignore_spec = load_gitignore_spec(project_root) @@ -218,7 +220,7 @@ async def indexer_main() -> None: included_patterns=ps.include_patterns, excluded_patterns=ps.exclude_patterns, ) - matcher: FilePathMatcher = GitignoreAwareMatcher(base_matcher, gitignore_spec, project_root) + matcher: FilePathMatcher = GitignoreAwareMatcher(base_matcher, gitignore_spec, codebase_dir) files = localfs.walk_dir( CODEBASE_DIR, diff --git a/src/cocoindex_code/layer_store.py b/src/cocoindex_code/layer_store.py new file mode 100644 index 0000000..ff6eeb8 --- /dev/null +++ b/src/cocoindex_code/layer_store.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +from .layers.layer import Layer as LayerRecord +from .layers.layer_kind import LayerKind +from .layers.layer_manifest import LayerManifest as OverlayManifest +from .layers.layer_store import LayerStore + +__all__ = ["LayerKind", "LayerRecord", "LayerStore", "OverlayManifest"] diff --git a/src/cocoindex_code/layered_project.py b/src/cocoindex_code/layered_project.py new file mode 100644 index 0000000..4c736e8 --- /dev/null +++ b/src/cocoindex_code/layered_project.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import asyncio +import 
hashlib +import sqlite3 +from collections.abc import AsyncIterator, Callable +from pathlib import Path +from typing import Any + +from .layers import LayerBuildResult, LayerStack, LayerStore +from .project import Project +from .protocol import ( + IndexingProgress, + IndexResponse, + IndexStreamResponse, + ProjectStatusResponse, + SearchResult, +) +from .settings import load_project_settings +from .shared import Embedder +from .version_control import resolve_worktree + + +def _sha_short(value: str) -> str: + return hashlib.sha256(value.encode()).hexdigest()[:24] + + +def build_index_config_hash( + project_root: Path, + *, + indexing_params: dict[str, Any], + query_params: dict[str, Any], +) -> str: + settings = load_project_settings(project_root) + seed = repr( + ( + settings.include_patterns, + settings.exclude_patterns, + [(lo.ext, lo.lang) for lo in settings.language_overrides], + [(cm.ext, cm.module) for cm in settings.chunkers], + sorted(indexing_params.items()), + sorted(query_params.items()), + ) + ) + return _sha_short(seed) + + +class LayeredProject: + """A Project-compatible facade backed by base/branch/dirty Git layers.""" + + def __init__( + self, + *, + project_root: Path, + cwd: Path, + base_ref: str | None, + state_dir: Path, + store: LayerStore, + embedder: Embedder, + indexing_params: dict[str, Any], + query_params: dict[str, Any], + chunker_registry: dict[str, Any], + project_cache: dict[str, Project], + ) -> None: + self.project_root = project_root + self.cwd = cwd + self.base_ref = base_ref + self.state_dir = state_dir + self.store = store + self.embedder = embedder + self.indexing_params = indexing_params + self.query_params = query_params + self.chunker_registry = chunker_registry + self.project_cache = project_cache + self._stack = LayerStack( + project_root=project_root, + state_dir=state_dir, + store=store, + embedder=embedder, + indexing_params=indexing_params, + query_params=query_params, + chunker_registry=chunker_registry, + 
project_cache=project_cache, + ) + self._index_lock = asyncio.Lock() + self._initial_index_done = asyncio.Event() + self._indexing_stats: IndexingProgress | None = None + self._last_layers: list[LayerBuildResult] = [] + + @property + def should_wait_for_indexing(self) -> bool: + return not self._initial_index_done.is_set() + + @property + def indexing_stats(self) -> IndexingProgress | None: + return self._indexing_stats + + def close(self) -> None: + for project in self.project_cache.values(): + project.close() + + async def ensure_indexing_started(self) -> None: + if self._initial_index_done.is_set() or self._index_lock.locked(): + return + await self.run_index() + + async def wait_for_indexing_done(self) -> None: + await self._initial_index_done.wait() + if self._index_lock.locked(): + async with self._index_lock: + pass + + async def stream_index(self) -> AsyncIterator[IndexStreamResponse]: + if self._index_lock.locked(): + from .protocol import IndexWaitingNotice + + yield IndexWaitingNotice() + try: + await self.run_index() + yield IndexResponse(success=True) + except Exception as e: + yield IndexResponse(success=False, message=str(e)) + + async def run_index( + self, + on_progress: Callable[[IndexingProgress], None] | None = None, + on_started: asyncio.Event | None = None, + ) -> None: + async with self._index_lock: + self._indexing_stats = IndexingProgress(0, 0, 0, 0, 0, 0) + if on_started is not None: + on_started.set() + try: + self._last_layers = await self._ensure_layers(on_progress=on_progress) + finally: + self._initial_index_done.set() + self._indexing_stats = None + + async def search( + self, + query: str, + languages: list[str] | None = None, + paths: list[str] | None = None, + limit: int = 5, + offset: int = 0, + ) -> list[SearchResult]: + layers = self._last_layers or await self._ensure_layers(on_progress=None) + return await self._stack.search( + layers=layers, + query=query, + languages=languages, + paths=paths, + limit=limit, + offset=offset, 
+ ) + + def get_status(self) -> ProjectStatusResponse: + total_chunks = 0 + total_files_set: set[str] = set() + languages: dict[str, int] = {} + index_exists = bool(self._last_layers) + for layer in self._last_layers: + db_path = layer.layer.paths.target_sqlite + if not db_path.exists(): + continue + try: + conn = sqlite3.connect(db_path) + try: + total_chunks += conn.execute( + "SELECT COUNT(*) FROM code_chunks_vec" + ).fetchone()[0] + for (path,) in conn.execute("SELECT DISTINCT file_path FROM code_chunks_vec"): + total_files_set.add(path) + for lang, count in conn.execute( + "SELECT language, COUNT(*) FROM code_chunks_vec GROUP BY language" + ): + languages[lang] = languages.get(lang, 0) + count + finally: + conn.close() + except sqlite3.OperationalError: + index_exists = False + return ProjectStatusResponse( + indexing=self._index_lock.locked(), + total_chunks=total_chunks, + total_files=len(total_files_set), + languages=languages, + progress=self._indexing_stats, + index_exists=index_exists, + ) + + async def _ensure_layers( + self, + on_progress: Callable[[IndexingProgress], None] | None, + ) -> list[LayerBuildResult]: + config_hash = build_index_config_hash( + self.project_root, + indexing_params=self.indexing_params, + query_params=self.query_params, + ) + worktree = resolve_worktree(self.cwd, base_ref=self.base_ref, index_config_hash=config_hash) + if self.base_ref is None: + stored_base_ref = self.store.get_overlay_base_ref(worktree.repository.id) + if stored_base_ref is not None and stored_base_ref != worktree.branch.base_ref: + worktree = resolve_worktree( + self.cwd, base_ref=stored_base_ref, index_config_hash=config_hash + ) + return await self._stack.ensure( + worktree=worktree, + config_hash=config_hash, + on_progress=on_progress, + ) diff --git a/src/cocoindex_code/layers/__init__.py b/src/cocoindex_code/layers/__init__.py new file mode 100644 index 0000000..0c71473 --- /dev/null +++ b/src/cocoindex_code/layers/__init__.py @@ -0,0 +1,18 @@ +from 
.layer import Layer +from .layer_kind import LayerKind +from .layer_manifest import LayerManifest +from .layer_paths import LayerPaths +from .layer_runtime import LayerRuntime +from .layer_stack import LayerBuildResult, LayerStack +from .layer_store import LayerStore + +__all__ = [ + "Layer", + "LayerBuildResult", + "LayerKind", + "LayerManifest", + "LayerPaths", + "LayerRuntime", + "LayerStack", + "LayerStore", +] diff --git a/src/cocoindex_code/layers/layer.py b/src/cocoindex_code/layers/layer.py new file mode 100644 index 0000000..469dbdf --- /dev/null +++ b/src/cocoindex_code/layers/layer.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +from .layer_kind import LayerKind +from .layer_manifest import LayerManifest +from .layer_paths import LayerPaths + + +@dataclass(frozen=True) +class Layer: + id: str + repo_id: str + kind: LayerKind + paths: LayerPaths + manifest: LayerManifest | None + ref_name: str | None + commit_hash: str | None + base_commit_hash: str | None + merge_base_hash: str | None + base_layer_id: str | None + worktree_id: str | None + config_hash: str | None + status: str + created_at: float + last_accessed_at: float + + @property + def layer_id(self) -> str: + return self.id + + @property + def source_dir(self) -> Path: + return self.paths.source + + @property + def db_dir(self) -> Path: + return self.paths.db_dir + + @property + def commit(self) -> str | None: + return self.commit_hash + + @property + def base_commit(self) -> str | None: + return self.base_commit_hash diff --git a/src/cocoindex_code/layers/layer_kind.py b/src/cocoindex_code/layers/layer_kind.py new file mode 100644 index 0000000..e62a181 --- /dev/null +++ b/src/cocoindex_code/layers/layer_kind.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from enum import StrEnum + + +class LayerKind(StrEnum): + BASE = "base" + BRANCH = "branch" + DIRTY = "dirty" diff --git 
@dataclass(frozen=True)
class LayerManifest:
    """Immutable record of the paths a layer affects or tombstones."""

    affected_paths: frozenset[str]
    tombstoned_paths: frozenset[str]
    created_at: float
    # ``None`` means no expiry was recorded for the layer.
    expires_at: float | None


@dataclass(frozen=True)
class LayerPaths:
    """Filesystem locations that belong to a single layer."""

    root: Path
    source: Path
    cocoindex_db: Path
    target_sqlite: Path

    @property
    def db_dir(self) -> Path:
        """Directory that contains ``cocoindex_db``."""
        return self.cocoindex_db.parent

    @classmethod
    def for_layer(cls, state_dir: Path, repo_id: str, layer_id: str) -> LayerPaths:
        """Derive the layout ``<state_dir>/repos/<repo_id>/layers/<layer_id>``.

        Sources live in ``src`` and both database files in ``db`` under
        that layer root.
        """
        layer_root = state_dir.joinpath("repos", repo_id, "layers", layer_id)
        database_dir = layer_root.joinpath("db")
        return cls(
            root=layer_root,
            source=layer_root.joinpath("src"),
            cocoindex_db=database_dir.joinpath("cocoindex.db"),
            target_sqlite=database_dir.joinpath("target_sqlite.db"),
        )


class LayerRuntime:
    """CocoIndex adapter for one immutable overlay layer."""

    def __init__(
        self,
        *,
        layer: Layer,
        project: Project,
        environment_strategy: str = "per-layer",
    ) -> None:
        """Store the layer, its project and the environment strategy label."""
        self.layer = layer
        self.project = project
        self.environment_strategy = environment_strategy

    @classmethod
    async def create(
        cls,
        *,
        layer: Layer,
        project_root: Path,
        embedder: Embedder,
        indexing_params: dict[str, Any],
        query_params: dict[str, Any],
        chunker_registry: dict[str, Any],
        project_cache: dict[str, Project],
    ) -> LayerRuntime:
        """Return a runtime for *layer*, reusing a cached project when present.

        On a cache miss a project is created with the layer's own source
        and database directories and stored in *project_cache* under
        ``layer.id``.
        """
        project = project_cache.get(layer.id)
        if project is not None:
            return cls(layer=layer, project=project)

        project = await Project.create(
            project_root,
            embedder,
            indexing_params=indexing_params,
            query_params=query_params,
            chunker_registry=chunker_registry,
            source_root=layer.paths.source,
            db_dir=layer.paths.db_dir,
        )
        project_cache[layer.id] = project
        return cls(layer=layer, project=project)

    async def run_index(
        self, on_progress: Callable[[IndexingProgress], None] | None = None
    ) -> None:
        """Run indexing on the wrapped project, forwarding *on_progress*."""
        await self.project.run_index(on_progress=on_progress)
_BRANCH_TTL_SECONDS = 14 * 24 * 60 * 60  # 14 days
_DIRTY_TTL_SECONDS = 24 * 60 * 60  # 24 hours


def _sha_short(value: str) -> str:
    """Return the first 24 hex digits of the SHA-256 digest of *value*."""
    return hashlib.sha256(value.encode()).hexdigest()[:24]


@dataclass(frozen=True)
class LayerBuildResult:
    """A layer together with its manifest and the runtime used to query it."""

    layer: Layer
    manifest: LayerManifest
    runtime: LayerRuntime

    @property
    def record(self) -> Layer:
        """Alias of ``layer``."""
        return self.layer

    @property
    def project(self) -> Project:
        """The project held by ``runtime``."""
        return self.runtime.project


class LayerStack:
    """Builds and queries ordered Git overlay layers."""

    def __init__(
        self,
        *,
        project_root: Path,
        state_dir: Path,
        store: LayerStore,
        embedder: Embedder,
        indexing_params: dict[str, Any],
        query_params: dict[str, Any],
        chunker_registry: dict[str, Any],
        project_cache: dict[str, Project],
    ) -> None:
        """Keep references to the collaborators; no work is done here."""
        self.project_root = project_root
        self.state_dir = state_dir
        self.store = store
        self.embedder = embedder
        self.indexing_params = indexing_params
        self.query_params = query_params
        self.chunker_registry = chunker_registry
        self.project_cache = project_cache

    async def ensure(
        self,
        *,
        worktree: Worktree,
        config_hash: str,
        on_progress: Callable[[IndexingProgress], None] | None,
    ) -> list[LayerBuildResult]:
        """Make sure every layer for *worktree* exists and return them.

        The returned list is ordered highest priority first: dirty (if any),
        then branch (if any), then base. Each returned layer has its
        last-access time refreshed in the store.
        """
        self.store.upsert_repository(
            repo_id=worktree.repository.id,
            repo_name=worktree.repository.repo_name,
            remote_url=worktree.repository.remote_url,
            normalized_remote_url=worktree.repository.normalized_remote_url,
            repo_relative_root=worktree.repository.repo_relative_root,
            last_seen_root=worktree.repository.last_seen_root,
        )
        self.store.upsert_worktree(
            worktree_id=worktree.id,
            repo_id=worktree.repository.id,
            worktree_name=worktree.name,
            branch_name=worktree.branch.name,
            last_seen_path=worktree.path,
        )
        base = await self._ensure_base(worktree, config_hash, on_progress)
        layers: list[LayerBuildResult] = [base]
        branch = await self._ensure_branch(worktree, base.layer.id, config_hash, on_progress)
        if branch is not None:
            layers.insert(0, branch)
        dirty = await self._ensure_dirty(worktree, base.layer.id, config_hash, on_progress)
        if dirty is not None:
            # Inserted last so it ends up in front of the branch layer.
            layers.insert(0, dirty)
        for layer in layers:
            self.store.touch_layer(layer.layer.id)
        return layers

    async def _ensure_base(
        self,
        worktree: Worktree,
        config_hash: str,
        on_progress: Callable[[IndexingProgress], None] | None,
    ) -> LayerBuildResult:
        """Ensure the base layer: a full materialisation of the base commit.

        The layer id is derived from the repository, base ref, base commit
        and config hash. The base layer has no expiry and an empty manifest.
        """
        layer_id = _sha_short(
            "\0".join(
                [
                    "base",
                    worktree.repository.id,
                    worktree.branch.base_ref,
                    worktree.branch.base_commit,
                    config_hash,
                ]
            )
        )
        return await self._ensure_layer(
            worktree=worktree,
            layer_id=layer_id,
            kind=LayerKind.BASE,
            ref_name=worktree.branch.base_ref,
            commit=worktree.branch.base_commit,
            base_commit=None,
            merge_base=None,
            base_layer_id=None,
            worktree_id=None,
            config_hash=config_hash,
            expires_at=None,
            materialize=lambda source_dir: materialize_commit(
                worktree.repository.root, worktree.branch.base_commit, source_dir
            ),
            affected_paths=(),
            tombstoned_paths=(),
            on_progress=on_progress,
        )

    async def _ensure_branch(
        self,
        worktree: Worktree,
        base_layer_id: str,
        config_hash: str,
        on_progress: Callable[[IndexingProgress], None] | None,
    ) -> LayerBuildResult | None:
        """Ensure the branch layer, or return ``None`` when there are no changes.

        Only the paths changed between the merge base and the branch head
        are materialised. The layer expires ``_BRANCH_TTL_SECONDS`` after it
        is built.
        """
        changes = branch_changes(
            worktree.repository.root, worktree.branch.merge_base, worktree.branch.head_commit
        )
        if changes.is_empty:
            return None
        layer_id = _sha_short(
            "\0".join(
                [
                    "branch",
                    worktree.repository.id,
                    worktree.branch.name,
                    worktree.branch.head_commit,
                    worktree.branch.merge_base,
                    base_layer_id,
                    config_hash,
                ]
            )
        )
        return await self._ensure_layer(
            worktree=worktree,
            layer_id=layer_id,
            kind=LayerKind.BRANCH,
            ref_name=worktree.branch.name,
            commit=worktree.branch.head_commit,
            base_commit=worktree.branch.merge_base,
            merge_base=worktree.branch.merge_base,
            base_layer_id=base_layer_id,
            worktree_id=None,
            config_hash=config_hash,
            expires_at=time.time() + _BRANCH_TTL_SECONDS,
            materialize=lambda source_dir: materialize_paths_from_commit(
                worktree.repository.root,
                worktree.branch.head_commit,
                changes.affected_paths,
                source_dir,
            ),
            affected_paths=changes.affected_paths,
            tombstoned_paths=changes.tombstoned_paths,
            on_progress=on_progress,
        )

    async def _ensure_dirty(
        self,
        worktree: Worktree,
        base_layer_id: str,
        config_hash: str,
        on_progress: Callable[[IndexingProgress], None] | None,
    ) -> LayerBuildResult | None:
        """Ensure the dirty layer, or return ``None`` without a snapshot hash.

        Only the worktree's dirty paths are materialised. The layer expires
        ``_DIRTY_TTL_SECONDS`` after it is built.
        """
        if worktree.dirty.snapshot_hash is None:
            return None
        layer_id = _sha_short(
            "\0".join(
                [
                    "dirty",
                    worktree.repository.id,
                    worktree.id,
                    worktree.branch.name,
                    worktree.branch.head_commit,
                    worktree.dirty.snapshot_hash,
                    config_hash,
                ]
            )
        )
        return await self._ensure_layer(
            worktree=worktree,
            layer_id=layer_id,
            kind=LayerKind.DIRTY,
            ref_name=worktree.branch.name,
            commit=worktree.branch.head_commit,
            base_commit=worktree.branch.merge_base,
            merge_base=worktree.branch.merge_base,
            base_layer_id=base_layer_id,
            worktree_id=worktree.id,
            config_hash=config_hash,
            expires_at=time.time() + _DIRTY_TTL_SECONDS,
            materialize=lambda source_dir: materialize_paths_from_worktree(
                worktree.repository.root, worktree.dirty.affected_paths, source_dir
            ),
            affected_paths=worktree.dirty.affected_paths,
            tombstoned_paths=worktree.dirty.tombstoned_paths,
            on_progress=on_progress,
        )

    @staticmethod
    def _needs_rebuild(existing: Layer | None, paths: LayerPaths) -> bool:
        """Return whether the layer must be (re)built before it can be used.

        A layer is reusable only when it is recorded as ``"ready"``, still
        has a manifest, and its target SQLite file exists on disk.
        """
        if existing is None or existing.status != "ready":
            return True
        if existing.manifest is None:
            # Without a manifest the layer cannot be returned; rebuilding
            # repairs it instead of failing on every call.
            return True
        return not paths.target_sqlite.exists()

    def _evict_cached_project(self, layer_id: str) -> None:
        """Forget any project cached for *layer_id* (no-op when absent)."""
        self.project_cache.pop(layer_id, None)

    async def _ensure_layer(
        self,
        *,
        worktree: Worktree,
        layer_id: str,
        kind: LayerKind,
        ref_name: str | None,
        commit: str | None,
        base_commit: str | None,
        merge_base: str | None,
        base_layer_id: str | None,
        worktree_id: str | None,
        config_hash: str,
        expires_at: float | None,
        materialize: Callable[[Path], None],
        affected_paths: tuple[str, ...],
        tombstoned_paths: tuple[str, ...],
        on_progress: Callable[[IndexingProgress], None] | None,
    ) -> LayerBuildResult:
        """Return the layer *layer_id*, building and indexing it if needed.

        A build wipes the layer directory, records the layer as
        ``"building"``, materialises the sources, runs the index, stores the
        manifest and finally marks the layer ``"ready"``.

        Raises:
            RuntimeError: if the layer row or its manifest cannot be read
                back from the store.
        """
        paths = LayerPaths.for_layer(self.state_dir, worktree.repository.id, layer_id)
        existing = self.store.get_layer(layer_id)
        if self._needs_rebuild(existing, paths):
            # A project cached under this id was created against the
            # directory that is about to be deleted; reusing it would index
            # into files that no longer exist at these paths.
            # NOTE(review): the evicted project is only dropped, not closed;
            # no close API is visible here.
            self._evict_cached_project(layer_id)
            shutil.rmtree(paths.root, ignore_errors=True)
            paths.source.mkdir(parents=True, exist_ok=True)
            paths.db_dir.mkdir(parents=True, exist_ok=True)
            self.store.upsert_layer(
                layer_id=layer_id,
                repo_id=worktree.repository.id,
                kind=kind,
                ref_name=ref_name,
                commit=commit,
                base_commit=base_commit,
                merge_base=merge_base,
                base_layer_id=base_layer_id,
                worktree_id=worktree_id,
                config_hash=config_hash,
                source_dir=paths.source,
                db_dir=paths.db_dir,
                status="building",
            )
            materialize(paths.source)
            layer = self._require_layer(layer_id)
            runtime = await self._runtime(layer)
            await runtime.run_index(on_progress=on_progress)
            self.store.replace_manifest(
                layer_id,
                affected_paths=affected_paths,
                tombstoned_paths=tombstoned_paths,
                expires_at=expires_at,
            )
            # Marked ready only after indexing and the manifest write both
            # succeeded, so a failed build is retried on the next call.
            self.store.mark_layer_ready(layer_id)
        layer = self._require_layer(layer_id)
        manifest = self.store.get_manifest(layer_id)
        if manifest is None:
            raise RuntimeError(f"Layer manifest missing after build: {layer_id}")
        runtime = await self._runtime(layer)
        return LayerBuildResult(layer=layer, manifest=manifest, runtime=runtime)

    def _require_layer(self, layer_id: str) -> Layer:
        """Fetch *layer_id* from the store, raising ``RuntimeError`` if absent."""
        layer = self.store.get_layer(layer_id)
        if layer is None:
            raise RuntimeError(f"Layer metadata missing after build: {layer_id}")
        return layer

    async def _runtime(self, layer: Layer) -> LayerRuntime:
        """Create (or fetch from the shared cache) the runtime for *layer*."""
        return await LayerRuntime.create(
            layer=layer,
            project_root=self.project_root,
            embedder=self.embedder,
            indexing_params=self.indexing_params,
            query_params=self.query_params,
            chunker_registry=self.chunker_registry,
            project_cache=self.project_cache,
        )

    async def search(
        self,
        *,
        layers: list[LayerBuildResult],
        query: str,
        languages: list[str] | None,
        paths: list[str] | None,
        limit: int,
        offset: int,
    ) -> list[SearchResult]:
        """Search all *layers* with one query embedding and merge the hits.

        *layers* must be ordered highest priority first. A hit from a lower
        layer is dropped when its file path is affected or tombstoned by any
        layer before it. The merged hits are sorted by descending score and
        then sliced by *offset* and *limit*.
        """
        query_embedding = await self.embedder.embed(query, **self.query_params)
        embedding_bytes = query_embedding.astype("float32").tobytes()
        higher_shadowed: set[str] = set()
        merged: list[SearchResult] = []
        for layer in layers:
            raw_results = layer.project.search_with_embedding(
                embedding_bytes=embedding_bytes,
                languages=languages,
                paths=paths,
                # Over-fetch because some hits may be discarded as shadowed;
                # paging is applied once, after the merge.
                limit=limit + offset + 20,
                offset=0,
            )
            for result in raw_results:
                if result.file_path in higher_shadowed:
                    continue
                merged.append(
                    SearchResult(
                        file_path=result.file_path,
                        language=result.language,
                        content=result.content,
                        start_line=result.start_line,
                        end_line=result.end_line,
                        score=result.score,
                        repo_id=layer.layer.repo_id,
                        branch=layer.layer.ref_name,
                        commit=layer.layer.commit_hash,
                        layer_kind=layer.layer.kind.value,
                        layer_id=layer.layer.id,
                    )
                )
            # Applied after this layer's own hits were collected, so a layer
            # never shadows itself, only the layers that follow it.
            higher_shadowed.update(layer.manifest.affected_paths)
            higher_shadowed.update(layer.manifest.tombstoned_paths)
        merged.sort(key=lambda r: r.score, reverse=True)
        return merged[offset : offset + limit]
class _ConnectionScope:
    """Context manager that ends the transaction and then closes the connection.

    Using a ``sqlite3.Connection`` directly in a ``with`` statement only
    commits or rolls back; it leaves the connection open. This wrapper does
    the same commit/rollback and additionally closes the connection.
    """

    def __init__(self, conn: sqlite3.Connection) -> None:
        self._conn = conn

    def __enter__(self) -> sqlite3.Connection:
        return self._conn

    def __exit__(self, exc_type: object, exc_value: object, traceback: object) -> None:
        try:
            if exc_type is None:
                self._conn.commit()
            else:
                self._conn.rollback()
        finally:
            self._conn.close()


class LayerStore:
    """Persistent daemon metadata store for Git overlay control-plane state."""

    def __init__(self, path: Path) -> None:
        """Open (creating if needed) the SQLite metadata database at *path*."""
        self.path = path
        self.state_dir = path.parent
        self.path.parent.mkdir(parents=True, exist_ok=True)
        self._init_schema()

    def _connect(self) -> sqlite3.Connection:
        """Return a new connection with ``sqlite3.Row`` rows; the caller closes it."""
        conn = sqlite3.connect(self.path)
        conn.row_factory = sqlite3.Row
        return conn

    def _session(self) -> _ConnectionScope:
        """Return a ``with`` scope that commits or rolls back and then closes."""
        return _ConnectionScope(self._connect())

    def _init_schema(self) -> None:
        """Create missing tables, add missing ``layers`` columns, record version 1."""
        with self._session() as conn:
            conn.executescript(
                """
                CREATE TABLE IF NOT EXISTS schema_version (
                    version INTEGER PRIMARY KEY
                );

                CREATE TABLE IF NOT EXISTS repositories (
                    repo_id TEXT PRIMARY KEY,
                    repo_name TEXT NOT NULL,
                    remote_url TEXT NOT NULL,
                    normalized_remote_url TEXT NOT NULL,
                    repo_relative_root TEXT NOT NULL,
                    last_seen_root TEXT NOT NULL,
                    last_seen_at REAL NOT NULL
                );

                CREATE TABLE IF NOT EXISTS worktrees (
                    worktree_id TEXT PRIMARY KEY,
                    repo_id TEXT NOT NULL,
                    worktree_name TEXT NOT NULL,
                    branch_name TEXT NOT NULL,
                    last_seen_path TEXT NOT NULL,
                    last_seen_at REAL NOT NULL
                );

                CREATE TABLE IF NOT EXISTS overlay_policies (
                    repo_id TEXT PRIMARY KEY,
                    base_ref TEXT NOT NULL,
                    dirty_enabled INTEGER NOT NULL,
                    environment_strategy TEXT NOT NULL,
                    branch_ttl_seconds REAL NOT NULL,
                    dirty_ttl_seconds REAL NOT NULL,
                    updated_at REAL NOT NULL
                );

                CREATE TABLE IF NOT EXISTS layers (
                    layer_id TEXT PRIMARY KEY,
                    repo_id TEXT NOT NULL,
                    kind TEXT NOT NULL,
                    ref_name TEXT,
                    commit_sha TEXT,
                    base_commit TEXT,
                    merge_base TEXT,
                    base_layer_id TEXT,
                    worktree_id TEXT,
                    config_hash TEXT,
                    source_dir TEXT NOT NULL,
                    db_dir TEXT NOT NULL,
                    status TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    last_accessed_at REAL NOT NULL
                );

                CREATE TABLE IF NOT EXISTS overlay_manifests (
                    layer_id TEXT PRIMARY KEY,
                    affected_paths_json TEXT NOT NULL,
                    tombstoned_paths_json TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    expires_at REAL
                );
                """
            )
            # An existing ``layers`` table may lack these columns; add them.
            columns = {row[1] for row in conn.execute("PRAGMA table_info(layers)").fetchall()}
            for column in ("merge_base", "worktree_id", "config_hash"):
                if column not in columns:
                    conn.execute(f"ALTER TABLE layers ADD COLUMN {column} TEXT")
            conn.execute("INSERT OR IGNORE INTO schema_version(version) VALUES (1)")

    def upsert_overlay_policy(
        self,
        *,
        repo_id: str,
        base_ref: str,
        dirty_enabled: bool = True,
        environment_strategy: str = "per-layer",
        branch_ttl_seconds: float = 14 * 24 * 60 * 60,
        dirty_ttl_seconds: float = 24 * 60 * 60,
    ) -> None:
        """Insert or replace the overlay policy row for *repo_id*."""
        now = time.time()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO overlay_policies (
                    repo_id, base_ref, dirty_enabled, environment_strategy,
                    branch_ttl_seconds, dirty_ttl_seconds, updated_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(repo_id) DO UPDATE SET
                    base_ref=excluded.base_ref,
                    dirty_enabled=excluded.dirty_enabled,
                    environment_strategy=excluded.environment_strategy,
                    branch_ttl_seconds=excluded.branch_ttl_seconds,
                    dirty_ttl_seconds=excluded.dirty_ttl_seconds,
                    updated_at=excluded.updated_at
                """,
                (
                    repo_id,
                    base_ref,
                    1 if dirty_enabled else 0,
                    environment_strategy,
                    branch_ttl_seconds,
                    dirty_ttl_seconds,
                    now,
                ),
            )

    def get_overlay_base_ref(self, repo_id: str) -> str | None:
        """Return the stored base ref for *repo_id*, or ``None`` if no policy exists."""
        with self._session() as conn:
            row = conn.execute(
                "SELECT base_ref FROM overlay_policies WHERE repo_id = ?", (repo_id,)
            ).fetchone()
        return row["base_ref"] if row is not None else None

    def upsert_repository(
        self,
        *,
        repo_id: str,
        repo_name: str,
        remote_url: str,
        normalized_remote_url: str,
        repo_relative_root: str,
        last_seen_root: Path,
    ) -> None:
        """Insert or refresh the repository row and stamp ``last_seen_at``."""
        now = time.time()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO repositories (
                    repo_id, repo_name, remote_url, normalized_remote_url,
                    repo_relative_root, last_seen_root, last_seen_at
                ) VALUES (?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(repo_id) DO UPDATE SET
                    repo_name=excluded.repo_name,
                    remote_url=excluded.remote_url,
                    normalized_remote_url=excluded.normalized_remote_url,
                    repo_relative_root=excluded.repo_relative_root,
                    last_seen_root=excluded.last_seen_root,
                    last_seen_at=excluded.last_seen_at
                """,
                (
                    repo_id,
                    repo_name,
                    remote_url,
                    normalized_remote_url,
                    repo_relative_root,
                    str(last_seen_root),
                    now,
                ),
            )

    def upsert_worktree(
        self,
        *,
        worktree_id: str,
        repo_id: str,
        worktree_name: str,
        branch_name: str,
        last_seen_path: Path,
    ) -> None:
        """Insert or refresh the worktree row and stamp ``last_seen_at``."""
        now = time.time()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO worktrees (
                    worktree_id, repo_id, worktree_name, branch_name,
                    last_seen_path, last_seen_at
                ) VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(worktree_id) DO UPDATE SET
                    repo_id=excluded.repo_id,
                    worktree_name=excluded.worktree_name,
                    branch_name=excluded.branch_name,
                    last_seen_path=excluded.last_seen_path,
                    last_seen_at=excluded.last_seen_at
                """,
                (worktree_id, repo_id, worktree_name, branch_name, str(last_seen_path), now),
            )

    def _row_to_layer(self, row: sqlite3.Row) -> Layer:
        """Build a ``Layer`` from a ``layers`` row, loading its manifest if any."""
        db_dir = Path(row["db_dir"])
        manifest = self.get_manifest(row["layer_id"])
        return Layer(
            id=row["layer_id"],
            repo_id=row["repo_id"],
            kind=LayerKind(row["kind"]),
            paths=LayerPaths(
                # The layer root is taken to be the parent of the source dir.
                root=Path(row["source_dir"]).parent,
                source=Path(row["source_dir"]),
                cocoindex_db=db_dir / "cocoindex.db",
                target_sqlite=db_dir / "target_sqlite.db",
            ),
            manifest=manifest,
            ref_name=row["ref_name"],
            commit_hash=row["commit_sha"],
            base_commit_hash=row["base_commit"],
            merge_base_hash=row["merge_base"],
            base_layer_id=row["base_layer_id"],
            worktree_id=row["worktree_id"],
            config_hash=row["config_hash"],
            status=row["status"],
            created_at=row["created_at"],
            last_accessed_at=row["last_accessed_at"],
        )

    def upsert_layer(
        self,
        *,
        layer_id: str,
        repo_id: str,
        kind: LayerKind,
        ref_name: str | None,
        commit: str | None,
        base_commit: str | None,
        base_layer_id: str | None,
        source_dir: Path,
        db_dir: Path,
        status: str,
        merge_base: str | None = None,
        worktree_id: str | None = None,
        config_hash: str | None = None,
    ) -> Layer:
        """Insert or update a layer row and return the stored record.

        ``created_at`` of an existing row is preserved; ``last_accessed_at``
        is always set to the current time.

        Raises:
            RuntimeError: if the row cannot be read back after the write.
        """
        now = time.time()
        existing = self.get_layer(layer_id)
        created_at = existing.created_at if existing is not None else now
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO layers (
                    layer_id, repo_id, kind, ref_name, commit_sha, base_commit,
                    merge_base, base_layer_id, worktree_id, config_hash,
                    source_dir, db_dir, status, created_at, last_accessed_at
                )
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(layer_id) DO UPDATE SET
                    repo_id=excluded.repo_id,
                    kind=excluded.kind,
                    ref_name=excluded.ref_name,
                    commit_sha=excluded.commit_sha,
                    base_commit=excluded.base_commit,
                    merge_base=excluded.merge_base,
                    base_layer_id=excluded.base_layer_id,
                    worktree_id=excluded.worktree_id,
                    config_hash=excluded.config_hash,
                    source_dir=excluded.source_dir,
                    db_dir=excluded.db_dir,
                    status=excluded.status,
                    last_accessed_at=excluded.last_accessed_at
                """,
                (
                    layer_id,
                    repo_id,
                    kind.value,
                    ref_name,
                    commit,
                    base_commit,
                    merge_base,
                    base_layer_id,
                    worktree_id,
                    config_hash,
                    str(source_dir),
                    str(db_dir),
                    status,
                    created_at,
                    now,
                ),
            )
        record = self.get_layer(layer_id)
        if record is None:
            # An explicit raise survives ``python -O``, unlike ``assert``.
            raise RuntimeError(f"Layer row missing after upsert: {layer_id}")
        return record

    def get_layer(self, layer_id: str) -> Layer | None:
        """Return the layer with *layer_id*, or ``None`` when it is not stored."""
        with self._session() as conn:
            row = conn.execute("SELECT * FROM layers WHERE layer_id = ?", (layer_id,)).fetchone()
        return self._row_to_layer(row) if row is not None else None

    def mark_layer_ready(self, layer_id: str) -> None:
        """Set the layer's status to ``'ready'`` and refresh its access time."""
        with self._session() as conn:
            conn.execute(
                "UPDATE layers SET status = 'ready', last_accessed_at = ? WHERE layer_id = ?",
                (time.time(), layer_id),
            )

    def touch_layer(self, layer_id: str) -> None:
        """Refresh the layer's ``last_accessed_at`` timestamp."""
        with self._session() as conn:
            conn.execute(
                "UPDATE layers SET last_accessed_at = ? WHERE layer_id = ?",
                (time.time(), layer_id),
            )

    def replace_manifest(
        self,
        layer_id: str,
        *,
        affected_paths: list[str] | tuple[str, ...],
        tombstoned_paths: list[str] | tuple[str, ...],
        expires_at: float | None,
    ) -> None:
        """Store the manifest for *layer_id*, replacing paths and expiry.

        Paths are de-duplicated and sorted before being stored as JSON. On
        replacement the original ``created_at`` is kept.
        """
        now = time.time()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO overlay_manifests (
                    layer_id, affected_paths_json, tombstoned_paths_json,
                    created_at, expires_at
                )
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(layer_id) DO UPDATE SET
                    affected_paths_json=excluded.affected_paths_json,
                    tombstoned_paths_json=excluded.tombstoned_paths_json,
                    expires_at=excluded.expires_at
                """,
                (
                    layer_id,
                    json.dumps(sorted(set(affected_paths))),
                    json.dumps(sorted(set(tombstoned_paths))),
                    now,
                    expires_at,
                ),
            )

    def get_manifest(self, layer_id: str) -> LayerManifest | None:
        """Return the manifest stored for *layer_id*, or ``None`` if absent."""
        with self._session() as conn:
            row = conn.execute(
                "SELECT * FROM overlay_manifests WHERE layer_id = ?", (layer_id,)
            ).fetchone()
        if row is None:
            return None
        return LayerManifest(
            affected_paths=frozenset(json.loads(row["affected_paths_json"])),
            tombstoned_paths=frozenset(json.loads(row["tombstoned_paths_json"])),
            created_at=row["created_at"],
            expires_at=row["expires_at"],
        )

    def list_layers(self, *, repo_id: str | None = None) -> list[Layer]:
        """Return layers ordered by creation time, optionally for one repository."""
        with self._session() as conn:
            if repo_id is None:
                rows = conn.execute("SELECT * FROM layers ORDER BY created_at").fetchall()
            else:
                rows = conn.execute(
                    "SELECT * FROM layers WHERE repo_id = ? ORDER BY created_at",
                    (repo_id,),
                ).fetchall()
        return [self._row_to_layer(row) for row in rows]

    def prune_expired(self, now: float | None = None) -> list[Layer]:
        """Delete layers whose manifest expired before *now* and return them.

        *now* defaults to the current time. Layers without an expiry are
        never pruned. Only database rows are removed here; no files are
        touched by this method.
        """
        cutoff = time.time() if now is None else now
        with self._session() as conn:
            rows = conn.execute(
                """
                SELECT layers.*
                FROM layers
                JOIN overlay_manifests USING(layer_id)
                WHERE overlay_manifests.expires_at IS NOT NULL
                  AND overlay_manifests.expires_at < ?
                """,
                (cutoff,),
            ).fetchall()
        if not rows:
            return []
        # Build the records before deleting anything so the returned layers
        # still carry their manifests.
        expired = [self._row_to_layer(row) for row in rows]
        layer_ids = [(layer.id,) for layer in expired]
        with self._session() as conn:
            conn.executemany("DELETE FROM overlay_manifests WHERE layer_id = ?", layer_ids)
            conn.executemany("DELETE FROM layers WHERE layer_id = ?", layer_ids)
        return expired
target_sqlite_db_path=self._target_sqlite_db_path, + env=self._env, + limit=limit, + offset=offset, + languages=languages, + paths=paths, + ) + return [ + SearchResult( + file_path=r.file_path, + language=r.language, + content=r.content, + start_line=r.start_line, + end_line=r.end_line, + score=r.score, + ) + for r in results + ] + + def search_with_embedding( + self, + embedding_bytes: bytes, + languages: list[str] | None = None, + paths: list[str] | None = None, + limit: int = 5, + offset: int = 0, + ) -> list[SearchResult]: + """Search using a caller-provided query embedding.""" + results = query_codebase_with_embedding( + embedding_bytes=embedding_bytes, + target_sqlite_db_path=self._target_sqlite_db_path, env=self._env, limit=limit, offset=offset, @@ -263,6 +289,8 @@ async def create( indexing_params: dict[str, Any], query_params: dict[str, Any], chunker_registry: dict[str, ChunkerFn] | None = None, + source_root: Path | None = None, + db_dir: Path | None = None, ) -> Project: """Create a project with explicit embedder and per-call params. @@ -282,19 +310,21 @@ async def create( to a ``ChunkerFn``. When a suffix matches, the registered chunker is called instead of the built-in splitter. 
""" + source_root = source_root or project_root settings_dir = project_root / ".cocoindex_code" settings_dir.mkdir(parents=True, exist_ok=True) - db_dir = resolve_db_dir(project_root) + db_dir = db_dir or resolve_db_dir(project_root) db_dir.mkdir(parents=True, exist_ok=True) - cocoindex_db = _cocoindex_db_path(project_root) - target_sqlite_db = _target_sqlite_db_path(project_root) + cocoindex_db = db_dir / "cocoindex.db" + target_sqlite_db = db_dir / "target_sqlite.db" settings = coco.Settings.from_env(cocoindex_db) context = coco.ContextProvider() - context.provide(CODEBASE_DIR, project_root) + context.provide(CODEBASE_DIR, source_root) + context.provide(PROJECT_ROOT, project_root) context.provide(SQLITE_DB, coco_sqlite.connect(str(target_sqlite_db), load_vec=True)) context.provide(EMBEDDER, embedder) context.provide(INDEXING_EMBED_PARAMS, dict(indexing_params)) @@ -314,6 +344,8 @@ async def create( result._env = env result._app = app result._project_root = project_root + result._source_root = source_root + result._target_sqlite_db_path = target_sqlite_db result._index_lock = asyncio.Lock() result._initial_index_done = asyncio.Event() return result diff --git a/src/cocoindex_code/protocol.py b/src/cocoindex_code/protocol.py index b584a4d..52e7e7c 100644 --- a/src/cocoindex_code/protocol.py +++ b/src/cocoindex_code/protocol.py @@ -15,11 +15,15 @@ class HandshakeRequest(_msgspec.Struct, tag="handshake"): class IndexRequest(_msgspec.Struct, tag="index"): project_root: str + cwd: str | None = None + base_ref: str | None = None class SearchRequest(_msgspec.Struct, tag="search"): project_root: str query: str + cwd: str | None = None + base_ref: str | None = None languages: list[str] | None = None paths: list[str] | None = None limit: int = 5 @@ -50,6 +54,16 @@ class DaemonEnvRequest(_msgspec.Struct, tag="daemon_env"): pass +class OverlayStatusRequest(_msgspec.Struct, tag="overlay_status"): + project_root: str + cwd: str | None = None + base_ref: str | None = None + + 
+class OverlayPruneRequest(_msgspec.Struct, tag="overlay_prune"): + pass + + Request = ( HandshakeRequest | IndexRequest @@ -60,6 +74,8 @@ class DaemonEnvRequest(_msgspec.Struct, tag="daemon_env"): | StopRequest | DoctorRequest | DaemonEnvRequest + | OverlayStatusRequest + | OverlayPruneRequest ) # --------------------------------------------------------------------------- @@ -111,6 +127,11 @@ class SearchResult(_msgspec.Struct): start_line: int end_line: int score: float + repo_id: str | None = None + branch: str | None = None + commit: str | None = None + layer_kind: str | None = None + layer_id: str | None = None class SearchResponse(_msgspec.Struct, tag="search"): @@ -173,6 +194,26 @@ class DaemonEnvResponse(_msgspec.Struct, tag="daemon_env"): host_path_mappings: list[DbPathMappingEntry] = [] +class OverlayLayerInfo(_msgspec.Struct): + layer_id: str + repo_id: str + kind: str + ref_name: str | None + commit: str | None + status: str + affected_count: int = 0 + tombstoned_count: int = 0 + + +class OverlayStatusResponse(_msgspec.Struct, tag="overlay_status"): + repo_id: str | None + layers: list[OverlayLayerInfo] + + +class OverlayPruneResponse(_msgspec.Struct, tag="overlay_prune"): + pruned_layer_ids: list[str] + + class ErrorResponse(_msgspec.Struct, tag="error"): message: str @@ -189,6 +230,8 @@ class ErrorResponse(_msgspec.Struct, tag="error"): | StopResponse | DoctorResponse | DaemonEnvResponse + | OverlayStatusResponse + | OverlayPruneResponse | ErrorResponse ) diff --git a/src/cocoindex_code/query.py b/src/cocoindex_code/query.py index a2991ee..4925d16 100644 --- a/src/cocoindex_code/query.py +++ b/src/cocoindex_code/query.py @@ -104,7 +104,6 @@ async def query_codebase( "Please run a query with refresh_index=True first." 
) - db = env.get_context(SQLITE_DB) embedder = env.get_context(EMBEDDER) query_params = env.get_context(QUERY_EMBED_PARAMS) @@ -112,7 +111,35 @@ async def query_codebase( query_embedding = await embedder.embed(query, **query_params) embedding_bytes = query_embedding.astype("float32").tobytes() + return query_codebase_with_embedding( + embedding_bytes=embedding_bytes, + target_sqlite_db_path=target_sqlite_db_path, + env=env, + limit=limit, + offset=offset, + languages=languages, + paths=paths, + ) + + +def query_codebase_with_embedding( + *, + embedding_bytes: bytes, + target_sqlite_db_path: Path, + env: Any, + limit: int = 10, + offset: int = 0, + languages: list[str] | None = None, + paths: list[str] | None = None, +) -> list[QueryResult]: + """Perform vector search using a precomputed query embedding.""" + if not target_sqlite_db_path.exists(): + raise RuntimeError( + f"Index database not found at {target_sqlite_db_path}. " + "Please run a query with refresh_index=True first." + ) + db = env.get_context(SQLITE_DB) with db.readonly() as conn: if paths: rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths) diff --git a/src/cocoindex_code/server.py b/src/cocoindex_code/server.py index 2708c86..847b3a3 100644 --- a/src/cocoindex_code/server.py +++ b/src/cocoindex_code/server.py @@ -56,9 +56,10 @@ class SearchResultModel(BaseModel): # === Daemon-backed MCP server factory === -def create_mcp_server(project_root: str) -> FastMCP: +def create_mcp_server(project_root: str, cwd: str | None = None) -> FastMCP: """Create a lightweight MCP server that delegates to the daemon.""" mcp = FastMCP("cocoindex-code", instructions=_MCP_INSTRUCTIONS) + request_cwd = cwd or str(Path.cwd()) @mcp.tool( name="search", @@ -124,12 +125,15 @@ async def search( loop = asyncio.get_event_loop() try: if refresh_index: - await loop.run_in_executor(None, lambda: _client.index(project_root)) + await loop.run_in_executor( + None, lambda: _client.index(project_root, 
cwd=request_cwd) + ) resp = await loop.run_in_executor( None, lambda: _client.search( project_root=project_root, query=query, + cwd=request_cwd, languages=languages, paths=paths, limit=limit, @@ -321,7 +325,7 @@ def _on_progress(progress: IndexingProgress) -> None: print(f"Indexing failed: {resp.message}") else: # Default: run MCP server - mcp_server = create_mcp_server(str(project_root)) + mcp_server = create_mcp_server(str(project_root), cwd=str(Path.cwd().resolve())) async def _serve() -> None: from .cli import _bg_index diff --git a/src/cocoindex_code/shared.py b/src/cocoindex_code/shared.py index b42e722..2b0ce6f 100644 --- a/src/cocoindex_code/shared.py +++ b/src/cocoindex_code/shared.py @@ -31,6 +31,7 @@ EMBEDDER = coco.ContextKey[Embedder]("embedder", detect_change=True) SQLITE_DB = coco.ContextKey[sqlite.ManagedConnection]("index_db") CODEBASE_DIR = coco.ContextKey[pathlib.Path]("codebase") +PROJECT_ROOT = coco.ContextKey[pathlib.Path]("project_root") INDEXING_EMBED_PARAMS = coco.ContextKey[dict[str, Any]]("indexing_embed_params") QUERY_EMBED_PARAMS = coco.ContextKey[dict[str, Any]]("query_embed_params") diff --git a/src/cocoindex_code/version_control/__init__.py b/src/cocoindex_code/version_control/__init__.py new file mode 100644 index 0000000..d197183 --- /dev/null +++ b/src/cocoindex_code/version_control/__init__.py @@ -0,0 +1,16 @@ +from .branch import Branch +from .change_set import ChangeSet, GitStatusEntry +from .git import GitContextError, normalize_remote_url, resolve_worktree +from .repository import Repository +from .worktree import Worktree + +__all__ = [ + "Branch", + "ChangeSet", + "GitContextError", + "GitStatusEntry", + "Repository", + "Worktree", + "normalize_remote_url", + "resolve_worktree", +] diff --git a/src/cocoindex_code/version_control/branch.py b/src/cocoindex_code/version_control/branch.py new file mode 100644 index 0000000..089a5a4 --- /dev/null +++ b/src/cocoindex_code/version_control/branch.py @@ -0,0 +1,12 @@ +from 
@dataclass(frozen=True)
class ChangeSet:
    """Immutable summary of the paths touched by a set of changes.

    Paths are split into two groups: ``affected_paths`` for entries that
    were added or modified, and ``tombstoned_paths`` for entries that were
    removed (or are the old name of a rename).
    """

    # Paths whose current content is part of the change.
    affected_paths: tuple[str, ...]
    # Paths that no longer exist under that name.
    tombstoned_paths: tuple[str, ...]
    # Optional digest identifying the snapshot; ``None`` when not computed.
    snapshot_hash: str | None = None

    @property
    def is_empty(self) -> bool:
        """Return ``True`` when no path is affected and none is tombstoned.

        ``snapshot_hash`` is not consulted.
        """
        return not (self.affected_paths or self.tombstoned_paths)
def normalize_remote_url(url: str) -> str:
    """Normalize common Git remote URL forms into a stable lowercase identity.

    The result has the shape ``host/path``: no scheme, no user name or
    credentials, no ``.git`` suffix, no surrounding slashes. Equivalent
    spellings of the same remote therefore map to the same string, e.g.
    ``git@github.com:Org/Repo.git``, ``ssh://git@github.com/Org/Repo.git``
    and ``https://github.com/org/repo/`` all become ``github.com/org/repo``.

    An explicit port in a URL-form remote is kept as part of the host.
    Anything that is neither scp-like nor a URL with a network location
    (for example a local filesystem path) is returned lowercased with
    surrounding slashes removed.
    """
    # Drop trailing slashes first so "…/repo.git/" still loses its suffix.
    raw = url.strip().rstrip("/").removesuffix(".git")

    if "://" not in raw:
        # scp-like syntax: user@host:path. Requiring the "user@" part keeps
        # Windows drive paths such as "C:/repo" out of this branch.
        location, colon, path = raw.partition(":")
        _user, at, host = location.rpartition("@")
        if colon and at and host and "/" not in location:
            return f"{host.lower()}/{path.strip('/').lower()}"

    parsed = urlparse(raw)
    if parsed.scheme and parsed.netloc:
        # Strip "user[:password]@" so credentials never become part of the
        # identity and ssh/https forms of the same remote agree.
        host = parsed.netloc.rpartition("@")[2]
        return f"{host.lower()}/{parsed.path.strip('/').lower()}"

    return raw.strip("/").lower()
def _paths_from_status(
    entries: tuple[GitStatusEntry, ...],
) -> tuple[tuple[str, ...], tuple[str, ...]]:
    """Split status entries into affected and tombstoned path lists.

    Args:
        entries: Status entries carrying one-character index and worktree
            status codes, a path, and optionally the pre-rename path.

    Returns:
        ``(affected, tombstoned)``, each a sorted tuple without duplicates.
        A path is tombstoned when it is gone from the working tree: either
        the worktree status is ``"D"``, or the deletion is staged
        (index ``"D"``) and the worktree column is blank. Every
        ``original_path`` (old name of a rename) is tombstoned as well.
        All other paths are affected.
    """
    affected: set[str] = set()
    tombstoned: set[str] = set()
    for entry in entries:
        # An index-only deletion with a non-blank worktree status (e.g. "?"
        # after `git rm --cached`) means the file is still on disk, so it
        # must be treated as content to read, not as a removal.
        removed = entry.worktree_status == "D" or (
            entry.index_status == "D" and entry.worktree_status == " "
        )
        (tombstoned if removed else affected).add(entry.path)
        if entry.original_path is not None:
            tombstoned.add(entry.original_path)
    return tuple(sorted(affected)), tuple(sorted(tombstoned))
def branch_changes(repo_root: Path, base: str, head: str) -> ChangeSet:
    """Return the paths that differ between two revisions of a repository.

    Args:
        repo_root: Directory from which the Git repository is discovered.
        base: Revision on the "old" side of the diff.
        head: Revision on the "new" side of the diff.

    Returns:
        A ``ChangeSet`` with sorted, de-duplicated paths. Deleted files and
        the old names of renames are tombstoned; every other change
        (including the new name of a rename or copy) is affected.
        ``snapshot_hash`` is left at its default.

    Raises:
        GitContextError: If the repository cannot be opened or the diff
            between ``base`` and ``head`` cannot be computed.
    """
    repo = _open_repo(repo_root)
    try:
        diff = repo.diff(base, head)
        # Rename/copy detection so moved files are reported as "R"/"C"
        # instead of an unrelated delete + add pair.
        diff.find_similar()
    except (KeyError, ValueError, pygit2.GitError) as e:
        raise GitContextError(f"Cannot diff {base}..{head}") from e

    affected: set[str] = set()
    tombstoned: set[str] = set()
    # Iterate deltas rather than patches: only path/status metadata is
    # needed, so there is no reason to generate hunk data per file, and no
    # changed entry is skipped for lack of a patch object.
    for delta in diff.deltas:
        status = delta.status_char()
        if status == "D":
            tombstoned.add(delta.old_file.path)
            continue
        affected.add(delta.new_file.path)
        if status == "R":
            tombstoned.add(delta.old_file.path)
    return ChangeSet(
        affected_paths=tuple(sorted(affected)),
        tombstoned_paths=tuple(sorted(tombstoned)),
    )
def materialize_paths_from_commit(
    repo_root: Path, commit: str, paths: tuple[str, ...], source_dir: Path
) -> None:
    """Write the given paths, as stored at *commit*, underneath *source_dir*.

    Args:
        repo_root: Directory from which the Git repository is discovered.
        commit: Revision to read from; anything that can be peeled to a
            tree (commit id, branch, annotated tag, ...).
        paths: Repository-relative file paths to export.
        source_dir: Destination root; each file is written at
            ``source_dir / path``, creating parent directories as needed.

    Paths that do not exist at that revision, or that do not refer to a
    file blob (directories, submodule links), are skipped.
    """
    repo = _open_repo(repo_root)
    # Resolve the tree once; peel() also handles revisions that are not a
    # bare commit object, which have no ``.tree`` attribute.
    tree = repo.revparse_single(commit).peel(pygit2.Tree)
    for path in paths:
        try:
            blob = repo[tree[path].id]
        except (KeyError, ValueError, pygit2.GitError):
            # Best effort: a missing path or unavailable object is skipped.
            continue
        if not isinstance(blob, pygit2.Blob):
            continue
        _write_file(source_dir / path, blob.data)
@dataclass(frozen=True)
class Worktree:
    """Immutable description of one Git working tree and its current state.

    Groups the owning repository, the checked-out branch, and the
    uncommitted changes. The properties below are flat read-only shortcuts
    to fields of those nested objects.
    """

    # Identifier of this worktree.
    id: str
    # Root directory of the working tree.
    path: Path
    # Short name of the worktree.
    name: str
    # Repository this worktree belongs to.
    repository: Repository
    # Branch information for the current HEAD.
    branch: Branch
    # Uncommitted changes in the working tree.
    dirty: ChangeSet
    # Status entries the worktree was resolved with.
    status_entries: tuple[GitStatusEntry, ...]

    @property
    def worktree_id(self) -> str:
        """Alias for ``id``."""
        return self.id

    @property
    def repo_id(self) -> str:
        """Identifier of the owning repository."""
        return self.repository.id

    @property
    def repo_root(self) -> Path:
        """Root directory recorded on the repository."""
        return self.repository.root

    @property
    def git_common_dir(self) -> Path:
        """Git common directory recorded on the repository."""
        return self.repository.git_common_dir

    @property
    def remote_url(self) -> str:
        """Remote URL recorded on the repository."""
        return self.repository.remote_url

    @property
    def normalized_remote_url(self) -> str:
        """Normalized remote URL recorded on the repository."""
        return self.repository.normalized_remote_url

    @property
    def branch_name(self) -> str:
        """Name of the current branch."""
        return self.branch.name

    @property
    def head_commit(self) -> str:
        """Head commit recorded on the branch."""
        return self.branch.head_commit

    @property
    def base_ref(self) -> str:
        """Base ref the branch is compared against."""
        return self.branch.base_ref

    @property
    def base_commit(self) -> str:
        """Base commit recorded on the branch."""
        return self.branch.base_commit

    @property
    def merge_base(self) -> str:
        """Merge base recorded on the branch."""
        return self.branch.merge_base

    @property
    def dirty_snapshot_hash(self) -> str | None:
        """Snapshot hash of the uncommitted changes, or ``None`` if unset."""
        return self.dirty.snapshot_hash
container, + [ + "sh", + "-c", + "printf '%s\n%s\n%s\n' " + '"$COCOINDEX_CODE_STATE_DIR" ' + '"$COCOINDEX_CODE_RUNTIME_DIR" ' + '"$COCOINDEX_CODE_DB_PATH_MAPPING"', + ], + ) + lines = env_result.stdout.splitlines() + assert lines == [ + "/var/cocoindex/state", + "/var/run/cocoindex_code", + "/workspace=/var/cocoindex/db", + ] + + dir_result = docker_exec( + container, + ["sh", "-c", "test -d /var/cocoindex/state && test -d /var/cocoindex/db"], + ) + assert dir_result.returncode == 0 + + @pytest.mark.skipif(sys.platform != "linux", reason="PUID/PGID only meaningful on Linux") def test_linux_puid_gives_host_owned_files( docker_image: str, fixture_workspace: Path, tmp_path: Path @@ -230,6 +258,11 @@ def test_linux_puid_gives_host_owned_files( st = settings_file.stat() assert st.st_uid == uid, f"Expected uid {uid}, got {st.st_uid}" assert st.st_gid == gid, f"Expected gid {gid}, got {st.st_gid}" + + state_owner = docker_exec(name, ["stat", "-c", "%u:%g", "/var/cocoindex/state"]) + db_owner = docker_exec(name, ["stat", "-c", "%u:%g", "/var/cocoindex/db"]) + assert state_owner.stdout.strip() == f"{uid}:{gid}" + assert db_owner.stdout.strip() == f"{uid}:{gid}" finally: subprocess.run(["docker", "rm", "-f", name], capture_output=True, check=False) @@ -260,6 +293,7 @@ def test_docker_compose_smoke(docker_image: str, fixture_workspace: Path, tmp_pa env = dict(os.environ) env["COCOINDEX_HOST_WORKSPACE"] = str(fixture_workspace) + env["COCOINDEX_CODE_CONTAINER_NAME"] = f"ccc-compose-e2e-{uuid.uuid4().hex[:8]}" try: subprocess.run( @@ -298,6 +332,30 @@ def test_docker_compose_smoke(docker_image: str, fixture_workspace: Path, tmp_pa else: raise TimeoutError("Daemon did not become ready via compose") + defaults = subprocess.run( + [ + "docker", + "compose", + "-f", + str(compose_dst), + "exec", + "-T", + "cocoindex-code", + "sh", + "-c", + "printf '%s\n%s\n' \"$COCOINDEX_CODE_STATE_DIR\" \"$COCOINDEX_CODE_RUNTIME_DIR\"", + ], + cwd=tmp_path, + env=env, + capture_output=True, + 
text=True, + check=True, + ) + assert defaults.stdout.splitlines() == [ + "/var/cocoindex/state", + "/var/run/cocoindex_code", + ] + # Index, then search. subprocess.run( [ diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py new file mode 100644 index 0000000..57c3ab6 --- /dev/null +++ b/tests/test_docker_setup.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parent.parent + + +def test_dockerfile_sets_container_native_state_defaults() -> None: + content = (REPO_ROOT / "docker" / "Dockerfile").read_text() + + assert "git gosu" in content + assert "COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state" in content + assert "COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code" in content + assert "COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db" in content + assert "/var/cocoindex/state" in content + + +def test_docker_entrypoint_prepares_state_db_cache_and_runtime_dirs() -> None: + content = (REPO_ROOT / "docker" / "entrypoint.sh").read_text() + + assert 'COCOINDEX_CODE_STATE_DIR=${COCOINDEX_CODE_STATE_DIR:-/var/cocoindex/state}' in content + assert ( + 'COCOINDEX_CODE_RUNTIME_DIR=${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code}' + in content + ) + assert '"$COCOINDEX_CODE_STATE_DIR"' in content + assert "/var/cocoindex/db" in content + assert '"$HF_HOME"' in content + assert '"$SENTENCE_TRANSFORMERS_HOME"' in content + assert '"$COCOINDEX_CODE_RUNTIME_DIR"' in content + assert "chown -R coco:coco /var/cocoindex" in content + + +def test_docker_compose_exposes_local_use_knobs_and_healthcheck() -> None: + content = (REPO_ROOT / "docker" / "docker-compose.yml").read_text() + + assert "${COCOINDEX_CODE_IMAGE:-cocoindex/cocoindex-code:latest}" in content + assert "${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code}" in content + assert "${COCOINDEX_HOST_WORKSPACE:-${HOME}}:/workspace" in content + assert 
"COCOINDEX_CODE_STATE_DIR: ${COCOINDEX_CODE_STATE_DIR:-/var/cocoindex/state}" in content + assert ( + "COCOINDEX_CODE_RUNTIME_DIR: ${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code}" + in content + ) + assert ( + "COCOINDEX_CODE_DB_PATH_MAPPING: " + "${COCOINDEX_CODE_DB_PATH_MAPPING:-/workspace=/var/cocoindex/db}" + ) in content + assert ( + "COCOINDEX_CODE_HOST_PATH_MAPPING: " + "${COCOINDEX_CODE_HOST_PATH_MAPPING:-/workspace=${COCOINDEX_HOST_WORKSPACE:-${HOME}}}" + ) in content + assert "ccc daemon status" in content + assert "daemon.sock" in content + + +def test_readme_documents_docker_state_runtime_and_host_cwd_mapping() -> None: + content = (REPO_ROOT / "README.md").read_text() + + assert "COCOINDEX_CODE_HOST_CWD=\"$PWD\"" in content + assert "docker exec \"${flags[@]}\"" in content + assert "ccc mcp" in content + assert "COCOINDEX_CODE_STATE_DIR" in content + assert "/var/cocoindex/state" in content + assert "COCOINDEX_CODE_RUNTIME_DIR" in content + assert "/var/run/cocoindex_code" in content + assert "COCOINDEX_CODE_DB_PATH_MAPPING" in content + assert "COCOINDEX_CODE_HOST_PATH_MAPPING" in content + + +def test_docker_compose_config_is_valid(tmp_path: Path) -> None: + if shutil.which("docker") is None: + pytest.skip("Docker CLI not available") + + compose_file = REPO_ROOT / "docker" / "docker-compose.yml" + env = dict(os.environ) + env.setdefault("HOME", str(tmp_path / "home")) + result = subprocess.run( + ["docker", "compose", "-f", str(compose_file), "config"], + env=env, + capture_output=True, + text=True, + check=False, + ) + + if result.returncode != 0 and "docker daemon" in result.stderr.lower(): + pytest.skip("Docker daemon not available") + assert result.returncode == 0, result.stderr + assert "cocoindex-code" in result.stdout diff --git a/tests/test_git_layers.py b/tests/test_git_layers.py new file mode 100644 index 0000000..189a3e1 --- /dev/null +++ b/tests/test_git_layers.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +import 
shutil +import subprocess +from pathlib import Path + +import pytest + +from cocoindex_code._daemon_paths import daemon_state_dir +from cocoindex_code.git_context import normalize_remote_url, resolve_worktree_context +from cocoindex_code.layer_store import LayerKind, LayerStore +from cocoindex_code.layered_project import LayeredProject +from cocoindex_code.settings import default_project_settings, save_project_settings + + +def _git(repo: Path, *args: str) -> str: + return subprocess.check_output(["git", "-C", str(repo), *args], text=True).strip() + + +def _init_repo(path: Path) -> Path: + path.mkdir() + _git(path, "init", "-b", "main") + _git(path, "config", "user.email", "test@example.com") + _git(path, "config", "user.name", "Test User") + save_project_settings(path, default_project_settings()) + (path / "main.py").write_text("def base_function() -> str:\n return 'base'\n") + _git(path, "add", ".") + _git(path, "commit", "-m", "initial") + _git(path, "remote", "add", "origin", "git@github.com:Example/Repo.git") + return path + + +def test_daemon_state_dir_defaults_to_xdg_data_home( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.delenv("COCOINDEX_CODE_STATE_DIR", raising=False) + monkeypatch.setenv("XDG_DATA_HOME", str(tmp_path / "xdg")) + + assert daemon_state_dir() == tmp_path / "xdg" / "cocoindex-code" + + +def test_normalize_remote_url_equates_common_github_forms() -> None: + assert normalize_remote_url("git@github.com:Example/Repo.git") == normalize_remote_url( + "https://github.com/example/repo" + ) + + +def test_resolve_worktree_context_has_stable_repo_id_across_worktrees(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + linked = tmp_path / "linked" + _git(repo, "worktree", "add", "-b", "feature", str(linked), "main") + + first = resolve_worktree_context(repo, base_ref="main", index_config_hash="cfg") + second = resolve_worktree_context(linked, base_ref="main", index_config_hash="cfg") + + assert first.repo_id 
== second.repo_id + assert first.worktree_id != second.worktree_id + assert first.repo_root == repo.resolve() + assert second.repo_root == linked.resolve() + + +def test_resolve_worktree_context_repo_id_survives_repo_move(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + moved = tmp_path / "moved-repo" + shutil.copytree(repo, moved) + + first = resolve_worktree_context(repo, base_ref="main", index_config_hash="cfg") + second = resolve_worktree_context(moved, base_ref="main", index_config_hash="cfg") + + assert first.repo_id == second.repo_id + assert first.repo_root != second.repo_root + + +def test_resolve_worktree_context_worktree_id_uses_name_and_branch(tmp_path: Path) -> None: + first_parent = tmp_path / "first" + second_parent = tmp_path / "second" + first_parent.mkdir() + second_parent.mkdir() + first = _init_repo(first_parent / "feature-1") + second = _init_repo(second_parent / "feature-1") + + first_ctx = resolve_worktree_context(first, base_ref="main", index_config_hash="cfg") + second_ctx = resolve_worktree_context(second, base_ref="main", index_config_hash="cfg") + + assert first_ctx.worktree_id == second_ctx.worktree_id + assert first_ctx.repo_root != second_ctx.repo_root + + +def test_layer_store_persists_ready_layers_and_manifests(tmp_path: Path) -> None: + store = LayerStore(tmp_path / "daemon.db") + record = store.upsert_layer( + layer_id="layer-1", + repo_id="repo", + kind=LayerKind.BASE, + ref_name="main", + commit="abc", + base_commit=None, + base_layer_id=None, + source_dir=tmp_path / "src", + db_dir=tmp_path / "db", + status="building", + ) + store.replace_manifest( + "layer-1", + affected_paths=["a.py"], + tombstoned_paths=["old.py"], + expires_at=None, + ) + store.mark_layer_ready("layer-1") + + reopened = LayerStore(tmp_path / "daemon.db") + ready = reopened.get_layer("layer-1") + assert ready is not None + assert ready.layer_id == record.layer_id + assert ready.status == "ready" + manifest = reopened.get_manifest("layer-1") 
+ assert manifest is not None + assert manifest.affected_paths == frozenset({"a.py"}) + assert manifest.tombstoned_paths == frozenset({"old.py"}) + + +def test_layer_store_persists_overlay_policy(tmp_path: Path) -> None: + store = LayerStore(tmp_path / "daemon.db") + + store.upsert_overlay_policy(repo_id="repo", base_ref="main") + + reopened = LayerStore(tmp_path / "daemon.db") + assert reopened.get_overlay_base_ref("repo") == "main" + + +@pytest.mark.asyncio +async def test_layered_project_creates_base_and_branch_manifests( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from conftest import make_test_user_settings + + from cocoindex_code.daemon import _resolve_chunker_registry + from cocoindex_code.embedder_params import resolve_embedder_params + from cocoindex_code.shared import create_embedder + + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + _git(repo, "checkout", "-b", "feature") + (repo / "main.py").write_text("def branch_function() -> str:\n return 'branch'\n") + (repo / "extra.py").write_text("def extra() -> str:\n return 'extra'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "feature") + + user_settings = make_test_user_settings() + params = resolve_embedder_params(user_settings.embedding) + state_dir = daemon_state_dir() + project = LayeredProject( + project_root=repo, + cwd=repo, + base_ref="main", + state_dir=state_dir, + store=LayerStore(state_dir / "daemon.db"), + embedder=create_embedder(user_settings.embedding, indexing_params=params.indexing), + indexing_params=params.indexing, + query_params=params.query, + chunker_registry=_resolve_chunker_registry(default_project_settings().chunkers), + project_cache={}, + ) + + await project.run_index() + + layers = project.store.list_layers() + assert {layer.kind for layer in layers} == {LayerKind.BASE, LayerKind.BRANCH} + branch_layer = next(layer for layer in layers if layer.kind == LayerKind.BRANCH) + manifest = 
project.store.get_manifest(branch_layer.layer_id) + assert manifest is not None + assert manifest.affected_paths == frozenset({"extra.py", "main.py"}) diff --git a/tests/test_protocol.py b/tests/test_protocol.py index bf1d216..c38d07d 100644 --- a/tests/test_protocol.py +++ b/tests/test_protocol.py @@ -49,6 +49,8 @@ def test_encode_decode_search_request_with_defaults() -> None: data = encode_request(req) decoded = decode_request(data) assert isinstance(decoded, SearchRequest) + assert decoded.cwd is None + assert decoded.base_ref is None assert decoded.languages is None assert decoded.limit == 5 assert decoded.offset == 0 @@ -62,6 +64,8 @@ def test_encode_decode_search_request_with_all_fields() -> None: paths=["src/*"], limit=20, offset=5, + cwd="/tmp/proj/src", + base_ref="main", ) data = encode_request(req) decoded = decode_request(data) @@ -72,6 +76,8 @@ def test_encode_decode_search_request_with_all_fields() -> None: assert decoded.paths == ["src/*"] assert decoded.limit == 20 assert decoded.offset == 5 + assert decoded.cwd == "/tmp/proj/src" + assert decoded.base_ref == "main" def test_encode_decode_search_response_with_results() -> None: @@ -85,6 +91,11 @@ def test_encode_decode_search_response_with_results() -> None: start_line=1, end_line=1, score=0.95, + repo_id="repo", + branch="main", + commit="abc", + layer_kind="base", + layer_id="layer", ), ], total_returned=1, @@ -97,6 +108,7 @@ def test_encode_decode_search_response_with_results() -> None: assert len(decoded.results) == 1 assert decoded.results[0].file_path == "main.py" assert decoded.results[0].score == 0.95 + assert decoded.results[0].layer_kind == "base" def test_encode_decode_error_response() -> None: @@ -126,11 +138,13 @@ def test_encode_decode_daemon_status_response() -> None: def test_tagged_union_dispatch() -> None: - req = IndexRequest(project_root="/tmp") + req = IndexRequest(project_root="/tmp", cwd="/tmp/sub", base_ref="main") data = encode_request(req) decoded = decode_request(data) 
assert isinstance(decoded, IndexRequest) assert not isinstance(decoded, HandshakeRequest) + assert decoded.cwd == "/tmp/sub" + assert decoded.base_ref == "main" def test_encode_decode_doctor_request() -> None: diff --git a/uv.lock b/uv.lock index 513add1..d9944bf 100644 --- a/uv.lock +++ b/uv.lock @@ -380,6 +380,7 @@ dependencies = [ { name = "numpy" }, { name = "pathspec" }, { name = "pydantic" }, + { name = "pygit2" }, { name = "pyyaml" }, { name = "questionary" }, { name = "sqlite-vec" }, @@ -429,6 +430,7 @@ requires-dist = [ { name = "pathspec", specifier = ">=0.12.1" }, { name = "prek", marker = "extra == 'dev'", specifier = ">=0.1.0" }, { name = "pydantic", specifier = ">=2.0.0" }, + { name = "pygit2", specifier = ">=1.19.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.21.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0.0" }, @@ -2082,6 +2084,72 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = "sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, ] +[[package]] +name = "pygit2" +version = "1.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3a/a4/10ce00feef5c43eddacab19ae6610c4d4ef3ab77e544e9ee938772cd1c17/pygit2-1.19.2.tar.gz", hash = "sha256:cbeb3dbca9ca6ee3d5ea5d02f5e844c2d6084a2d5d6621e3e06aa2b11c645bfd", size = 803448, upload-time = "2026-03-29T14:57:27.565Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/9c/388648963f4be4bde89e32ca1a6f60adabb5f782c0e78598790b56e41967/pygit2-1.19.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:70c7efc426bdae6b67465a03729b79277e7757a29a7d6550b40c18ed36cb7232", 
size = 5706937, upload-time = "2026-03-29T14:56:02.061Z" }, + { url = "https://files.pythonhosted.org/packages/02/4c/e89013ff45350affac11f5893b3b7b555be35d5f279ff89c1d9310872378/pygit2-1.19.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7b96d6ed7251eef70cfd4126269f1044fa47bc6da6367300027c5e5d74789f7f", size = 5695668, upload-time = "2026-03-29T14:56:03.92Z" }, + { url = "https://files.pythonhosted.org/packages/6f/84/db7281407c4481b64559f400f87f60190cc59615637bbc6f0afb8681dd7d/pygit2-1.19.2-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f3235db6b553b8fb4d3c1dc86af9be1eab445f1d6c42f4ade5cf5f60efd333", size = 6034309, upload-time = "2026-03-29T14:56:05.282Z" }, + { url = "https://files.pythonhosted.org/packages/b9/06/d8623933341e79220ab0c14c1e2bc5a78645738ce62699f942065e8699c4/pygit2-1.19.2-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:02a35d56126f82a303668f4198c138627b3e9820f9f1eec38fff0409be274b9e", size = 4637953, upload-time = "2026-03-29T14:56:06.589Z" }, + { url = "https://files.pythonhosted.org/packages/42/27/6b20c5d424297623b22737f54a5c67ea00d498b5d2ebb98d3175d01de10c/pygit2-1.19.2-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e59a2e9eddd59edf999403c266c891dfc171eb95939d229ed614bc21e0c95804", size = 5794511, upload-time = "2026-03-29T14:56:07.953Z" }, + { url = "https://files.pythonhosted.org/packages/ec/cb/75a09f2319dc8798d80085b059506462af95b576d782c447f502dc807553/pygit2-1.19.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d0d2437bd5f8dbd652e8a6c318cbcaa245c0528ee48f6d64f4aaef8fd9b36b93", size = 6039969, upload-time = "2026-03-29T14:56:09.441Z" }, + { url = "https://files.pythonhosted.org/packages/34/20/fd2ac2f397fed5fab1f2838f2460226734f5a616371a388d8b0d7c995b7f/pygit2-1.19.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:60d011496e57436b0c8e3fbd4d12745777427b3f33a60710ec3d94d2f76304b7", size = 5764242, upload-time = 
"2026-03-29T14:56:10.947Z" }, + { url = "https://files.pythonhosted.org/packages/fe/c0/bee8c2fce9d577cdc167e82d8999a57f997c213b5226fbaa9b977e3ce95f/pygit2-1.19.2-cp311-cp311-win32.whl", hash = "sha256:9b0d5a44ca6d77a8c0e2526f6556d9b37cc85d44983ff3549bf5adbf95d289c4", size = 945852, upload-time = "2026-03-29T14:56:12.209Z" }, + { url = "https://files.pythonhosted.org/packages/e0/e9/9ecafac82a8729ebfc948636147235770354ee48956ad43ed628b8396a68/pygit2-1.19.2-cp311-cp311-win_amd64.whl", hash = "sha256:0d9c795155086c95ef890c87b50e02792146cfaede2c715698e6988a122373e7", size = 1163883, upload-time = "2026-03-29T14:56:13.28Z" }, + { url = "https://files.pythonhosted.org/packages/02/09/24f3f55ebda489755d757dd254a612caa19e6a2bd2cbc5ccf7127e4caa30/pygit2-1.19.2-cp311-cp311-win_arm64.whl", hash = "sha256:837f0a9a0093cbb213176284d29f0ab754ded3e5af967e7ec6419d590a7da92a", size = 969221, upload-time = "2026-03-29T14:56:14.441Z" }, + { url = "https://files.pythonhosted.org/packages/d5/2d/4fdeb7c6e044588cef9f0fca8b93fbc40fcdb2dfc64367999f45e88c0e7e/pygit2-1.19.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cf479077d48a60b09569a5bb50866d8609f434f8982058594b0d2e2950bd6fce", size = 5704810, upload-time = "2026-03-29T14:56:15.671Z" }, + { url = "https://files.pythonhosted.org/packages/c7/d8/926415c996ca283c4f7ccf63322ea23135ff17ecd1d2faaba704f6b4d883/pygit2-1.19.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6e6e7eb5fb49203735627b8e1d410afe19e7d610c9a9733c11084fabd17f0920", size = 5696366, upload-time = "2026-03-29T14:56:17.241Z" }, + { url = "https://files.pythonhosted.org/packages/8c/5e/c0329db9c980552c5c853dc1e429d13d55c691749db0540ef6bc77c04a98/pygit2-1.19.2-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1a810da2d108d6bd16115c72a1c3d69fa1528ef927719bdfc94d2cdbc4198288", size = 6035334, upload-time = "2026-03-29T14:56:19.555Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/39/e08003a59a4d58bbba923d1c2be683a84b7b30c7270a19ff3ea02f9558df/pygit2-1.19.2-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d0b8ae5a822afb2771cbacf7c75140e663bc801c44eaaf2e4017f850cb27227c", size = 4636920, upload-time = "2026-03-29T14:56:20.93Z" }, + { url = "https://files.pythonhosted.org/packages/c9/3c/d3a9ed478add4cd77403416897459750ef2f6a06d8febe452ba03f5b7a27/pygit2-1.19.2-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:330430b6c1a3e6d45d1f5f950734d37d849c07924b5b0475cd995a7e541e6ab1", size = 5798652, upload-time = "2026-03-29T14:56:22.773Z" }, + { url = "https://files.pythonhosted.org/packages/d4/15/5440f00005db1769062ecc9fab7059ac7ae89217a06e1976734f53c2d040/pygit2-1.19.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7b7f165d1ddfa1e0f205c1115ee10f5fea700fd3584c727b0d61a57192238449", size = 6041142, upload-time = "2026-03-29T14:56:24.618Z" }, + { url = "https://files.pythonhosted.org/packages/9d/5b/b9f9979a56606a661c0cff24c7aa6f5b1ad34118e116b7d67295be42aaad/pygit2-1.19.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e46ec6a97a5c43704473e42a926f7f20f9934ceef4f4891660313f573c4f0ab8", size = 5769220, upload-time = "2026-03-29T14:56:26.423Z" }, + { url = "https://files.pythonhosted.org/packages/30/81/9594e604eb19ae02f6a2023840e25574b0abcfc8d58c03cf96c59dd4ba72/pygit2-1.19.2-cp312-cp312-win32.whl", hash = "sha256:6b4de5469e88e7b069143f7a5d6336a4b3e7d911de4633ef18c113e416feb948", size = 946691, upload-time = "2026-03-29T14:56:27.813Z" }, + { url = "https://files.pythonhosted.org/packages/35/2d/c9bcdaef8f57ba0cdf129a6823f95cadd8ead002f38fba3465732c7517a8/pygit2-1.19.2-cp312-cp312-win_amd64.whl", hash = "sha256:f064748202928f4e882501521229e378e0b7b69b0e7c433cdb2626d007745973", size = 1164290, upload-time = "2026-03-29T14:56:29.13Z" }, + { url = 
"https://files.pythonhosted.org/packages/b8/d4/6e9c98d227a8e816e2ffc7304f733e8b924afd8198b16888972fedbe05bd/pygit2-1.19.2-cp312-cp312-win_arm64.whl", hash = "sha256:222f439d751799dc74c3fa75f187abdbc415d12f9a091efa66f0c9ff51893d32", size = 969330, upload-time = "2026-03-29T14:56:30.363Z" }, + { url = "https://files.pythonhosted.org/packages/c1/77/c925eee8496961729f029a4edda67485c7637248c0e730e0b41122357be5/pygit2-1.19.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:df207f93a33851a110dec70108e3f2a1c69578932919fd356303eda83a5624db", size = 5704802, upload-time = "2026-03-29T14:56:31.635Z" }, + { url = "https://files.pythonhosted.org/packages/d8/fc/d46428b7ea0ce7bd3cac73b73206a2cba50580f54b58bd704d8755d5658c/pygit2-1.19.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ae884cd53e29b3d831f5261f36048a8d5db5642dc98cd63530810e7fd9c9e60d", size = 5696329, upload-time = "2026-03-29T14:56:33.343Z" }, + { url = "https://files.pythonhosted.org/packages/35/05/a3bb39095ef31e140cbeb30abbd08fafb13ed70b656a9de095fac74a1ff5/pygit2-1.19.2-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0bd4059964531d20aaf4577b3761590df9cc7c9e2395df5d33f0552224331b76", size = 6036095, upload-time = "2026-03-29T14:56:34.836Z" }, + { url = "https://files.pythonhosted.org/packages/4c/cb/36ebd241351bd1ced1f126bf0b21fbb6c0d48ce36122512cc51cde83d10b/pygit2-1.19.2-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c3befcccc7b3b62e45da2cc1ce4095964f7606d3d15b43dc667c6ef2a2ada20d", size = 4637435, upload-time = "2026-03-29T14:56:36.292Z" }, + { url = "https://files.pythonhosted.org/packages/36/35/779d6b8e9df0cc3236f675af5fc37e4047e1a6ab96f9c72ef5b5ed8d888b/pygit2-1.19.2-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cf08b54553f997f6f60a7918504e22e7baa4ba2fbb11d1e1cb6c0a45ac7e04b", size = 5799881, upload-time = "2026-03-29T14:56:38.04Z" }, + { url = 
"https://files.pythonhosted.org/packages/eb/fa/cb361f4bd5342fa01a0f83b04eff8873a09771183bcb6e29947078577119/pygit2-1.19.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d7f630e5a763f01b4be6e2374c487086229c8f7392a2e5591d29095c5e481da4", size = 6042342, upload-time = "2026-03-29T14:56:39.523Z" }, + { url = "https://files.pythonhosted.org/packages/2f/6f/b9ea61266eb7d568ea17d8fec63dc766ebecec23860b4e5ac5bcfbbe15d7/pygit2-1.19.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6166845f41d4f6be3353997022d64035fe3df348c8e34d7d30c5f95817fbcab4", size = 5770452, upload-time = "2026-03-29T14:56:41.306Z" }, + { url = "https://files.pythonhosted.org/packages/fb/bb/403532429072a61d5498d17ddf6be3258953e73b6499f70a2b4e1345bb84/pygit2-1.19.2-cp313-cp313-win32.whl", hash = "sha256:5bebea045102e87dea142242298d4dd668d0227f76042f98efb1c5d5dd3db21e", size = 946658, upload-time = "2026-03-29T14:56:42.613Z" }, + { url = "https://files.pythonhosted.org/packages/01/08/6f37fb23514da02345889d7be7cea899d2a348fa4871492ea9a8837e70e4/pygit2-1.19.2-cp313-cp313-win_amd64.whl", hash = "sha256:7bbfeb680821001a5c1b6959da1eae906806c90c9992ae4564d3ea83a27bb19f", size = 1164264, upload-time = "2026-03-29T14:56:43.753Z" }, + { url = "https://files.pythonhosted.org/packages/90/b9/d11220d5f0cfc92895b02814ab36ac94edbf46ae1b9dc3077c457d03d718/pygit2-1.19.2-cp313-cp313-win_arm64.whl", hash = "sha256:033d489186145cf67b2c60840d2a308f6b1e9d641de12417c447f9829dacde70", size = 969348, upload-time = "2026-03-29T14:56:44.892Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1b/1a935baeb29958d7e50a52c7a963ce5963f24fa8a5024e1082d43b07a770/pygit2-1.19.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:f5effee3f4ad0d9c89b34ebecf1acee26f6b117ef3c51345ad022bd521fd8dca", size = 5706909, upload-time = "2026-03-29T14:56:46.249Z" }, + { url = "https://files.pythonhosted.org/packages/a7/86/4bb6f196b13bd7ed825f4e931fb7152a36d01e8de24c8de44425702ad18c/pygit2-1.19.2-cp314-cp314-macosx_11_0_arm64.whl", 
hash = "sha256:1ed09804dc6b6de0be07a71443122fd7b6458f8466d1134003c2dea55af886fc", size = 5696293, upload-time = "2026-03-29T14:56:48.173Z" }, + { url = "https://files.pythonhosted.org/packages/2f/64/d674b3f854cecf53bccbc21a095734759cd3599624578ed3c78602eb22a3/pygit2-1.19.2-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2d114aa066e718d5ef3401b366dcb0b37b549c3b3b139f5f0042bd7059a4b0f7", size = 6038057, upload-time = "2026-03-29T14:56:50.118Z" }, + { url = "https://files.pythonhosted.org/packages/64/eb/2ce41735e27ee0f28f786aae62ea371f3beec0ef38d1712a2910421386c4/pygit2-1.19.2-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c1becc06071acfdd5ae8523aaeab6d4b0930b2bcb08f5eb878e052e61275000b", size = 4641475, upload-time = "2026-03-29T14:56:51.581Z" }, + { url = "https://files.pythonhosted.org/packages/e5/8d/35f6096c42caefb715ca29e991279b493275c0051a3c83081099644d3f4a/pygit2-1.19.2-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:06d2db3bdbf2906eb17112adb14a2fe6e34c1b2bce39c91819f59208d4e56665", size = 5801738, upload-time = "2026-03-29T14:56:53.043Z" }, + { url = "https://files.pythonhosted.org/packages/fc/31/dbbaa7a433008fec9046cc293c012ae5d5a31e66321e1fb05d64ae131e54/pygit2-1.19.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8a7e99e5dfc8d3ed8f849b9688bc3fb1bdc86f34af28159140a8d1e18b703dd8", size = 6043074, upload-time = "2026-03-29T14:56:54.774Z" }, + { url = "https://files.pythonhosted.org/packages/f0/33/b34266efba6917081dafb50976155c2d31cd377f277e67348a810245c4b4/pygit2-1.19.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7659d59eba6c4a706978237d02e8d719f960843df749256f1656c938c1f4142b", size = 5770986, upload-time = "2026-03-29T14:56:56.796Z" }, + { url = "https://files.pythonhosted.org/packages/c6/ab/813f3af50987020cd90e810da147ebef16a61003b9af995070ec338634ba/pygit2-1.19.2-cp314-cp314-win32.whl", hash = 
"sha256:e551908dfd93d471c0b08cfcddbe4924417865aae6ac90d20f3815c9483b0a82", size = 967943, upload-time = "2026-03-29T14:56:58.196Z" }, + { url = "https://files.pythonhosted.org/packages/22/00/24df5ac51a316e36a07bbf9e4c91fade523b9e80a84d5c9e7acd10b22248/pygit2-1.19.2-cp314-cp314-win_amd64.whl", hash = "sha256:eb1fd8538372230f8a471a5f3629901bc2fc7df992853d97bedc8fa269a9caf3", size = 1194774, upload-time = "2026-03-29T14:56:59.721Z" }, + { url = "https://files.pythonhosted.org/packages/a7/ee/274a91b28864fd9c5cdd2949b4d7e0909fd6a89785a46308de098d3a22cd/pygit2-1.19.2-cp314-cp314-win_arm64.whl", hash = "sha256:3cc461245b70be45a936e925744e67a45f6b0ee970aeb8e7a385dd7fe9f40877", size = 996677, upload-time = "2026-03-29T14:57:01.013Z" }, + { url = "https://files.pythonhosted.org/packages/4f/22/3c05a56918e6fda5deb53aeb7436959a8880f4cc436a76771771479693de/pygit2-1.19.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:cb686bc81dfe5b13937047643fddb1dd253dae33b4a9ca62858c49ed294e05be", size = 5710172, upload-time = "2026-03-29T14:57:02.672Z" }, + { url = "https://files.pythonhosted.org/packages/59/eb/2fdd485c01b478c77dd2e949b424a61c70a8750ffb13c5035fe3edf6a8f6/pygit2-1.19.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5ec3538d81963bd05dd16c0de75938a9173966e1c853ad7848ebcb60bcfe21b0", size = 5699256, upload-time = "2026-03-29T14:57:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/51/f4/f0608bb369da15f2973dfb33e7b7cba4c9bc8164e6a01e3f15e65e85efef/pygit2-1.19.2-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d02ebb50ea082d9631bbfda12787eb5324b8880a72cb8e3b9f11e9b323ad5781", size = 6096321, upload-time = "2026-03-29T14:57:06.33Z" }, + { url = "https://files.pythonhosted.org/packages/6e/1b/816d3700dc8bcc9028c5f81b190f2d770d1cb9cd2ccdd39939d0b6730718/pygit2-1.19.2-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a3643e4dd569c2909e88586659f617f70315680ca3c619cd8ff9e9c28726c25", size = 4696179, 
upload-time = "2026-03-29T14:57:08.552Z" }, + { url = "https://files.pythonhosted.org/packages/ee/a6/0fc82f07c4dfee5856626c5d4b422c32e14cac0204eb1e9558ac0d717b07/pygit2-1.19.2-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:697e3684cb4ef2bfc084623c3f680d5ae8b4c8afca31a35a731b7b70204d9f83", size = 5853368, upload-time = "2026-03-29T14:57:10.449Z" }, + { url = "https://files.pythonhosted.org/packages/f8/60/0393786d7810b7f83def3738cb9be1a735cf6b555dc219d90f46010b87b1/pygit2-1.19.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:173165b54a2affed918302193f12dd369bec981b1d77904cdcd76b966a824e15", size = 6099319, upload-time = "2026-03-29T14:57:12.166Z" }, + { url = "https://files.pythonhosted.org/packages/13/ad/22e30e630a147e10a912e085c4cb816a0dc39bee8d39493b40101f3da4c7/pygit2-1.19.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ff32adce1a48d76b10e790b36784f6cb5ef40699b758c8b84f7f53f13b13d237", size = 5822074, upload-time = "2026-03-29T14:57:13.84Z" }, + { url = "https://files.pythonhosted.org/packages/4b/08/71ea683386887a1aab8f9b8c282b6df7ce7fae45fc7c9959719c78baebba/pygit2-1.19.2-cp314-cp314t-win32.whl", hash = "sha256:637d7c023f6623da35cf02cd1091f260c709730dd615367f4524ec8d771d0898", size = 972866, upload-time = "2026-03-29T14:57:15.26Z" }, + { url = "https://files.pythonhosted.org/packages/da/67/efbde3954bdcbadfb61d183badd9a3e730c4ad94ed10966abe0b177abe0c/pygit2-1.19.2-cp314-cp314t-win_amd64.whl", hash = "sha256:2805a8abd546e38298ce5daf33e444960e483acce68cbfb5d338e72ad5bc3503", size = 1201537, upload-time = "2026-03-29T14:57:16.72Z" }, + { url = "https://files.pythonhosted.org/packages/0f/0c/28ae2c74038d1c51092f525658986a261f1963ec96528e7b41e721387343/pygit2-1.19.2-cp314-cp314t-win_arm64.whl", hash = "sha256:376a0d2c27c082f6bd8b97fd8ffc1939f16dfe8374ec846deee9b11151b37b8a", size = 997795, upload-time = "2026-03-29T14:57:17.878Z" }, + { url = 
"https://files.pythonhosted.org/packages/f8/47/8ca340fc8f0f5ec8ab9f8e96bb814a64a95f3836034fac00bb733c1f357c/pygit2-1.19.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4c2d397c887ff5a26b48ebd1bb9c66d2195ad377f0a44e05b79c462fff4040cd", size = 5649264, upload-time = "2026-03-29T14:57:19.193Z" }, + { url = "https://files.pythonhosted.org/packages/fd/c4/8d26b20cb09ae862302b7b023e9089c70946cbdaf4f3cb8c1d4c7ba94a09/pygit2-1.19.2-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:69a0d377ee46110bbeea9e4191edee05132d1e7ac84b7cdebc640bc45868a2ec", size = 5646929, upload-time = "2026-03-29T14:57:20.717Z" }, + { url = "https://files.pythonhosted.org/packages/d3/35/b71ac88cda21ad440577543a317052eca9ab4f0119e9c2d74baa135731c4/pygit2-1.19.2-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57d113a3eb61621ce16ceaa4bae7a93ffe525fd69da905445a0cf798d3601815", size = 5562858, upload-time = "2026-03-29T14:57:22.962Z" }, + { url = "https://files.pythonhosted.org/packages/08/d9/c419105e997031a34a1a7d87e832a3a0e5a4c1501bc2784250d000b1c044/pygit2-1.19.2-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e0bc207abbef4d3be3bd37e0711e6974a148d41806fdc932aef9bb244b157c4", size = 5315754, upload-time = "2026-03-29T14:57:24.956Z" }, + { url = "https://files.pythonhosted.org/packages/e8/20/b52bc0ef2c5358d08e20e2d9fa8ae911283f3d20a7b9e52ec4a338b94983/pygit2-1.19.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:219c03bdbca59bd1df12b8bc7974b429872f4267aa2287ec0237c268593c0c5e", size = 1132796, upload-time = "2026-03-29T14:57:26.411Z" }, +] + [[package]] name = "pygments" version = "2.19.2" From 6703fc1ad89a793d99a8e56b11b48cc72ce0c3f7 Mon Sep 17 00:00:00 2001 From: "rudimar.ronsoni" Date: Tue, 19 May 2026 23:55:59 +0200 Subject: [PATCH 02/15] Add Docker sidecar layered indexing --- README.md | 37 +++++- docker/entrypoint.sh | 7 + docs/docker-layered-indexing.md | 185 ++++++++++++++------------ 
sample/.gitignore | 1 + sample/Makefile | 41 ++++++ sample/README.md | 74 +++++++++++ sample/bin/ccc | 183 +++++++++++++++++++++++++ sample/docker-compose.yml | 42 ++++++ src/cocoindex_code/_daemon_paths.py | 12 +- src/cocoindex_code/cli.py | 40 +++++- src/cocoindex_code/client.py | 52 ++++++-- src/cocoindex_code/daemon.py | 78 ++++++++++- src/cocoindex_code/layered_project.py | 8 ++ src/cocoindex_code/protocol.py | 1 + src/cocoindex_code/sidecar.py | 64 +++++++++ tests/test_daemon_paths.py | 31 +++++ tests/test_docker_setup.py | 39 ++++++ tests/test_protocol.py | 3 + 18 files changed, 786 insertions(+), 112 deletions(-) create mode 100644 sample/.gitignore create mode 100644 sample/Makefile create mode 100644 sample/README.md create mode 100755 sample/bin/ccc create mode 100644 sample/docker-compose.yml create mode 100644 src/cocoindex_code/sidecar.py create mode 100644 tests/test_daemon_paths.py diff --git a/README.md b/README.md index 590caf1..6409147 100644 --- a/README.md +++ b/README.md @@ -211,9 +211,15 @@ By default, `ccc search` scopes results to your current working directory (relat A Docker image is available for teams who want a reproducible, dependency-free setup — no Python, `uv`, or system dependencies required on the host. -The recommended approach is a **persistent container**: start it once, and use -`docker exec` to run CLI commands or connect MCP sessions to it. The daemon -inside stays warm across sessions, so the embedding model is loaded only once. +The recommended secure approach is the **central daemon + on-demand sidecar** +model: one daemon container owns shared state and short-lived sidecars mount +exactly one authorized repository when `ccc init`, `ccc index`, or `ccc search` +runs. This avoids giving the daemon broad access to `$HOME` or a whole source +tree. See [Docker Sidecar Layered Indexing](./docs/docker-layered-indexing.md). 
+ +The persistent-workspace compose flow below is still useful for trusted local +development, but it mounts the configured workspace into the daemon container. +Do not use it when repository access must be granted selectively. ### Choosing an image @@ -234,7 +240,30 @@ The rest of this section uses `:latest` — substitute `:full` in the `image:` / > (slim) variant is unaffected — LiteLLM runs the model on the provider's > side, so Docker vs. native makes no difference. -### Quick start — `docker compose up -d` +### Secure quick start: daemon + sidecars + +Build the branch-local image and install/use the sidecar wrapper: + +```bash +cd sample +make build +make install-ccc-wrapper # optional; otherwise call sample/bin/ccc directly +``` + +Authorize and index exactly one repo: + +```bash +cd /path/to/repo +ccc init --base main +ccc index +ccc search "authentication logic" +``` + +`ccc init` records the current Git root as authorized. Later commands refuse to +run outside an authorized repo. Sidecars mount only the authorized repo at +`/workspace` and talk to the central daemon over a private Docker network. + +### Trusted-workspace compose: `docker compose up -d` Bring it up in one line — no clone needed (bash / zsh): diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index d95e8a9..43520f8 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -36,6 +36,13 @@ if [ -n "$PUID" ] && [ -n "$PGID" ]; then fi fi +if [ "$#" -gt 0 ]; then + if [ -n "$PUID" ] && [ -n "$PGID" ]; then + exec gosu coco "$@" + fi + exec "$@" +fi + run_daemon() { if [ -n "$PUID" ] && [ -n "$PGID" ]; then gosu coco ccc run-daemon diff --git a/docs/docker-layered-indexing.md b/docs/docker-layered-indexing.md index 6af166e..cff4112 100644 --- a/docs/docker-layered-indexing.md +++ b/docs/docker-layered-indexing.md @@ -1,143 +1,152 @@ -# Docker Layered Indexing +# Docker Sidecar Layered Indexing This guide covers the Docker-specific configuration for Git layered indexing. 
For the core model, see [Git Layered Indexing](./layered-indexing.md). -## Recommended Compose Setup +The intended Docker architecture is: -Use the repository compose file: +- one central daemon container with no source-code mount +- Docker named volumes for daemon state, runtime files, config, caches, and layer databases +- short-lived sidecar containers for repo work +- each sidecar mounts exactly one authorized Git checkout at `/workspace` +- sidecars talk to the central daemon over a private Docker network + +Do not mount `$HOME` or a broad source tree just to make indexing work. + +## Repo-Scoped Sample + +Build the branch-local image: ```bash -docker compose -f docker/docker-compose.yml up -d +cd sample +make build ``` -The compose defaults are designed for layered indexing: +Authorize one repo and register its base ref: -```yaml -COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state -COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code -COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db -COCOINDEX_CODE_HOST_PATH_MAPPING: /workspace=$HOME +```bash +cd /path/to/repo +/path/to/cocoindex-code/sample/bin/ccc init --base main ``` -The important split is: +Then index and search: -- source code and settings live on the bind mount under `/workspace` -- durable daemon layer metadata lives under `/var/cocoindex/state` -- per-project non-layer DB paths are remapped to `/var/cocoindex/db` -- sockets, PID files, and logs stay under `/var/run/cocoindex_code` +```bash +/path/to/cocoindex-code/sample/bin/ccc index +/path/to/cocoindex-code/sample/bin/ccc search "query planner" +/path/to/cocoindex-code/sample/bin/ccc overlay status +``` -## Mount the Right Workspace +The wrapper refuses to run outside an authorized repo. Running `ccc init` from another repo authorizes that repo separately. Source access is granted only to the short-lived sidecar for that repo. 
-The default compose file mounts your home directory: +Linked worktrees must also be authorized explicitly: ```bash -COCOINDEX_HOST_WORKSPACE=$HOME docker compose -f docker/docker-compose.yml up -d +cd /path/to/repo.worktrees/feature-1 +/path/to/cocoindex-code/sample/bin/ccc init --base main +/path/to/cocoindex-code/sample/bin/ccc index ``` -For a narrower mount, point it at the parent containing both the root clone and linked worktrees: +When linked worktrees share the same Git common directory, they can share daemon layer state while each sidecar still mounts only the initialized checkout. -```bash -COCOINDEX_HOST_WORKSPACE=$HOME/src/github/cocoindex-io \ - docker compose -f docker/docker-compose.yml up -d -``` +## What Runs Where -Example host layout: +Central daemon container: ```text -$HOME/src/github/cocoindex-io/ - cocoindex-code/ - cocoindex-code.worktrees/ - feature-1/ +mounts: + cocoindex-code-local-state -> /var/cocoindex + cocoindex-code-local-runtime -> /var/run/cocoindex_code +network: + cocoindex-code-local +listens: + COCOINDEX_CODE_DAEMON_TCP=0.0.0.0:8765 +source access: + none ``` -Both paths must be visible inside the same container mount for the daemon to reuse repository and layer state across them. - -## Host Wrapper +Sidecar container: -Use this wrapper so Docker commands resolve the host current directory correctly: - -```bash -ccc() { - local container="${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code}" - if [ "$(docker inspect -f '{{.State.Running}}' "$container" 2>/dev/null)" != "true" ]; then - echo "cocoindex-code container is not running. 
Start it with: docker compose -f docker/docker-compose.yml up -d" >&2 - return 1 - fi - - local flags=(-i) - if [ "${1:-}" != "mcp" ] && [ -t 0 ] && [ -t 1 ]; then - flags=(-it) - fi - - docker exec "${flags[@]}" \ - -e COCOINDEX_CODE_HOST_CWD="$PWD" \ - "$container" ccc "$@" -} +```text +mounts: + /authorized/repo -> /workspace + cocoindex-code-local-state -> /var/cocoindex + cocoindex-code-local-runtime -> /var/run/cocoindex_code +network: + cocoindex-code-local +connects: + COCOINDEX_CODE_DAEMON_TCP=cocoindex-code-local-daemon:8765 +source access: + only the authorized repo ``` -`COCOINDEX_CODE_HOST_CWD` is required for linked worktrees. It tells the container-side CLI which host directory you are actually in, then the path mapping translates it to `/workspace/...`. +Indexing runs in the sidecar because it is the process with Git/source access. The resulting layer metadata and layer databases are written to shared daemon state. Search sends the resolved layer IDs to the central daemon, and the daemon serves the query from shared layer databases without mounting the repository. -## Layered Workflow in Docker +## State -Root clone: +Host-side sample metadata: -```bash -cd $HOME/src/github/cocoindex-io/cocoindex-code -ccc init --base main -ccc index +```text +sample/data/authorized-repos.tsv ``` -Linked worktree: +Docker named volumes: + +| Volume | Mounted As | Purpose | +|---|---|---| +| `cocoindex-code-local-state` | `/var/cocoindex` | Global settings, daemon DB, layer metadata, layer DBs, caches | +| `cocoindex-code-local-runtime` | `/var/run/cocoindex_code` | PID/log runtime files | + +Reset sample Docker state: ```bash -git worktree add ../cocoindex-code.worktrees/feature-1 -b feature-1 main -cd ../cocoindex-code.worktrees/feature-1 -ccc index -ccc search "query planner" -ccc overlay status +cd sample +make reset ``` -The base layer is stored once under `/var/cocoindex/state` and reused by the linked worktree. 
- ## Environment Variables | Variable | Purpose | |---|---| -| `COCOINDEX_CODE_IMAGE` | Image used by compose, e.g. `cocoindex/cocoindex-code:full`. | -| `COCOINDEX_CODE_CONTAINER_NAME` | Container name used by compose and the wrapper. | -| `COCOINDEX_HOST_WORKSPACE` | Host directory mounted at `/workspace`. Mount a parent that contains all worktrees you want to share. | -| `COCOINDEX_CODE_HOST_PATH_MAPPING` | Container-to-host path mapping for display and host CWD translation. | -| `COCOINDEX_CODE_HOST_CWD` | Host current directory passed per `docker exec` invocation. | -| `COCOINDEX_CODE_STATE_DIR` | Durable daemon layer state. Default: `/var/cocoindex/state`. | -| `COCOINDEX_CODE_RUNTIME_DIR` | Runtime socket/PID/log directory. Default: `/var/run/cocoindex_code`. | -| `COCOINDEX_CODE_DB_PATH_MAPPING` | Non-layer project DB remapping. Default: `/workspace=/var/cocoindex/db`. | -| `PUID`, `PGID` | Linux-only ownership mapping for bind-mounted files and Docker-managed state. | +| `COCOINDEX_CODE_IMAGE` | Image used for central daemon and sidecars. Default: `cocoindex-code:local-layered`. | +| `COCOINDEX_CODE_DAEMON_CONTAINER` | Central daemon container name. Default: `cocoindex-code-local-daemon`. | +| `COCOINDEX_CODE_DOCKER_NETWORK` | Private Docker network. Default: `cocoindex-code-local`. | +| `COCOINDEX_CODE_STATE_VOLUME` | Shared daemon state named volume. Default: `cocoindex-code-local-state`. | +| `COCOINDEX_CODE_RUNTIME_VOLUME` | Shared runtime named volume. Default: `cocoindex-code-local-runtime`. | +| `COCOINDEX_CODE_SAMPLE_DATA_DIR` | Host-side allowlist directory. Default: `sample/data`. | +| `PUID`, `PGID` | Linux-only ownership mapping. | -## Debugging +Internal sidecar/daemon variables: -Check daemon status: +| Variable | Purpose | +|---|---| +| `COCOINDEX_CODE_SIDECAR=1` | Tells CLI to run repo-mounted indexing locally in the sidecar. | +| `COCOINDEX_CODE_DAEMON_TCP` | TCP daemon address. 
Central listens on `0.0.0.0:8765`; sidecars connect to the daemon container name. | +| `COCOINDEX_CODE_DIR=/var/cocoindex/config` | Shared global settings location. | +| `COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state` | Durable daemon layer state. | +| `COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db` | Keeps layer/project databases on Docker native storage. | -```bash -docker exec cocoindex-code ccc daemon status -``` +## Debugging -Inspect overlay status from the current host directory: +Check the central daemon: ```bash -ccc overlay status +cd sample +make ps +make logs ``` -Inspect state in the container: +Check through a repo-authorized sidecar: ```bash -docker exec -it cocoindex-code sh -ls -R /var/cocoindex/state +cd /path/to/repo +/path/to/cocoindex-code/sample/bin/ccc daemon status +/path/to/cocoindex-code/sample/bin/ccc overlay status ``` -Reset all Docker-managed index, layer, and cache state: +Inspect named volume contents: ```bash -docker compose -f docker/docker-compose.yml down -v +docker run --rm -it \ + -v cocoindex-code-local-state:/var/cocoindex \ + cocoindex-code:local-layered sh ``` - -This preserves your source tree because it is bind-mounted from the host. diff --git a/sample/.gitignore b/sample/.gitignore new file mode 100644 index 0000000..8fce603 --- /dev/null +++ b/sample/.gitignore @@ -0,0 +1 @@ +data/ diff --git a/sample/Makefile b/sample/Makefile new file mode 100644 index 0000000..7c5498b --- /dev/null +++ b/sample/Makefile @@ -0,0 +1,41 @@ +IMAGE ?= cocoindex-code:local-layered +COMPOSE ?= docker compose -f docker-compose.yml +IMAGE ?= cocoindex-code:local-layered + +.PHONY: build up restart ps logs down reset install-ccc-wrapper + +build: + docker build \ + -t "$(IMAGE)" \ + -f ../docker/Dockerfile \ + --build-arg CCC_INSTALL_SPEC=/ccc-src \ + .. 
+ +ps: + docker ps --filter 'name=cocoindex-code-local-daemon' + +up: + COCOINDEX_CODE_IMAGE="$(IMAGE)" $(COMPOSE) up -d + +restart: + COCOINDEX_CODE_IMAGE="$(IMAGE)" $(COMPOSE) down + COCOINDEX_CODE_IMAGE="$(IMAGE)" $(COMPOSE) up -d + +logs: + $(COMPOSE) logs -f cocoindex-code-daemon + +down: + $(COMPOSE) down || docker rm -f "$${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon}" || true + +reset: down + docker volume rm \ + "$${COCOINDEX_CODE_STATE_VOLUME:-cocoindex-code-local-state}" \ + "$${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime}" \ + 2>/dev/null || true + docker network rm "$${COCOINDEX_CODE_DOCKER_NETWORK:-cocoindex-code-local}" 2>/dev/null || true + rm -rf data + +install-ccc-wrapper: + mkdir -p "$(HOME)/.local/bin" + cp "bin/ccc" "$(HOME)/.local/bin/ccc" + chmod +x "$(HOME)/.local/bin/ccc" diff --git a/sample/README.md b/sample/README.md new file mode 100644 index 0000000..f553588 --- /dev/null +++ b/sample/README.md @@ -0,0 +1,74 @@ +# Repo-Scoped Docker Sample + +This sample runs CocoIndex Code in Docker without mounting your home directory or a broad source tree. + +The wrapper grants access on demand: + +1. `ccc init` must be run inside a Git repository. +2. The wrapper records that Git root as authorized. +3. It starts one central daemon container with only shared state/runtime volumes. +4. Each `ccc` invocation runs a short-lived sidecar with only that repository mounted at `/workspace`. +5. Later commands only run when your current directory is inside an authorized repo. 
+ +Build the image from this branch: + +```bash +cd sample +make build +``` + +Initialize and authorize one repo: + +```bash +cd /path/to/repo +/path/to/cocoindex-code/sample/bin/ccc init --base main +``` + +Index and search from the same repo: + +```bash +/path/to/cocoindex-code/sample/bin/ccc index +/path/to/cocoindex-code/sample/bin/ccc search "query" +/path/to/cocoindex-code/sample/bin/ccc overlay status +``` + +Install the wrapper globally if desired: + +```bash +cd /path/to/cocoindex-code/sample +make install-ccc-wrapper +``` + +Then use it as: + +```bash +cd /path/to/repo +ccc init --base main +ccc index +``` + +Linked worktrees must be authorized separately by running `ccc init` from that worktree. They share layer state when they share the same Git common directory, but each sidecar only receives access to the worktree you initialized. + +```bash +cd /path/to/repo.worktrees/feature-1 +ccc init --base main +ccc index +``` + +State is stored under `sample/data/`: + +- `authorized-repos.tsv`: host-side allowlist written by the wrapper + +Shared Docker state uses named volumes: + +- `cocoindex-code-local-state`: central daemon layer/index/config state mounted at `/var/cocoindex` +- `cocoindex-code-local-runtime`: daemon PID/log runtime files mounted at `/var/run/cocoindex_code` + +Sidecars talk to the central daemon over the private Docker network `cocoindex-code-local`. The daemon listens on `COCOINDEX_CODE_DAEMON_TCP=0.0.0.0:8765` inside that network; no host port is published. 
+ +Stop the central daemon container: + +```bash +cd sample +make down +``` diff --git a/sample/bin/ccc b/sample/bin/ccc new file mode 100755 index 0000000..edd65a6 --- /dev/null +++ b/sample/bin/ccc @@ -0,0 +1,183 @@ +#!/usr/bin/env bash +set -euo pipefail + +image="${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered}" +central_container="${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon}" +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +sample_dir="$(cd "$script_dir/.." && pwd)" +data_dir="${COCOINDEX_CODE_SAMPLE_DATA_DIR:-$sample_dir/data}" +registry="$data_dir/authorized-repos.tsv" +state_dir="$data_dir/state" +runtime_dir="$data_dir/runtime" +state_volume="${COCOINDEX_CODE_STATE_VOLUME:-cocoindex-code-local-state}" +runtime_volume="${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime}" +network="${COCOINDEX_CODE_DOCKER_NETWORK:-cocoindex-code-local}" + +mkdir -p "$data_dir" + +sha_12() { + printf '%s' "$1" | shasum -a 256 | awk '{print substr($1, 1, 12)}' +} + +canonical_path() { + python3 -c 'import pathlib, sys; print(pathlib.Path(sys.argv[1]).resolve())' "$1" +} + +git_root_for() { + git -C "$1" rev-parse --show-toplevel 2>/dev/null | xargs -I{} python3 -c 'import pathlib, sys; print(pathlib.Path(sys.argv[1]).resolve())' "{}" +} + +git_common_dir_for() { + local root="$1" + local raw + raw="$(git -C "$root" rev-parse --git-common-dir)" + if [[ "$raw" = /* ]]; then + canonical_path "$raw" + else + canonical_path "$root/$raw" + fi +} + +is_path_within() { + local child="$1" + local parent="$2" + [[ "$child" == "$parent" || "$child" == "$parent"/* ]] +} + +record_authorization() { + local root="$1" + local common_dir="$2" + local tmp + tmp="$(mktemp)" + if [[ -f "$registry" ]]; then + awk -F '\t' -v root="$root" '$1 != root { print }' "$registry" > "$tmp" + fi + printf '%s\t%s\n' "$root" "$common_dir" >> "$tmp" + mv "$tmp" "$registry" +} + +lookup_authorized_root() { + local cwd="$1" + local best="" + [[ -f "$registry" ]] || 
return 1 + while IFS=$'\t' read -r root _common; do + if is_path_within "$cwd" "$root"; then + if (( ${#root} > ${#best} )); then + best="$root" + fi + fi + done < "$registry" + [[ -n "$best" ]] || return 1 + echo "$best" +} + +common_dir_for_authorized_root() { + local wanted="$1" + awk -F '\t' -v root="$wanted" '$1 == root { print $2; exit }' "$registry" +} + +ensure_image_exists() { + if ! docker image inspect "$image" >/dev/null 2>&1; then + echo "Docker image '$image' does not exist." >&2 + echo "Build it with: cd '$sample_dir' && make build" >&2 + exit 1 + fi +} + +ensure_central_daemon() { + docker network inspect "$network" >/dev/null 2>&1 || docker network create "$network" >/dev/null + if [[ "$(docker inspect -f '{{.State.Running}}' "$central_container" 2>/dev/null || true)" == "true" ]]; then + return + fi + if docker inspect "$central_container" >/dev/null 2>&1; then + docker rm "$central_container" >/dev/null + fi + local run_args=( + run -d + --name "$central_container" + --network "$network" + --volume "$state_volume:/var/cocoindex" + --volume "$runtime_volume:/var/run/cocoindex_code" + -e COCOINDEX_CODE_DAEMON_TCP=0.0.0.0:8765 + -e COCOINDEX_CODE_DIR=/var/cocoindex/config + -e COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state + -e COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code + -e COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db + -e COCOINDEX_CODE_DAEMON_SUPERVISED=1 + ) + if [[ -n "${PUID:-}" ]]; then + run_args+=(-e "PUID=$PUID") + fi + if [[ -n "${PGID:-}" ]]; then + run_args+=(-e "PGID=$PGID") + fi + run_args+=("$image") + docker "${run_args[@]}" >/dev/null +} + +run_sidecar() { + local root="$1" + local common_dir="$2" + local host_cwd="$3" + shift 3 + + local run_args=( + run --rm -i + --network "$network" + --volume "$root:/workspace" + --volume "$state_volume:/var/cocoindex" + --volume "$runtime_volume:/var/run/cocoindex_code" + -e "COCOINDEX_CODE_DAEMON_TCP=$central_container:8765" + -e COCOINDEX_CODE_SIDECAR=1 + -e 
COCOINDEX_CODE_DAEMON_SUPERVISED=1 + -e COCOINDEX_CODE_DIR=/var/cocoindex/config + -e COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state + -e COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code + -e COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db + -e "COCOINDEX_CODE_HOST_PATH_MAPPING=/workspace=$root" + -e "COCOINDEX_CODE_HOST_CWD=$host_cwd" + ) + + if [[ "${1-}" != "mcp" && -t 0 && -t 1 ]]; then + run_args=(run --rm -it "${run_args[@]:3}") + fi + + if [[ "$common_dir" != "$root/.git" ]]; then + run_args+=(--volume "$common_dir:$common_dir:ro") + fi + if [[ -n "${PUID:-}" ]]; then + run_args+=(-e "PUID=$PUID") + fi + if [[ -n "${PGID:-}" ]]; then + run_args+=(-e "PGID=$PGID") + fi + run_args+=("$image" ccc "$@") + exec docker "${run_args[@]}" +} + +ensure_image_exists +ensure_central_daemon + +cwd="$(canonical_path "${COCOINDEX_CODE_HOST_CWD:-$PWD}")" + +if [[ "${1-}" == "init" ]]; then + root="$(git_root_for "$cwd")" + if [[ -z "$root" ]]; then + echo "ccc init must be run inside a Git repository for Docker authorization." >&2 + exit 1 + fi + common_dir="$(git_common_dir_for "$root")" + record_authorization "$root" "$common_dir" + run_sidecar "$root" "$common_dir" "$cwd" "$@" +fi + +root="$(lookup_authorized_root "$cwd" || true)" +if [[ -z "$root" ]]; then + echo "This path has not been authorized for Docker-backed ccc access:" >&2 + echo " $cwd" >&2 + echo "Run ccc init from the Git repo root or a subdirectory first." >&2 + exit 1 +fi + +common_dir="$(common_dir_for_authorized_root "$root")" +run_sidecar "$root" "$common_dir" "$cwd" "$@" diff --git a/sample/docker-compose.yml b/sample/docker-compose.yml new file mode 100644 index 0000000..3520642 --- /dev/null +++ b/sample/docker-compose.yml @@ -0,0 +1,42 @@ +# Central daemon compose file for the sidecar Docker model. +# +# This container does not mount source code. It only owns shared state/runtime +# volumes. 
Repo access happens through short-lived sidecars started by +# `sample/bin/ccc` after `ccc init` authorizes a specific Git repo. + +services: + cocoindex-code-daemon: + image: ${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered} + container_name: ${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon} + volumes: + - cocoindex-code-local-state:/var/cocoindex + - cocoindex-code-local-runtime:/var/run/cocoindex_code + environment: + COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765 + COCOINDEX_CODE_DIR: /var/cocoindex/config + COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state + COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code + COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db + COCOINDEX_CODE_DAEMON_SUPERVISED: "1" + PUID: ${PUID:-} + PGID: ${PGID:-} + healthcheck: + test: + [ + "CMD-SHELL", + "ccc daemon status >/dev/null 2>&1 || test -S /var/run/cocoindex_code/daemon.sock", + ] + interval: 10s + timeout: 5s + retries: 12 + start_period: 10s + networks: + - cocoindex-code-local + +volumes: + cocoindex-code-local-state: + cocoindex-code-local-runtime: + +networks: + cocoindex-code-local: + name: cocoindex-code-local diff --git a/src/cocoindex_code/_daemon_paths.py b/src/cocoindex_code/_daemon_paths.py index 75d8c79..82b8e3f 100644 --- a/src/cocoindex_code/_daemon_paths.py +++ b/src/cocoindex_code/_daemon_paths.py @@ -9,6 +9,7 @@ import os import sys from pathlib import Path +from typing import TypeAlias from .settings import user_settings_dir @@ -48,13 +49,22 @@ def daemon_state_dir() -> Path: return Path.home() / ".local" / "share" / "cocoindex-code" +DaemonAddress: TypeAlias = str | tuple[str, int] + + def connection_family() -> str: """Return the multiprocessing connection family for this platform.""" + if os.environ.get("COCOINDEX_CODE_DAEMON_TCP"): + return "AF_INET" return "AF_PIPE" if sys.platform == "win32" else "AF_UNIX" -def daemon_socket_path() -> str: +def daemon_socket_path() -> DaemonAddress: """Return the daemon socket/pipe address.""" + tcp 
= os.environ.get("COCOINDEX_CODE_DAEMON_TCP") + if tcp: + host, _, port = tcp.partition(":") + return (host or "127.0.0.1", int(port or "8765")) if sys.platform == "win32": import hashlib diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 6f326f0..95c9859 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio import functools import os import sys @@ -217,6 +218,8 @@ def _run_index_with_progress( from rich.spinner import Spinner as _Spinner from . import client as _client + from .protocol import IndexResponse as _IndexResponse + from .sidecar import run_sidecar_index, sidecar_enabled err_console = _Console(stderr=True) last_progress_line: str | None = None @@ -237,13 +240,24 @@ def _on_progress(progress: IndexingProgress) -> None: live.update(_Spinner("dots", last_progress_line)) try: - resp = _client.index( - project_root, - cwd=cwd, - base_ref=base_ref, - on_progress=_on_progress, - on_waiting=_on_waiting, - ) + if sidecar_enabled(): + asyncio.run( + run_sidecar_index( + project_root=Path(project_root), + cwd=Path(cwd) if cwd is not None else Path(project_root), + base_ref=base_ref, + on_progress=_on_progress, + ) + ) + resp = _IndexResponse(success=True) + else: + resp = _client.index( + project_root, + cwd=cwd, + base_ref=base_ref, + on_progress=_on_progress, + on_waiting=_on_waiting, + ) except RuntimeError as e: live.stop() # Let DaemonStartError propagate to the decorator for consistent handling. @@ -277,6 +291,7 @@ def _search_with_wait_spinner( from rich.spinner import Spinner as _Spinner from . 
import client as _client + from .sidecar import ensure_sidecar_layer_ids, sidecar_enabled err_console = _Console(stderr=True) @@ -288,11 +303,22 @@ def _on_waiting() -> None: refresh=True, ) + layer_ids = None + if sidecar_enabled(): + layer_ids = asyncio.run( + ensure_sidecar_layer_ids( + project_root=Path(project_root), + cwd=Path(cwd) if cwd is not None else Path(project_root), + base_ref=base_ref, + ) + ) + resp = _client.search( project_root=project_root, query=query, cwd=cwd, base_ref=base_ref, + layer_ids=layer_ids, languages=languages, paths=paths, limit=limit, diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index 7fbfa70..80d23bc 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -158,7 +158,7 @@ def _connect_and_handshake() -> Connection: def _raw_connect_and_handshake() -> Connection: """Low-level connect + handshake without auto-start logic.""" sock = daemon_socket_path() - if sys.platform != "win32" and not os.path.exists(sock): + if connection_family() == "AF_UNIX" and isinstance(sock, str) and not os.path.exists(sock): raise ConnectionRefusedError(f"Daemon socket not found: {sock}") try: conn = Client(sock, family=connection_family()) @@ -186,6 +186,34 @@ def _raw_connect_and_handshake() -> Connection: return conn +def _raw_connect_version_handshake_only() -> Connection: + """Connect and perform only protocol-version validation. + + Used for shutdown: if settings changed, the normal handshake intentionally + raises ``DaemonVersionError`` so regular requests restart, but shutdown must + still be able to tell the old daemon to exit. 
+ """ + sock = daemon_socket_path() + conn = Client(sock, family=connection_family()) + try: + conn.send_bytes(encode_request(HandshakeRequest(version=__version__))) + data = conn.recv_bytes() + resp = decode_response(data) + except Exception: + conn.close() + raise + if isinstance(resp, ErrorResponse): + conn.close() + raise RuntimeError(f"Daemon error: {resp.message}") + if not isinstance(resp, HandshakeResponse): + conn.close() + raise RuntimeError(f"Unexpected handshake response: {type(resp).__name__}") + if not resp.ok: + conn.close() + raise DaemonVersionError(resp) + return conn + + class DaemonVersionError(RuntimeError): """Raised when the daemon has a version or settings mismatch. @@ -288,6 +316,7 @@ def search( query: str, cwd: str | None = None, base_ref: str | None = None, + layer_ids: list[str] | None = None, languages: list[str] | None = None, paths: list[str] | None = None, limit: int = 5, @@ -312,6 +341,7 @@ def search( query=query, cwd=cwd, base_ref=base_ref, + layer_ids=layer_ids, languages=languages, paths=paths, limit=limit, @@ -415,14 +445,15 @@ def doctor( def is_daemon_running() -> bool: """Check if the daemon is running.""" - if sys.platform == "win32": + if connection_family() != "AF_UNIX": try: conn = Client(daemon_socket_path(), family=connection_family()) conn.close() return True except (ConnectionRefusedError, OSError): return False - return os.path.exists(daemon_socket_path()) + sock = daemon_socket_path() + return isinstance(sock, str) and os.path.exists(sock) def start_daemon() -> subprocess.Popen[bytes]: @@ -529,7 +560,7 @@ def stop_daemon() -> None: # 1) Graceful StopRequest via socket (bypass auto-start) try: - conn = _raw_connect_and_handshake() + conn = _raw_connect_version_handshake_only() try: conn.send_bytes(encode_request(StopRequest())) conn.recv_bytes() @@ -564,10 +595,11 @@ def _cleanup_stale_files(pid_path: Path, pid: int | None) -> None: """Remove socket and PID file after the daemon has exited.""" if sys.platform != 
"win32": sock = daemon_socket_path() - try: - Path(sock).unlink(missing_ok=True) - except Exception: - pass + if isinstance(sock, str): + try: + Path(sock).unlink(missing_ok=True) + except Exception: + pass if pid is not None: try: stored = pid_path.read_text().strip() @@ -601,7 +633,7 @@ def _wait_for_daemon( deadline = time.monotonic() + timeout sock_path = daemon_socket_path() while time.monotonic() < deadline: - if sys.platform == "win32": + if connection_family() != "AF_UNIX": try: conn = Client(sock_path, family=connection_family()) conn.close() @@ -609,7 +641,7 @@ def _wait_for_daemon( except (ConnectionRefusedError, OSError): pass else: - if os.path.exists(sock_path): + if isinstance(sock_path, str) and os.path.exists(sock_path): return # Daemon socket not yet up — if we spawned a subprocess that already diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index 9b31d84..f5fa3ef 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -29,6 +29,7 @@ from .git_context import GitContextError, resolve_worktree_context from .layer_store import LayerStore from .layered_project import LayeredProject, build_index_config_hash +from .layers import LayerBuildResult, LayerRuntime from .project import Project from .protocol import ( DaemonEnvRequest, @@ -58,6 +59,7 @@ Response, SearchRequest, SearchResponse, + SearchResult, SearchStreamResponse, StopRequest, StopResponse, @@ -245,6 +247,61 @@ def list_projects(self) -> list[DaemonProjectInfo]: for root, project in self._projects.items() ] + async def search_layers( + self, + *, + project_root: str, + layer_ids: list[str], + query: str, + languages: list[str] | None, + paths: list[str] | None, + limit: int, + offset: int, + ) -> list[SearchResult]: + if self._embedder is None: + raise RuntimeError( + "Daemon has no global settings loaded. Run `ccc init` to set up cocoindex-code." 
+ ) + stack = [] + for layer_id in layer_ids: + layer = self.layer_store.get_layer(layer_id) + if layer is None: + raise RuntimeError(f"Layer not found: {layer_id}") + manifest = self.layer_store.get_manifest(layer_id) + if manifest is None: + raise RuntimeError(f"Layer manifest not found: {layer_id}") + runtime = await LayerRuntime.create( + layer=layer, + project_root=Path(project_root), + embedder=self._embedder, + indexing_params=self.indexing_params, + query_params=self.query_params, + chunker_registry={}, + project_cache=self._layer_project_cache, + ) + stack.append(LayerBuildResult(layer=layer, manifest=manifest, runtime=runtime)) + + from .layers.layer_stack import LayerStack + + layer_stack = LayerStack( + project_root=Path(project_root), + state_dir=self.state_dir, + store=self.layer_store, + embedder=self._embedder, + indexing_params=self.indexing_params, + query_params=self.query_params, + chunker_registry={}, + project_cache=self._layer_project_cache, + ) + return await layer_stack.search( + layers=stack, + query=query, + languages=languages, + paths=paths, + limit=limit, + offset=offset, + ) + # --------------------------------------------------------------------------- # Connection handler @@ -532,6 +589,23 @@ async def _dispatch( return project.stream_index() if isinstance(req, SearchRequest): + if req.layer_ids is not None: + results = await registry.search_layers( + project_root=req.project_root, + layer_ids=req.layer_ids, + query=req.query, + languages=req.languages, + paths=req.paths, + limit=req.limit, + offset=req.offset, + ) + return SearchResponse( + success=True, + results=results, + total_returned=len(results), + offset=req.offset, + ) + project = await registry.get_project( req.project_root, cwd=req.cwd, base_ref=req.base_ref ) @@ -715,7 +789,7 @@ def run_daemon() -> None: ) sock_path = daemon_socket_path() - if sys.platform != "win32": + if connection_family() == "AF_UNIX" and isinstance(sock_path, str): try: 
Path(sock_path).unlink(missing_ok=True) except Exception: @@ -785,7 +859,7 @@ def _accept_loop() -> None: loop.close() # 4. Remove socket and PID file. - if sys.platform != "win32": + if connection_family() == "AF_UNIX" and isinstance(sock_path, str): try: Path(sock_path).unlink(missing_ok=True) except Exception: diff --git a/src/cocoindex_code/layered_project.py b/src/cocoindex_code/layered_project.py index 4c736e8..bc48658 100644 --- a/src/cocoindex_code/layered_project.py +++ b/src/cocoindex_code/layered_project.py @@ -154,6 +154,14 @@ async def search( offset=offset, ) + async def ensure_layer_ids( + self, + on_progress: Callable[[IndexingProgress], None] | None = None, + ) -> list[str]: + layers = await self._ensure_layers(on_progress=on_progress) + self._last_layers = layers + return [layer.layer.id for layer in layers] + def get_status(self) -> ProjectStatusResponse: total_chunks = 0 total_files_set: set[str] = set() diff --git a/src/cocoindex_code/protocol.py b/src/cocoindex_code/protocol.py index 52e7e7c..7425d46 100644 --- a/src/cocoindex_code/protocol.py +++ b/src/cocoindex_code/protocol.py @@ -24,6 +24,7 @@ class SearchRequest(_msgspec.Struct, tag="search"): query: str cwd: str | None = None base_ref: str | None = None + layer_ids: list[str] | None = None languages: list[str] | None = None paths: list[str] | None = None limit: int = 5 diff --git a/src/cocoindex_code/sidecar.py b/src/cocoindex_code/sidecar.py new file mode 100644 index 0000000..9d25891 --- /dev/null +++ b/src/cocoindex_code/sidecar.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import os +from collections.abc import Callable +from pathlib import Path + +from ._daemon_paths import daemon_state_dir +from .daemon import _resolve_chunker_registry +from .embedder_params import resolve_embedder_params +from .layer_store import LayerStore +from .layered_project import LayeredProject +from .protocol import IndexingProgress +from .settings import load_project_settings, 
load_user_settings +from .shared import create_embedder + + +def sidecar_enabled() -> bool: + return os.environ.get("COCOINDEX_CODE_SIDECAR") == "1" + + +async def ensure_sidecar_layer_ids( + *, + project_root: Path, + cwd: Path, + base_ref: str | None, + on_progress: Callable[[IndexingProgress], None] | None = None, +) -> list[str]: + user_settings = load_user_settings() + for key, value in user_settings.envs.items(): + os.environ[key] = value + params = resolve_embedder_params(user_settings.embedding) + project_settings = load_project_settings(project_root) + state_dir = daemon_state_dir() + project = LayeredProject( + project_root=project_root, + cwd=cwd, + base_ref=base_ref, + state_dir=state_dir, + store=LayerStore(state_dir / "daemon.db"), + embedder=create_embedder(user_settings.embedding, indexing_params=params.indexing), + indexing_params=params.indexing, + query_params=params.query, + chunker_registry=_resolve_chunker_registry(project_settings.chunkers), + project_cache={}, + ) + try: + return await project.ensure_layer_ids(on_progress=on_progress) + finally: + project.close() + + +async def run_sidecar_index( + *, + project_root: Path, + cwd: Path, + base_ref: str | None, + on_progress: Callable[[IndexingProgress], None] | None = None, +) -> None: + await ensure_sidecar_layer_ids( + project_root=project_root, + cwd=cwd, + base_ref=base_ref, + on_progress=on_progress, + ) diff --git a/tests/test_daemon_paths.py b/tests/test_daemon_paths.py new file mode 100644 index 0000000..cfaa7c6 --- /dev/null +++ b/tests/test_daemon_paths.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from cocoindex_code import _daemon_paths + + +def test_default_daemon_address_uses_unix_socket( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.delenv("COCOINDEX_CODE_DAEMON_TCP", raising=False) + monkeypatch.setenv("COCOINDEX_CODE_RUNTIME_DIR", str(tmp_path)) + monkeypatch.setattr(_daemon_paths.sys, 
"platform", "linux") + + assert _daemon_paths.connection_family() == "AF_UNIX" + assert _daemon_paths.daemon_socket_path() == str(tmp_path / "daemon.sock") + + +def test_daemon_tcp_env_switches_to_af_inet(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("COCOINDEX_CODE_DAEMON_TCP", "daemon:8765") + + assert _daemon_paths.connection_family() == "AF_INET" + assert _daemon_paths.daemon_socket_path() == ("daemon", 8765) + + +def test_daemon_tcp_defaults_empty_host_and_port(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("COCOINDEX_CODE_DAEMON_TCP", ":") + + assert _daemon_paths.daemon_socket_path() == ("127.0.0.1", 8765) diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index 57c3ab6..d612203 100644 --- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -34,6 +34,7 @@ def test_docker_entrypoint_prepares_state_db_cache_and_runtime_dirs() -> None: assert '"$SENTENCE_TRANSFORMERS_HOME"' in content assert '"$COCOINDEX_CODE_RUNTIME_DIR"' in content assert "chown -R coco:coco /var/cocoindex" in content + assert 'exec gosu coco "$@"' in content def test_docker_compose_exposes_local_use_knobs_and_healthcheck() -> None: @@ -62,6 +63,8 @@ def test_docker_compose_exposes_local_use_knobs_and_healthcheck() -> None: def test_readme_documents_docker_state_runtime_and_host_cwd_mapping() -> None: content = (REPO_ROOT / "README.md").read_text() + assert "central daemon + on-demand sidecar" in content + assert "Sidecars mount only the authorized repo" in content assert "COCOINDEX_CODE_HOST_CWD=\"$PWD\"" in content assert "docker exec \"${flags[@]}\"" in content assert "ccc mcp" in content @@ -73,6 +76,42 @@ def test_readme_documents_docker_state_runtime_and_host_cwd_mapping() -> None: assert "COCOINDEX_CODE_HOST_PATH_MAPPING" in content +def test_docker_sidecar_docs_describe_repo_scoped_architecture() -> None: + content = (REPO_ROOT / "docs" / "docker-layered-indexing.md").read_text() + + assert "one central daemon container 
with no source-code mount" in content + assert "short-lived sidecar containers" in content + assert "Do not mount `$HOME` or a broad source tree" in content + assert "COCOINDEX_CODE_DAEMON_TCP" in content + assert "COCOINDEX_CODE_SIDECAR=1" in content + + +def test_sample_compose_uses_daemon_without_source_mount() -> None: + content = (REPO_ROOT / "sample" / "docker-compose.yml").read_text() + + assert ":/workspace" not in content + assert "COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765" in content + assert "cocoindex-code-local-state:/var/cocoindex" in content + assert "cocoindex-code-local-runtime:/var/run/cocoindex_code" in content + + +def test_sample_wrapper_mounts_only_authorized_repo_sidecar() -> None: + content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() + + assert 'record_authorization "$root" "$common_dir"' in content + assert '--volume "$root:/workspace"' in content + assert "COCOINDEX_CODE_SIDECAR=1" in content + assert 'COCOINDEX_CODE_DAEMON_TCP=$central_container:8765' in content + + +def test_sample_makefile_has_default_image_and_reset_target() -> None: + content = (REPO_ROOT / "sample" / "Makefile").read_text() + + assert "IMAGE ?= cocoindex-code:local-layered" in content + assert "reset: down" in content + assert "docker volume rm" in content + + def test_docker_compose_config_is_valid(tmp_path: Path) -> None: if shutil.which("docker") is None: pytest.skip("Docker CLI not available") diff --git a/tests/test_protocol.py b/tests/test_protocol.py index c38d07d..91c69b3 100644 --- a/tests/test_protocol.py +++ b/tests/test_protocol.py @@ -51,6 +51,7 @@ def test_encode_decode_search_request_with_defaults() -> None: assert isinstance(decoded, SearchRequest) assert decoded.cwd is None assert decoded.base_ref is None + assert decoded.layer_ids is None assert decoded.languages is None assert decoded.limit == 5 assert decoded.offset == 0 @@ -66,6 +67,7 @@ def test_encode_decode_search_request_with_all_fields() -> None: offset=5, cwd="/tmp/proj/src", 
base_ref="main", + layer_ids=["base-layer", "branch-layer", "dirty-layer"], ) data = encode_request(req) decoded = decode_request(data) @@ -78,6 +80,7 @@ def test_encode_decode_search_request_with_all_fields() -> None: assert decoded.offset == 5 assert decoded.cwd == "/tmp/proj/src" assert decoded.base_ref == "main" + assert decoded.layer_ids == ["base-layer", "branch-layer", "dirty-layer"] def test_encode_decode_search_response_with_results() -> None: From 2a6a39887c793d862160458e22bb3f15f0b62e5f Mon Sep 17 00:00:00 2001 From: "rudimar.ronsoni" Date: Wed, 20 May 2026 00:47:30 +0200 Subject: [PATCH 03/15] Expand Docker sidecar test coverage --- tests/test_cli_helpers.py | 111 +++++++++++++++++++++++++++++++ tests/test_client.py | 130 ++++++++++++++++++++++++++++++++++++ tests/test_daemon_paths.py | 36 ++++++++++ tests/test_docker_setup.py | 28 ++++++++ tests/test_sidecar.py | 131 +++++++++++++++++++++++++++++++++++++ 5 files changed, 436 insertions(+) create mode 100644 tests/test_sidecar.py diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py index ec9876a..7570b6f 100644 --- a/tests/test_cli_helpers.py +++ b/tests/test_cli_helpers.py @@ -13,6 +13,7 @@ require_project_root, resolve_default_path, ) +from cocoindex_code.protocol import SearchResponse def test_require_project_root_success(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: @@ -206,6 +207,116 @@ def test_apply_host_cwd_noop_when_unset( assert capsys.readouterr().err == "" +def test_search_with_wait_spinner_resolves_sidecar_layers_before_daemon_search( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + import cocoindex_code.client as client + import cocoindex_code.sidecar as sidecar + + async def fake_ensure_sidecar_layer_ids(**kwargs: object) -> list[str]: + captured["ensure_kwargs"] = kwargs + return ["base", "dirty"] + + def fake_search(**kwargs: object) -> SearchResponse: + captured["search_kwargs"] = kwargs + return SearchResponse(success=True) + + captured: 
dict[str, object] = {} + monkeypatch.setattr(sidecar, "sidecar_enabled", lambda: True) + monkeypatch.setattr(sidecar, "ensure_sidecar_layer_ids", fake_ensure_sidecar_layer_ids) + monkeypatch.setattr(client, "search", fake_search) + + resp = cli._search_with_wait_spinner( + project_root=str(tmp_path / "repo"), + cwd=str(tmp_path / "repo" / "src"), + base_ref="main", + query="hello", + languages=["python"], + paths=["src/*"], + limit=3, + offset=1, + ) + + assert resp.success is True + ensure_kwargs = captured["ensure_kwargs"] + assert isinstance(ensure_kwargs, dict) + assert ensure_kwargs["project_root"] == tmp_path / "repo" + assert ensure_kwargs["cwd"] == tmp_path / "repo" / "src" + assert ensure_kwargs["base_ref"] == "main" + + search_kwargs = captured["search_kwargs"] + assert isinstance(search_kwargs, dict) + assert search_kwargs["project_root"] == str(tmp_path / "repo") + assert search_kwargs["cwd"] == str(tmp_path / "repo" / "src") + assert search_kwargs["base_ref"] == "main" + assert search_kwargs["layer_ids"] == ["base", "dirty"] + assert search_kwargs["languages"] == ["python"] + assert search_kwargs["paths"] == ["src/*"] + assert search_kwargs["limit"] == 3 + assert search_kwargs["offset"] == 1 + + +def test_search_with_wait_spinner_omits_layer_ids_outside_sidecar( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + import cocoindex_code.client as client + import cocoindex_code.sidecar as sidecar + + def fail_ensure(**_kwargs: object) -> list[str]: + raise AssertionError("sidecar layer resolution should not run") + + def fake_search(**kwargs: object) -> SearchResponse: + captured["search_kwargs"] = kwargs + return SearchResponse(success=True) + + captured: dict[str, object] = {} + monkeypatch.setattr(sidecar, "sidecar_enabled", lambda: False) + monkeypatch.setattr(sidecar, "ensure_sidecar_layer_ids", fail_ensure) + monkeypatch.setattr(client, "search", fake_search) + + resp = cli._search_with_wait_spinner( + project_root=str(tmp_path / 
"repo"), + query="hello", + ) + + assert resp.success is True + search_kwargs = captured["search_kwargs"] + assert isinstance(search_kwargs, dict) + assert search_kwargs["layer_ids"] is None + + +def test_run_index_with_progress_uses_sidecar_indexer( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + import cocoindex_code.client as client + import cocoindex_code.sidecar as sidecar + + async def fake_run_sidecar_index(**kwargs: object) -> None: + captured["index_kwargs"] = kwargs + + def fail_client_index(*_args: object, **_kwargs: object) -> object: + raise AssertionError("daemon index should not run in sidecar mode") + + captured: dict[str, object] = {} + monkeypatch.setattr(sidecar, "sidecar_enabled", lambda: True) + monkeypatch.setattr(sidecar, "run_sidecar_index", fake_run_sidecar_index) + monkeypatch.setattr(client, "index", fail_client_index) + + cli._run_index_with_progress( + str(tmp_path / "repo"), + cwd=str(tmp_path / "repo" / "src"), + base_ref="main", + ) + + kwargs = captured["index_kwargs"] + assert isinstance(kwargs, dict) + assert kwargs["project_root"] == tmp_path / "repo" + assert kwargs["cwd"] == tmp_path / "repo" / "src" + assert kwargs["base_ref"] == "main" + assert callable(kwargs["on_progress"]) + assert "Indexing failed" not in capsys.readouterr().err + + # --------------------------------------------------------------------------- # ccc init — auto-populate indexing_params / query_params from curated table # --------------------------------------------------------------------------- diff --git a/tests/test_client.py b/tests/test_client.py index 340fbe6..3480079 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -8,6 +8,30 @@ import pytest from cocoindex_code import client +from cocoindex_code._version import __version__ +from cocoindex_code.protocol import ( + HandshakeResponse, + SearchRequest, + SearchResponse, + decode_request, + encode_response, +) + + +class _FakeConnection: 
+ def __init__(self, responses: list[bytes]) -> None: + self.responses = responses + self.sent: list[bytes] = [] + self.closed = False + + def send_bytes(self, data: bytes) -> None: + self.sent.append(data) + + def recv_bytes(self) -> bytes: + return self.responses.pop(0) + + def close(self) -> None: + self.closed = True def test_client_connect_refuses_when_no_daemon( @@ -77,3 +101,109 @@ def test_print_handshake_warnings_no_warnings_prints_nothing( monkeypatch.setattr(client, "_surfaced_warnings", set()) client._print_handshake_warnings(HandshakeResponse(ok=True, daemon_version="x")) assert capsys.readouterr().err == "" + + +def test_search_sends_layer_ids_and_filters(monkeypatch: pytest.MonkeyPatch) -> None: + conn = _FakeConnection([encode_response(SearchResponse(success=True))]) + monkeypatch.setattr(client, "_connect_and_handshake", lambda: conn) + + resp = client.search( + project_root="/tmp/project", + cwd="/tmp/project/src", + base_ref="main", + query="hello", + layer_ids=["base", "branch", "dirty"], + languages=["python"], + paths=["src/*"], + limit=7, + offset=2, + ) + + assert resp.success is True + assert conn.closed is True + assert len(conn.sent) == 1 + req = decode_request(conn.sent[0]) + assert isinstance(req, SearchRequest) + assert req.project_root == "/tmp/project" + assert req.cwd == "/tmp/project/src" + assert req.base_ref == "main" + assert req.layer_ids == ["base", "branch", "dirty"] + assert req.languages == ["python"] + assert req.paths == ["src/*"] + assert req.limit == 7 + assert req.offset == 2 + + +def test_raw_connect_version_handshake_only_ignores_settings_mismatch( + monkeypatch: pytest.MonkeyPatch, +) -> None: + conn = _FakeConnection( + [ + encode_response( + HandshakeResponse( + ok=True, + daemon_version=__version__, + global_settings_mtime_us=-1, + ) + ) + ] + ) + + monkeypatch.setattr(client, "Client", lambda *_args, **_kwargs: conn) + monkeypatch.setattr(client, "connection_family", lambda: "AF_INET") + 
monkeypatch.setattr(client, "daemon_socket_path", lambda: ("daemon", 8765)) + + returned = client._raw_connect_version_handshake_only() + + assert returned is conn + assert len(conn.sent) == 1 + + +def test_raw_connect_version_handshake_only_rejects_protocol_mismatch( + monkeypatch: pytest.MonkeyPatch, +) -> None: + conn = _FakeConnection( + [encode_response(HandshakeResponse(ok=False, daemon_version="old"))] + ) + + monkeypatch.setattr(client, "Client", lambda *_args, **_kwargs: conn) + monkeypatch.setattr(client, "connection_family", lambda: "AF_INET") + monkeypatch.setattr(client, "daemon_socket_path", lambda: ("daemon", 8765)) + + with pytest.raises(client.DaemonVersionError): + client._raw_connect_version_handshake_only() + + assert conn.closed is True + + +def test_raw_connect_and_handshake_rejects_restart_needed( + monkeypatch: pytest.MonkeyPatch, +) -> None: + conn = _FakeConnection( + [encode_response(HandshakeResponse(ok=True, daemon_version=__version__))] + ) + + monkeypatch.setattr(client, "Client", lambda *_args, **_kwargs: conn) + monkeypatch.setattr(client, "connection_family", lambda: "AF_INET") + monkeypatch.setattr(client, "daemon_socket_path", lambda: ("daemon", 8765)) + monkeypatch.setattr(client, "_needs_restart", lambda _resp: True) + + with pytest.raises(client.DaemonVersionError): + client._raw_connect_and_handshake() + + assert conn.closed is True + + +def test_raw_connect_version_handshake_only_closes_on_unexpected_response( + monkeypatch: pytest.MonkeyPatch, +) -> None: + conn = _FakeConnection([encode_response(SearchResponse(success=True))]) + + monkeypatch.setattr(client, "Client", lambda *_args, **_kwargs: conn) + monkeypatch.setattr(client, "connection_family", lambda: "AF_INET") + monkeypatch.setattr(client, "daemon_socket_path", lambda: ("daemon", 8765)) + + with pytest.raises(RuntimeError, match="Unexpected handshake response"): + client._raw_connect_version_handshake_only() + + assert conn.closed is True diff --git 
a/tests/test_daemon_paths.py b/tests/test_daemon_paths.py index cfaa7c6..257beb3 100644 --- a/tests/test_daemon_paths.py +++ b/tests/test_daemon_paths.py @@ -29,3 +29,39 @@ def test_daemon_tcp_defaults_empty_host_and_port(monkeypatch: pytest.MonkeyPatch monkeypatch.setenv("COCOINDEX_CODE_DAEMON_TCP", ":") assert _daemon_paths.daemon_socket_path() == ("127.0.0.1", 8765) + + +def test_daemon_tcp_defaults_missing_port(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("COCOINDEX_CODE_DAEMON_TCP", "daemon") + + assert _daemon_paths.daemon_socket_path() == ("daemon", 8765) + + +def test_daemon_tcp_rejects_non_numeric_port(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("COCOINDEX_CODE_DAEMON_TCP", "daemon:not-a-port") + + with pytest.raises(ValueError): + _daemon_paths.daemon_socket_path() + + +def test_tcp_env_takes_precedence_over_windows_pipe( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(_daemon_paths.sys, "platform", "win32") + monkeypatch.setenv("COCOINDEX_CODE_DAEMON_TCP", "daemon:9000") + + assert _daemon_paths.connection_family() == "AF_INET" + assert _daemon_paths.daemon_socket_path() == ("daemon", 9000) + + +def test_windows_pipe_name_includes_runtime_dir_hash( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.delenv("COCOINDEX_CODE_DAEMON_TCP", raising=False) + monkeypatch.setattr(_daemon_paths.sys, "platform", "win32") + monkeypatch.setenv("COCOINDEX_CODE_RUNTIME_DIR", str(tmp_path)) + + assert _daemon_paths.connection_family() == "AF_PIPE" + pipe = _daemon_paths.daemon_socket_path() + assert isinstance(pipe, str) + assert pipe.startswith(r"\\.\pipe\cocoindex_code_") diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index d612203..ad078f7 100644 --- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -90,6 +90,7 @@ def test_sample_compose_uses_daemon_without_source_mount() -> None: content = (REPO_ROOT / "sample" / "docker-compose.yml").read_text() 
assert ":/workspace" not in content + assert "ports:" not in content assert "COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765" in content assert "cocoindex-code-local-state:/var/cocoindex" in content assert "cocoindex-code-local-runtime:/var/run/cocoindex_code" in content @@ -100,8 +101,35 @@ def test_sample_wrapper_mounts_only_authorized_repo_sidecar() -> None: assert 'record_authorization "$root" "$common_dir"' in content assert '--volume "$root:/workspace"' in content + assert '--network "$network"' in content assert "COCOINDEX_CODE_SIDECAR=1" in content + assert "COCOINDEX_CODE_DAEMON_SUPERVISED=1" in content assert 'COCOINDEX_CODE_DAEMON_TCP=$central_container:8765' in content + assert 'exec docker "${run_args[@]}"' in content + + +def test_sample_wrapper_authorization_handles_nested_repos_and_worktrees() -> None: + content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() + + assert 'if (( ${#root} > ${#best} )); then' in content + assert 'git_common_dir_for()' in content + assert 'common_dir="$(git_common_dir_for "$root")"' in content + assert 'if [[ "$common_dir" != "$root/.git" ]]; then' in content + assert '--volume "$common_dir:$common_dir:ro"' in content + + +def test_sample_wrapper_refuses_unauthorized_paths_and_requires_git_for_init() -> None: + content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() + + assert "ccc init must be run inside a Git repository for Docker authorization." in content + assert "This path has not been authorized for Docker-backed ccc access:" in content + assert "Run ccc init from the Git repo root or a subdirectory first." 
in content + + +def test_sample_gitignore_excludes_runtime_authorization_state() -> None: + content = (REPO_ROOT / "sample" / ".gitignore").read_text() + + assert "data/" in content def test_sample_makefile_has_default_image_and_reset_target() -> None: diff --git a/tests/test_sidecar.py b/tests/test_sidecar.py new file mode 100644 index 0000000..8884d09 --- /dev/null +++ b/tests/test_sidecar.py @@ -0,0 +1,131 @@ +from __future__ import annotations + +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +import pytest + +from cocoindex_code import sidecar +from cocoindex_code.protocol import IndexingProgress +from cocoindex_code.settings import EmbeddingSettings, ProjectSettings, UserSettings + + +class _FakeLayeredProject: + created: list[_FakeLayeredProject] = [] + + def __init__(self, **kwargs: Any) -> None: + self.kwargs = kwargs + self.closed = False + _FakeLayeredProject.created.append(self) + + async def ensure_layer_ids(self, on_progress: Any = None) -> list[str]: + if on_progress is not None: + on_progress( + IndexingProgress( + num_execution_starts=1, + num_unchanged=0, + num_adds=1, + num_deletes=0, + num_reprocesses=0, + num_errors=0, + ) + ) + return ["base", "branch", "dirty"] + + def close(self) -> None: + self.closed = True + + +class _FailingLayeredProject(_FakeLayeredProject): + async def ensure_layer_ids(self, on_progress: Any = None) -> list[str]: + raise RuntimeError("boom") + + +@pytest.mark.asyncio +async def test_ensure_sidecar_layer_ids_sets_env_and_closes_project( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _FakeLayeredProject.created.clear() + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + monkeypatch.setattr( + sidecar, + "load_user_settings", + lambda: UserSettings( + embedding=EmbeddingSettings(provider="litellm", model="test-model"), + envs={"TEST_SIDECAR_ENV": "value"}, + ), + ) + monkeypatch.setattr( + sidecar, + "resolve_embedder_params", + lambda 
_embedding: SimpleNamespace(indexing={"input": "doc"}, query={"input": "query"}), + ) + monkeypatch.setattr(sidecar, "load_project_settings", lambda _root: ProjectSettings()) + monkeypatch.setattr(sidecar, "create_embedder", lambda *_args, **_kwargs: object()) + monkeypatch.setattr(sidecar, "_resolve_chunker_registry", lambda _chunkers: {".x": object()}) + monkeypatch.setattr(sidecar, "LayeredProject", _FakeLayeredProject) + + progress: list[IndexingProgress] = [] + layer_ids = await sidecar.ensure_sidecar_layer_ids( + project_root=tmp_path / "repo", + cwd=tmp_path / "repo" / "src", + base_ref="main", + on_progress=progress.append, + ) + + assert layer_ids == ["base", "branch", "dirty"] + assert progress and progress[0].num_adds == 1 + assert sidecar.os.environ["TEST_SIDECAR_ENV"] == "value" + [project] = _FakeLayeredProject.created + assert project.closed is True + assert project.kwargs["project_root"] == tmp_path / "repo" + assert project.kwargs["cwd"] == tmp_path / "repo" / "src" + assert project.kwargs["base_ref"] == "main" + assert project.kwargs["indexing_params"] == {"input": "doc"} + assert project.kwargs["query_params"] == {"input": "query"} + assert ".x" in project.kwargs["chunker_registry"] + assert project.kwargs["project_cache"] == {} + + +@pytest.mark.asyncio +async def test_ensure_sidecar_layer_ids_closes_project_on_failure( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _FailingLayeredProject.created.clear() + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + monkeypatch.setattr( + sidecar, + "load_user_settings", + lambda: UserSettings(embedding=EmbeddingSettings(provider="litellm", model="test-model")), + ) + monkeypatch.setattr( + sidecar, + "resolve_embedder_params", + lambda _embedding: SimpleNamespace(indexing={}, query={}), + ) + monkeypatch.setattr(sidecar, "load_project_settings", lambda _root: ProjectSettings()) + monkeypatch.setattr(sidecar, "create_embedder", lambda *_args, **_kwargs: object()) + 
monkeypatch.setattr(sidecar, "_resolve_chunker_registry", lambda _chunkers: {}) + monkeypatch.setattr(sidecar, "LayeredProject", _FailingLayeredProject) + + with pytest.raises(RuntimeError, match="boom"): + await sidecar.ensure_sidecar_layer_ids( + project_root=tmp_path / "repo", + cwd=tmp_path / "repo", + base_ref=None, + ) + + [project] = _FailingLayeredProject.created + assert project.closed is True + + +def test_sidecar_enabled_requires_exact_one(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("COCOINDEX_CODE_SIDECAR", raising=False) + assert sidecar.sidecar_enabled() is False + + monkeypatch.setenv("COCOINDEX_CODE_SIDECAR", "true") + assert sidecar.sidecar_enabled() is False + + monkeypatch.setenv("COCOINDEX_CODE_SIDECAR", "1") + assert sidecar.sidecar_enabled() is True From d08f783ebd2885889790d5bb3da9653a8c42814e Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 11:15:23 +0200 Subject: [PATCH 04/15] Make repo-local config opt-in --- README.md | 25 +++++++-- docker/docker-compose.yml | 64 ++++++++------------- docs/docker-layered-indexing.md | 11 +++- sample/Makefile | 16 +++++- sample/README.md | 30 ++++++++++ sample/bin/ccc | 51 +++++++++-------- sample/docker-compose.yml | 3 +- skills/ccc/references/management.md | 6 +- skills/ccc/references/settings.md | 4 +- src/cocoindex_code/cli.py | 24 ++++++-- src/cocoindex_code/settings.py | 12 ++-- tests/test_cli_helpers.py | 17 ++++++ tests/test_docker_setup.py | 73 ++++++++++++++++++------ tests/test_e2e.py | 87 +++++++++++++++++++++-------- tests/test_settings.py | 16 +++++- 15 files changed, 307 insertions(+), 132 deletions(-) diff --git a/README.md b/README.md index 6409147..e58e908 100644 --- a/README.md +++ b/README.md @@ -157,14 +157,16 @@ Returns matching code chunks with file path, language, code content, line number You can also use the CLI directly — useful for manual control, running indexing after changing settings, checking status, or searching outside an agent. 
```bash -ccc init # initialize project (creates settings) +ccc init # initialize daemon/global settings ccc index # build the index ccc search "authentication logic" # search! ``` The background daemon starts automatically on first use. -> **Tip:** `ccc index` auto-initializes if you haven't run `ccc init` yet, so you can skip straight to indexing. +> **Tip:** once global daemon settings exist, Git repositories can be indexed +> without repo-local `.cocoindex_code/settings.yml`; built-in project defaults +> are used unless you create overrides with `ccc init --project-settings`. For Git repositories, you can configure layered indexing once from the root clone: @@ -180,7 +182,7 @@ Linked worktrees reuse the same daemon-owned base layer and only index branch an | Command | Description | |---------|-------------| -| `ccc init` | Initialize a project — creates settings files, adds `.cocoindex_code/` to `.gitignore` | +| `ccc init` | Initialize daemon/global settings and optional Git overlay metadata | | `ccc index` | Build or update the index (auto-inits if needed). Shows streaming progress. | | `ccc search ` | Semantic search across the codebase | | `ccc status` | Show index stats (chunk count, file count, language breakdown) | @@ -547,7 +549,9 @@ docker build -t cocoindex-code:local -f docker/Dockerfile . ## Configuration -Configuration lives in two YAML files, both created automatically by `ccc init`. +Configuration is daemon-first. `ccc init` creates global daemon settings when +needed and does not write repo-local files by default. Repo-local settings are +optional overrides. ### User Settings (`~/.cocoindex_code/global_settings.yml`) @@ -602,7 +606,9 @@ OpenAI embeddings (`text-embedding-3-*`, `text-embedding-ada-002`) are intention ### Project Settings (`/.cocoindex_code/settings.yml`) -Per-project. Controls which files to index. +Optional per-project overrides. Controls which files to index when present. 
+If this file does not exist, `ccc` uses the built-in defaults without creating +anything in the repository. ```yaml include_patterns: @@ -629,7 +635,14 @@ chunkers: module: example_toml_chunker:toml_chunker ``` -> `.cocoindex_code/` is automatically added to `.gitignore` during init. +Create repo-local overrides explicitly: + +```bash +ccc init --project-settings +``` + +`ccc init` also does not edit `.gitignore` by default. Use +`ccc init --gitignore` if you want it to add `/.cocoindex_code/` for you. Use `chunkers` when you want to control how a file type is split into chunks before indexing. diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5788984..003dc11 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,47 +1,24 @@ -# cocoindex-code — Docker Compose quickstart. +# Central daemon compose file for the sidecar Docker model. # -# # macOS / Windows -# docker compose up -d -# -# # Linux (aligns file ownership with your host user) -# PUID=$(id -u) PGID=$(id -g) docker compose up -d -# -# By default your home directory is mounted into the container as the -# workspace. Set COCOINDEX_HOST_WORKSPACE=/path/to/code to mount a narrower -# directory instead. -# -# Override the image via COCOINDEX_CODE_IMAGE — for example: -# COCOINDEX_CODE_IMAGE=cocoindex/cocoindex-code:full docker compose up -d -# COCOINDEX_CODE_IMAGE=ghcr.io/cocoindex-io/cocoindex-code:latest docker compose up -d -# -# Optional knobs: -# COCOINDEX_CODE_CONTAINER_NAME=my-ccc -# COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state -# COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code -# COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db -# -# For Git layered indexing, mount a workspace parent that contains both the -# root clone and linked worktrees. Keep COCOINDEX_CODE_STATE_DIR on the -# persistent cocoindex-data volume so base/branch layers survive container -# recreation. See docs/docker-layered-indexing.md. 
+# This container does not mount source code. It only owns shared state/runtime +# volumes. Repo access happens through short-lived sidecars started by +# the host wrapper after `ccc init` authorizes a specific Git repo. services: - cocoindex-code: - image: ${COCOINDEX_CODE_IMAGE:-cocoindex/cocoindex-code:latest} - container_name: ${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code} + cocoindex-code-daemon: + image: ${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered} + container_name: ${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon} volumes: - - ${COCOINDEX_HOST_WORKSPACE:-${HOME}}:/workspace - - cocoindex-data:/var/cocoindex + - ${COCOINDEX_CODE_HOST_SETTINGS_DIR:-${HOME}/.cocoindex_code}:/home/coco/.cocoindex_code + - cocoindex-code-local-state:/var/cocoindex + - cocoindex-code-local-runtime:/var/run/cocoindex_code environment: - COCOINDEX_CODE_STATE_DIR: ${COCOINDEX_CODE_STATE_DIR:-/var/cocoindex/state} - COCOINDEX_CODE_RUNTIME_DIR: ${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code} - COCOINDEX_CODE_DB_PATH_MAPPING: ${COCOINDEX_CODE_DB_PATH_MAPPING:-/workspace=/var/cocoindex/db} - # Makes CLI and MCP output show your real paths - # (e.g. `/Users/you/myproject/...`) instead of container paths - # (e.g. `/workspace/myproject/...`). - COCOINDEX_CODE_HOST_PATH_MAPPING: ${COCOINDEX_CODE_HOST_PATH_MAPPING:-/workspace=${COCOINDEX_HOST_WORKSPACE:-${HOME}}} - # Linux only: set these so files written to your workspace are owned by - # you rather than root. Not needed on macOS / Windows — leave empty. 
+ COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765 + COCOINDEX_CODE_DIR: /home/coco/.cocoindex_code + COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state + COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code + COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db + COCOINDEX_CODE_DAEMON_SUPERVISED: "1" PUID: ${PUID:-} PGID: ${PGID:-} healthcheck: @@ -54,6 +31,13 @@ services: timeout: 5s retries: 12 start_period: 10s + networks: + - cocoindex-code-local volumes: - cocoindex-data: + cocoindex-code-local-state: + cocoindex-code-local-runtime: + +networks: + cocoindex-code-local: + name: cocoindex-code-local diff --git a/docs/docker-layered-indexing.md b/docs/docker-layered-indexing.md index cff4112..aa98358 100644 --- a/docs/docker-layered-indexing.md +++ b/docs/docker-layered-indexing.md @@ -54,6 +54,7 @@ Central daemon container: ```text mounts: + $HOME/.cocoindex_code -> /home/coco/.cocoindex_code cocoindex-code-local-state -> /var/cocoindex cocoindex-code-local-runtime -> /var/run/cocoindex_code network: @@ -69,6 +70,7 @@ Sidecar container: ```text mounts: /authorized/repo -> /workspace + $HOME/.cocoindex_code -> /home/coco/.cocoindex_code cocoindex-code-local-state -> /var/cocoindex cocoindex-code-local-runtime -> /var/run/cocoindex_code network: @@ -96,6 +98,12 @@ Docker named volumes: | `cocoindex-code-local-state` | `/var/cocoindex` | Global settings, daemon DB, layer metadata, layer DBs, caches | | `cocoindex-code-local-runtime` | `/var/run/cocoindex_code` | PID/log runtime files | +Host user settings: + +| Host Path | Mounted As | Purpose | +|---|---|---| +| `${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}` | `/home/coco/.cocoindex_code` | Global `ccc` settings shared with the Docker daemon and sidecars | + Reset sample Docker state: ```bash @@ -112,6 +120,7 @@ make reset | `COCOINDEX_CODE_DOCKER_NETWORK` | Private Docker network. Default: `cocoindex-code-local`. | | `COCOINDEX_CODE_STATE_VOLUME` | Shared daemon state named volume. 
Default: `cocoindex-code-local-state`. | | `COCOINDEX_CODE_RUNTIME_VOLUME` | Shared runtime named volume. Default: `cocoindex-code-local-runtime`. | +| `COCOINDEX_CODE_HOST_SETTINGS_DIR` | Host user settings directory mounted into daemon and sidecars. Default: `$HOME/.cocoindex_code`. | | `COCOINDEX_CODE_SAMPLE_DATA_DIR` | Host-side allowlist directory. Default: `sample/data`. | | `PUID`, `PGID` | Linux-only ownership mapping. | @@ -121,7 +130,7 @@ Internal sidecar/daemon variables: |---|---| | `COCOINDEX_CODE_SIDECAR=1` | Tells CLI to run repo-mounted indexing locally in the sidecar. | | `COCOINDEX_CODE_DAEMON_TCP` | TCP daemon address. Central listens on `0.0.0.0:8765`; sidecars connect to the daemon container name. | -| `COCOINDEX_CODE_DIR=/var/cocoindex/config` | Shared global settings location. | +| `COCOINDEX_CODE_DIR=/home/coco/.cocoindex_code` | Container path for the host-mounted global settings directory. | | `COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state` | Durable daemon layer state. | | `COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db` | Keeps layer/project databases on Docker native storage. | diff --git a/sample/Makefile b/sample/Makefile index 7c5498b..3a6e984 100644 --- a/sample/Makefile +++ b/sample/Makefile @@ -1,16 +1,26 @@ IMAGE ?= cocoindex-code:local-layered COMPOSE ?= docker compose -f docker-compose.yml -IMAGE ?= cocoindex-code:local-layered +CCC_VARIANT ?= slim + +.PHONY: build build-local build-pypi up restart ps logs down reset install-ccc-wrapper -.PHONY: build up restart ps logs down reset install-ccc-wrapper +build: build-local -build: +build-local: docker build \ -t "$(IMAGE)" \ -f ../docker/Dockerfile \ + --build-arg CCC_VARIANT="$(CCC_VARIANT)" \ --build-arg CCC_INSTALL_SPEC=/ccc-src \ .. +build-pypi: + docker build \ + -t "$(IMAGE)" \ + -f ../docker/Dockerfile \ + --build-arg CCC_VARIANT="$(CCC_VARIANT)" \ + .. 
+ ps: docker ps --filter 'name=cocoindex-code-local-daemon' diff --git a/sample/README.md b/sample/README.md index f553588..57a1d24 100644 --- a/sample/README.md +++ b/sample/README.md @@ -17,6 +17,14 @@ cd sample make build ``` +`make build` is the local-source build and is equivalent to `make build-local`. +Use `make build-pypi` to build the image using the package install path instead. +Set `CCC_VARIANT=full` if you want the full image with local embedding support: + +```bash +CCC_VARIANT=full make build-local +``` + Initialize and authorize one repo: ```bash @@ -64,6 +72,28 @@ Shared Docker state uses named volumes: - `cocoindex-code-local-state`: central daemon layer/index/config state mounted at `/var/cocoindex` - `cocoindex-code-local-runtime`: daemon PID/log runtime files mounted at `/var/run/cocoindex_code` +User settings are shared from your host account: + +- `${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}` is mounted into both the daemon and sidecars +- inside containers it is read as `COCOINDEX_CODE_DIR=/home/coco/.cocoindex_code` +- `ccc init` therefore writes global settings to your normal host path, for example `/Users/you/.cocoindex_code/global_settings.yml` + +Useful overrides: + +| Variable | Default | +|---|---| +| `COCOINDEX_CODE_IMAGE` | `cocoindex-code:local-layered` | +| `COCOINDEX_CODE_DAEMON_CONTAINER` | `cocoindex-code-local-daemon` | +| `COCOINDEX_CODE_DOCKER_NETWORK` | `cocoindex-code-local` | +| `COCOINDEX_CODE_HOST_SETTINGS_DIR` | `$HOME/.cocoindex_code` | +| `COCOINDEX_CODE_WORKSPACE_DIR` | `/workspace` | +| `COCOINDEX_CODE_CONTAINER_SETTINGS_DIR` | `/home/coco/.cocoindex_code` | +| `COCOINDEX_CODE_CONTAINER_STATE_ROOT` | `/var/cocoindex` | +| `COCOINDEX_CODE_RUNTIME_DIR` | `/var/run/cocoindex_code` | +| `COCOINDEX_CODE_DAEMON_PORT` | `8765` | +| `COCOINDEX_CODE_DAEMON_LISTEN` | `0.0.0.0:$COCOINDEX_CODE_DAEMON_PORT` | +| `COCOINDEX_CODE_DAEMON_CONNECT` | `$COCOINDEX_CODE_DAEMON_CONTAINER:$COCOINDEX_CODE_DAEMON_PORT` | + 
Sidecars talk to the central daemon over the private Docker network `cocoindex-code-local`. The daemon listens on `COCOINDEX_CODE_DAEMON_TCP=0.0.0.0:8765` inside that network; no host port is published. Stop the central daemon container: diff --git a/sample/bin/ccc b/sample/bin/ccc index edd65a6..d045bd6 100755 --- a/sample/bin/ccc +++ b/sample/bin/ccc @@ -7,17 +7,22 @@ script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" sample_dir="$(cd "$script_dir/.." && pwd)" data_dir="${COCOINDEX_CODE_SAMPLE_DATA_DIR:-$sample_dir/data}" registry="$data_dir/authorized-repos.tsv" -state_dir="$data_dir/state" -runtime_dir="$data_dir/runtime" +host_settings_dir="${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}" state_volume="${COCOINDEX_CODE_STATE_VOLUME:-cocoindex-code-local-state}" runtime_volume="${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime}" network="${COCOINDEX_CODE_DOCKER_NETWORK:-cocoindex-code-local}" +workspace_dir="${COCOINDEX_CODE_WORKSPACE_DIR:-/workspace}" +container_settings_dir="${COCOINDEX_CODE_CONTAINER_SETTINGS_DIR:-/home/coco/.cocoindex_code}" +container_state_root="${COCOINDEX_CODE_CONTAINER_STATE_ROOT:-/var/cocoindex}" +container_state_dir="${COCOINDEX_CODE_STATE_DIR:-$container_state_root/state}" +container_runtime_dir="${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code}" +container_db_path_mapping="${COCOINDEX_CODE_DB_PATH_MAPPING:-$workspace_dir=$container_state_root/db}" +daemon_port="${COCOINDEX_CODE_DAEMON_PORT:-8765}" +daemon_listen_addr="${COCOINDEX_CODE_DAEMON_LISTEN:-0.0.0.0:$daemon_port}" +daemon_connect_addr="${COCOINDEX_CODE_DAEMON_CONNECT:-$central_container:$daemon_port}" mkdir -p "$data_dir" - -sha_12() { - printf '%s' "$1" | shasum -a 256 | awk '{print substr($1, 1, 12)}' -} +mkdir -p "$host_settings_dir" canonical_path() { python3 -c 'import pathlib, sys; print(pathlib.Path(sys.argv[1]).resolve())' "$1" @@ -96,13 +101,14 @@ ensure_central_daemon() { run -d --name "$central_container" --network "$network" - 
--volume "$state_volume:/var/cocoindex" - --volume "$runtime_volume:/var/run/cocoindex_code" - -e COCOINDEX_CODE_DAEMON_TCP=0.0.0.0:8765 - -e COCOINDEX_CODE_DIR=/var/cocoindex/config - -e COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state - -e COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code - -e COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db + --volume "$host_settings_dir:$container_settings_dir" + --volume "$state_volume:$container_state_root" + --volume "$runtime_volume:$container_runtime_dir" + -e "COCOINDEX_CODE_DAEMON_TCP=$daemon_listen_addr" + -e "COCOINDEX_CODE_DIR=$container_settings_dir" + -e "COCOINDEX_CODE_STATE_DIR=$container_state_dir" + -e "COCOINDEX_CODE_RUNTIME_DIR=$container_runtime_dir" + -e "COCOINDEX_CODE_DB_PATH_MAPPING=$container_db_path_mapping" -e COCOINDEX_CODE_DAEMON_SUPERVISED=1 ) if [[ -n "${PUID:-}" ]]; then @@ -124,17 +130,18 @@ run_sidecar() { local run_args=( run --rm -i --network "$network" - --volume "$root:/workspace" - --volume "$state_volume:/var/cocoindex" - --volume "$runtime_volume:/var/run/cocoindex_code" - -e "COCOINDEX_CODE_DAEMON_TCP=$central_container:8765" + --volume "$root:$workspace_dir" + --volume "$host_settings_dir:$container_settings_dir" + --volume "$state_volume:$container_state_root" + --volume "$runtime_volume:$container_runtime_dir" + -e "COCOINDEX_CODE_DAEMON_TCP=$daemon_connect_addr" -e COCOINDEX_CODE_SIDECAR=1 -e COCOINDEX_CODE_DAEMON_SUPERVISED=1 - -e COCOINDEX_CODE_DIR=/var/cocoindex/config - -e COCOINDEX_CODE_STATE_DIR=/var/cocoindex/state - -e COCOINDEX_CODE_RUNTIME_DIR=/var/run/cocoindex_code - -e COCOINDEX_CODE_DB_PATH_MAPPING=/workspace=/var/cocoindex/db - -e "COCOINDEX_CODE_HOST_PATH_MAPPING=/workspace=$root" + -e "COCOINDEX_CODE_DIR=$container_settings_dir" + -e "COCOINDEX_CODE_STATE_DIR=$container_state_dir" + -e "COCOINDEX_CODE_RUNTIME_DIR=$container_runtime_dir" + -e "COCOINDEX_CODE_DB_PATH_MAPPING=$container_db_path_mapping" + -e 
"COCOINDEX_CODE_HOST_PATH_MAPPING=$workspace_dir=$root" -e "COCOINDEX_CODE_HOST_CWD=$host_cwd" ) diff --git a/sample/docker-compose.yml b/sample/docker-compose.yml index 3520642..f66c33c 100644 --- a/sample/docker-compose.yml +++ b/sample/docker-compose.yml @@ -9,11 +9,12 @@ services: image: ${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered} container_name: ${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon} volumes: + - ${COCOINDEX_CODE_HOST_SETTINGS_DIR:-${HOME}/.cocoindex_code}:/home/coco/.cocoindex_code - cocoindex-code-local-state:/var/cocoindex - cocoindex-code-local-runtime:/var/run/cocoindex_code environment: COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765 - COCOINDEX_CODE_DIR: /var/cocoindex/config + COCOINDEX_CODE_DIR: /home/coco/.cocoindex_code COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db diff --git a/skills/ccc/references/management.md b/skills/ccc/references/management.md index 75441bb..ea508de 100644 --- a/skills/ccc/references/management.md +++ b/skills/ccc/references/management.md @@ -29,7 +29,7 @@ ccc init **First run (global settings don't exist yet)** — `ccc init` prompts interactively for the embedding provider (sentence-transformers / litellm) and model, then runs a one-off test embed via the daemon to confirm the model works. Accept the defaults for the sentence-transformers path, or pick litellm and enter a model identifier. -**Subsequent runs** (global settings already exist) — prompts are skipped; only project settings and `.gitignore` are set up. +**Subsequent runs** (global settings already exist) — prompts are skipped; optional Git overlay metadata is registered when `--base` is provided. To skip the interactive prompts on the first run (e.g. 
in a script or container), pass `--litellm-model MODEL`: @@ -41,9 +41,9 @@ This is also the only way to pick a LiteLLM model when stdin isn't a TTY and you `ccc init` creates: - `~/.cocoindex_code/global_settings.yml` (user-level, embedding config + env vars). -- `.cocoindex_code/settings.yml` (project-level, include/exclude patterns). +- `.cocoindex_code/settings.yml` (optional project-level include/exclude overrides; create with `ccc init --project-settings`). -If `.git` exists in the directory, `.cocoindex_code/` is automatically added to `.gitignore`. +`ccc init` does not edit `.gitignore` by default. Pass `--gitignore` if you want it to add `/.cocoindex_code/` for you. Use `-f` to skip the confirmation prompt if `ccc init` detects a potential parent project root. diff --git a/skills/ccc/references/settings.md b/skills/ccc/references/settings.md index 2ff07ec..83bc7db 100644 --- a/skills/ccc/references/settings.md +++ b/skills/ccc/references/settings.md @@ -1,6 +1,6 @@ # ccc Settings -Configuration lives in two YAML files, both created automatically by `ccc init`. +Configuration is daemon-first. `ccc init` creates global daemon settings when needed. Repo-local settings are optional overrides created with `ccc init --project-settings`. ## User-Level Settings (`~/.cocoindex_code/global_settings.yml`) @@ -90,7 +90,7 @@ ccc reset && ccc index ## Project-Level Settings (`/.cocoindex_code/settings.yml`) -Per-project. Controls which files to index. Created by `ccc init` and automatically added to `.gitignore`. +Optional per-project overrides. Controls which files to index when present. `ccc init` does not create this file by default; use `ccc init --project-settings` to create it. Use `ccc init --gitignore` if you want the CLI to add `/.cocoindex_code/` to `.gitignore`. 
```yaml include_patterns: diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 95c9859..4b79bdc 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -587,6 +587,16 @@ def init( ), force: bool = _typer.Option(False, "-f", "--force", help="Skip parent directory warning"), base_ref: str | None = _typer.Option(None, "--base", help="Git base ref for overlays"), + project_settings: bool = _typer.Option( + False, + "--project-settings", + help="Create repo-local .cocoindex_code/settings.yml overrides.", + ), + gitignore: bool = _typer.Option( + False, + "--gitignore", + help="Add /.cocoindex_code/ to .gitignore. Not done by default.", + ), ) -> None: """Initialize a project for cocoindex-code.""" cwd = Path.cwd().resolve() @@ -610,6 +620,8 @@ def init( _typer.echo("Project already initialized.") if base_ref is not None: _register_overlay_policy(cwd, base_ref) + if gitignore: + add_to_gitignore(cwd) return # Check parent directories for markers @@ -624,15 +636,17 @@ def init( ) raise _typer.Exit(code=1) - # Create project settings - save_project_settings(cwd, default_project_settings()) - _typer.echo(f"Created project settings: {format_path_for_display(settings_file)}") + if project_settings: + save_project_settings(cwd, default_project_settings()) + _typer.echo(f"Created project settings: {format_path_for_display(settings_file)}") + else: + _typer.echo("Using default project settings. 
Pass --project-settings to create overrides.") if base_ref is not None: _register_overlay_policy(cwd, base_ref) - # Add to .gitignore - add_to_gitignore(cwd) + if gitignore: + add_to_gitignore(cwd) _typer.echo("You can edit the settings files to customize indexing behavior.") _typer.echo("Run `ccc index` to build the index.") diff --git a/src/cocoindex_code/settings.py b/src/cocoindex_code/settings.py index 73b026b..0b89afc 100644 --- a/src/cocoindex_code/settings.py +++ b/src/cocoindex_code/settings.py @@ -322,14 +322,18 @@ def project_settings_path(project_root: Path) -> Path: def find_project_root(start: Path) -> Path | None: - """Walk up from *start* looking for ``.cocoindex_code/settings.yml``. + """Walk up from *start* looking for a project root. - Returns the directory containing it, or ``None``. + A repo-local ``.cocoindex_code/settings.yml`` is a project marker when it + exists. Otherwise a Git root is enough; project settings then fall back to + defaults instead of forcing a repo-local config file. """ current = start.resolve() while True: if (current / _SETTINGS_DIR_NAME / _SETTINGS_FILE_NAME).is_file(): return current + if (current / ".git").exists(): + return current parent = current.parent if parent == current: return None @@ -593,11 +597,11 @@ def save_initial_user_settings( def load_project_settings(project_root: Path) -> ProjectSettings: """Read ``$PROJECT_ROOT/.cocoindex_code/settings.yml``. - Raises ``FileNotFoundError`` if the file does not exist. + Falls back to default settings if no repo-local settings file exists. 
""" path = project_settings_path(project_root) if not path.is_file(): - raise FileNotFoundError(f"Project settings not found: {path}") + return default_project_settings() try: with open(path) as f: data = _yaml.safe_load(f) diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py index 7570b6f..fc2c7bf 100644 --- a/tests/test_cli_helpers.py +++ b/tests/test_cli_helpers.py @@ -33,6 +33,23 @@ def test_require_project_root_success(tmp_path: Path, monkeypatch: pytest.Monkey assert require_project_root() == project +def test_require_project_root_success_for_git_repo_without_local_settings( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + project = tmp_path / "project" + (project / ".git").mkdir(parents=True) + subdir = project / "src" + subdir.mkdir() + monkeypatch.chdir(subdir) + settings_dir = tmp_path / "ccc_home" + settings_dir.mkdir() + (settings_dir / "global_settings.yml").write_text( + "embedding:\n model: test\n provider: litellm\n" + ) + monkeypatch.setenv("COCOINDEX_CODE_DIR", str(settings_dir)) + assert require_project_root() == project + + def test_require_project_root_exits_when_not_initialized( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index ad078f7..8a864f4 100644 --- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -37,25 +37,24 @@ def test_docker_entrypoint_prepares_state_db_cache_and_runtime_dirs() -> None: assert 'exec gosu coco "$@"' in content -def test_docker_compose_exposes_local_use_knobs_and_healthcheck() -> None: +def test_docker_compose_uses_sidecar_daemon_model() -> None: content = (REPO_ROOT / "docker" / "docker-compose.yml").read_text() - assert "${COCOINDEX_CODE_IMAGE:-cocoindex/cocoindex-code:latest}" in content - assert "${COCOINDEX_CODE_CONTAINER_NAME:-cocoindex-code}" in content - assert "${COCOINDEX_HOST_WORKSPACE:-${HOME}}:/workspace" in content - assert "COCOINDEX_CODE_STATE_DIR: 
${COCOINDEX_CODE_STATE_DIR:-/var/cocoindex/state}" in content - assert ( - "COCOINDEX_CODE_RUNTIME_DIR: ${COCOINDEX_CODE_RUNTIME_DIR:-/var/run/cocoindex_code}" - in content - ) - assert ( - "COCOINDEX_CODE_DB_PATH_MAPPING: " - "${COCOINDEX_CODE_DB_PATH_MAPPING:-/workspace=/var/cocoindex/db}" - ) in content + assert "${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered}" in content + assert "${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon}" in content + assert ":/workspace" not in content + assert "ports:" not in content assert ( - "COCOINDEX_CODE_HOST_PATH_MAPPING: " - "${COCOINDEX_CODE_HOST_PATH_MAPPING:-/workspace=${COCOINDEX_HOST_WORKSPACE:-${HOME}}}" + "${COCOINDEX_CODE_HOST_SETTINGS_DIR:-${HOME}/.cocoindex_code}" + ":/home/coco/.cocoindex_code" ) in content + assert "COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765" in content + assert "COCOINDEX_CODE_DIR: /home/coco/.cocoindex_code" in content + assert "COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state" in content + assert "COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code" in content + assert "COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db" in content + assert "cocoindex-code-local-state:/var/cocoindex" in content + assert "cocoindex-code-local-runtime:/var/run/cocoindex_code" in content assert "ccc daemon status" in content assert "daemon.sock" in content @@ -92,6 +91,11 @@ def test_sample_compose_uses_daemon_without_source_mount() -> None: assert ":/workspace" not in content assert "ports:" not in content assert "COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765" in content + assert ( + "${COCOINDEX_CODE_HOST_SETTINGS_DIR:-${HOME}/.cocoindex_code}" + ":/home/coco/.cocoindex_code" + ) in content + assert "COCOINDEX_CODE_DIR: /home/coco/.cocoindex_code" in content assert "cocoindex-code-local-state:/var/cocoindex" in content assert "cocoindex-code-local-runtime:/var/run/cocoindex_code" in content @@ -100,14 +104,42 @@ def test_sample_wrapper_mounts_only_authorized_repo_sidecar() -> None: content = 
(REPO_ROOT / "sample" / "bin" / "ccc").read_text() assert 'record_authorization "$root" "$common_dir"' in content - assert '--volume "$root:/workspace"' in content + assert '--volume "$root:$workspace_dir"' in content + assert '--volume "$host_settings_dir:$container_settings_dir"' in content + assert '--volume "$state_volume:$container_state_root"' in content + assert '--volume "$runtime_volume:$container_runtime_dir"' in content assert '--network "$network"' in content assert "COCOINDEX_CODE_SIDECAR=1" in content assert "COCOINDEX_CODE_DAEMON_SUPERVISED=1" in content - assert 'COCOINDEX_CODE_DAEMON_TCP=$central_container:8765' in content + assert 'COCOINDEX_CODE_DAEMON_TCP=$daemon_connect_addr' in content + assert "COCOINDEX_CODE_DIR=$container_settings_dir" in content + assert "COCOINDEX_CODE_STATE_DIR=$container_state_dir" in content + assert "COCOINDEX_CODE_RUNTIME_DIR=$container_runtime_dir" in content + assert "COCOINDEX_CODE_DB_PATH_MAPPING=$container_db_path_mapping" in content + assert 'COCOINDEX_CODE_HOST_PATH_MAPPING=$workspace_dir=$root' in content assert 'exec docker "${run_args[@]}"' in content +def test_sample_wrapper_defaults_settings_dir_to_host_home() -> None: + content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() + + assert ( + 'host_settings_dir="${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}"' + in content + ) + assert 'workspace_dir="${COCOINDEX_CODE_WORKSPACE_DIR:-/workspace}"' in content + assert ( + 'container_settings_dir="${COCOINDEX_CODE_CONTAINER_SETTINGS_DIR:-' + '/home/coco/.cocoindex_code}"' in content + ) + assert 'daemon_port="${COCOINDEX_CODE_DAEMON_PORT:-8765}"' in content + assert ( + 'daemon_connect_addr="${COCOINDEX_CODE_DAEMON_CONNECT:-' + '$central_container:$daemon_port}"' in content + ) + assert 'mkdir -p "$host_settings_dir"' in content + + def test_sample_wrapper_authorization_handles_nested_repos_and_worktrees() -> None: content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() @@ -136,6 +168,11 
@@ def test_sample_makefile_has_default_image_and_reset_target() -> None: content = (REPO_ROOT / "sample" / "Makefile").read_text() assert "IMAGE ?= cocoindex-code:local-layered" in content + assert "CCC_VARIANT ?= slim" in content + assert "build: build-local" in content + assert "build-local:" in content + assert "--build-arg CCC_INSTALL_SPEC=/ccc-src" in content + assert "build-pypi:" in content assert "reset: down" in content assert "docker volume rm" in content @@ -158,4 +195,4 @@ def test_docker_compose_config_is_valid(tmp_path: Path) -> None: if result.returncode != 0 and "docker daemon" in result.stderr.lower(): pytest.skip("Docker daemon not available") assert result.returncode == 0, result.stderr - assert "cocoindex-code" in result.stdout + assert "cocoindex-code-daemon" in result.stdout diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 2580170..f4aef35 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -152,13 +152,15 @@ def test_session_happy_path(e2e_project: Path) -> None: # Init result = runner.invoke(app, ["init"], catch_exceptions=False) assert result.exit_code == 0, result.output - assert (e2e_project / ".cocoindex_code" / "settings.yml").exists() - assert "Created project settings" in result.output or "settings" in result.output + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() + assert "Using default project settings" in result.output + assert not (e2e_project / ".gitignore").exists() - # Init again — already initialized + # Init again remains non-mutating when repo-local settings are not requested. 
result = runner.invoke(app, ["init"], catch_exceptions=False) assert result.exit_code == 0 - assert "already initialized" in result.output + assert "Using default project settings" in result.output + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() # Index result = runner.invoke(app, ["index"], catch_exceptions=False) @@ -237,8 +239,8 @@ def test_session_reset_databases(e2e_project: Path) -> None: assert result.exit_code == 0 assert "Databases deleted" in result.output - # Settings should still exist - assert (e2e_project / ".cocoindex_code" / "settings.yml").exists() + # Repo-local settings were not created by default. + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() # DB files should be gone assert not (e2e_project / ".cocoindex_code" / "cocoindex.db").exists() @@ -262,10 +264,10 @@ def test_session_reset_databases(e2e_project: Path) -> None: def test_session_reset_all(e2e_project: Path) -> None: """Init → index → reset --all → verify full cleanup → search errors.""" - runner.invoke(app, ["init"], catch_exceptions=False) + runner.invoke(app, ["init", "--gitignore"], catch_exceptions=False) runner.invoke(app, ["index"], catch_exceptions=False) - # .gitignore should have the entry (project has .git dir) + # .gitignore should have the entry only when explicitly requested. gitignore = e2e_project / ".gitignore" assert gitignore.is_file() assert "/.cocoindex_code/" in gitignore.read_text() @@ -281,10 +283,47 @@ def test_session_reset_all(e2e_project: Path) -> None: # .gitignore entry should be removed assert "/.cocoindex_code/" not in gitignore.read_text() - # Search should fail — not initialized - result = runner.invoke(app, ["search", "fibonacci"]) - assert result.exit_code != 0 - assert "ccc init" in result.output + # The Git repo still works with daemon/global defaults; reset only removes + # local databases/settings, not the ability to use default project settings. 
+ result = runner.invoke(app, ["search", "fibonacci"], catch_exceptions=False) + assert result.exit_code == 0, result.output + + +def test_init_gitignore_is_opt_in_for_new_project(e2e_project: Path) -> None: + result = runner.invoke(app, ["init"], catch_exceptions=False) + assert result.exit_code == 0, result.output + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() + assert not (e2e_project / ".gitignore").exists() + + +def test_init_gitignore_flag_adds_entry_for_new_project(e2e_project: Path) -> None: + result = runner.invoke(app, ["init", "--gitignore"], catch_exceptions=False) + assert result.exit_code == 0, result.output + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() + + gitignore = e2e_project / ".gitignore" + assert gitignore.is_file() + assert "/.cocoindex_code/" in gitignore.read_text() + + +def test_init_gitignore_flag_adds_entry_for_existing_project(e2e_project: Path) -> None: + result = runner.invoke(app, ["init"], catch_exceptions=False) + assert result.exit_code == 0, result.output + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() + assert not (e2e_project / ".gitignore").exists() + + result = runner.invoke(app, ["init", "--gitignore"], catch_exceptions=False) + assert result.exit_code == 0, result.output + + gitignore = e2e_project / ".gitignore" + assert gitignore.is_file() + assert "/.cocoindex_code/" in gitignore.read_text() + + +def test_init_project_settings_flag_creates_repo_local_overrides(e2e_project: Path) -> None: + result = runner.invoke(app, ["init", "--project-settings"], catch_exceptions=False) + assert result.exit_code == 0, result.output + assert (e2e_project / ".cocoindex_code" / "settings.yml").exists() def test_session_reset_then_full_reinit(e2e_project: Path) -> None: @@ -301,7 +340,7 @@ def test_session_reset_then_full_reinit(e2e_project: Path) -> None: # Re-init from scratch result = runner.invoke(app, ["init"], catch_exceptions=False) assert result.exit_code 
== 0 - assert (e2e_project / ".cocoindex_code" / "settings.yml").exists() + assert not (e2e_project / ".cocoindex_code" / "settings.yml").exists() # Re-index result = runner.invoke(app, ["index"], catch_exceptions=False) @@ -396,11 +435,11 @@ def test_session_search_refresh() -> None: @pytest.mark.usefixtures("e2e_project") -def test_session_index_not_initialized_errors() -> None: - """Running ``ccc index`` from uninitialized dir should error.""" - result = runner.invoke(app, ["index"]) - assert result.exit_code != 0 - assert "ccc init" in result.output +def test_session_index_uses_git_repo_defaults_without_init() -> None: + """Git repos use default project settings without repo-local init files.""" + result = runner.invoke(app, ["index"], catch_exceptions=False) + assert result.exit_code == 0, result.output + assert "Chunks:" in result.output def test_session_subdirectory_path_default(e2e_project: Path) -> None: @@ -799,9 +838,10 @@ def _fake_doctor_fail( assert "ccc doctor" in combined assert "envs:" in combined - # Settings file was written (not rolled back) and project was initialized. + # Global settings were written (not rolled back). Repo-local settings are + # opt-in and are not created by default. assert user_settings_path().is_file() - assert (e2e_fresh_env / ".cocoindex_code" / "settings.yml").exists() + assert not (e2e_fresh_env / ".cocoindex_code" / "settings.yml").exists() def test_init_rejects_litellm_model_when_settings_exist(e2e_project: Path) -> None: @@ -921,8 +961,8 @@ def test_session_db_path_mapping( result = runner.invoke(app, ["init"], catch_exceptions=False) assert result.exit_code == 0, result.output - # Settings should be in the project dir, NOT the mapped dir - assert (project_dir / ".cocoindex_code" / "settings.yml").exists() + # Repo-local settings are not created by default. 
+ assert not (project_dir / ".cocoindex_code" / "settings.yml").exists() # Index result = runner.invoke(app, ["index"], catch_exceptions=False) @@ -942,8 +982,7 @@ def test_session_db_path_mapping( result = runner.invoke(app, ["reset", "-f"], catch_exceptions=False) assert result.exit_code == 0 assert not (mapped_db_dir / "target_sqlite.db").exists() - # Settings still in place - assert (project_dir / ".cocoindex_code" / "settings.yml").exists() + assert not (project_dir / ".cocoindex_code" / "settings.yml").exists() # --------------------------------------------------------------------------- diff --git a/tests/test_settings.py b/tests/test_settings.py index 6c06af1..1a8df73 100644 --- a/tests/test_settings.py +++ b/tests/test_settings.py @@ -144,9 +144,11 @@ def test_save_default_settings_writes_explicit_embedding() -> None: assert "Snowflake/snowflake-arctic-embed-xs" in content -def test_load_project_settings_missing_file_raises(tmp_path: Path) -> None: - with pytest.raises(FileNotFoundError): - load_project_settings(tmp_path) +def test_load_project_settings_missing_file_returns_defaults(tmp_path: Path) -> None: + loaded = load_project_settings(tmp_path) + defaults = default_project_settings() + assert loaded.include_patterns == defaults.include_patterns + assert loaded.exclude_patterns == defaults.exclude_patterns def test_find_project_root_from_subdirectory(tmp_path: Path) -> None: @@ -171,6 +173,14 @@ def test_find_project_root_returns_none_when_not_initialized(tmp_path: Path) -> assert find_project_root(standalone) is None +def test_find_project_root_from_git_root_without_local_settings(tmp_path: Path) -> None: + project = tmp_path / "project" + (project / ".git").mkdir(parents=True) + subdir = project / "src" + subdir.mkdir() + assert find_project_root(subdir) == project + + def test_find_parent_with_marker_finds_git(tmp_path: Path) -> None: repo = tmp_path / "repo" (repo / ".git").mkdir(parents=True) From f79680806a5fe7d279d6b310df41f7296a979d5a Mon Sep 
17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 11:25:13 +0200 Subject: [PATCH 05/15] Promote Docker sidecar helpers to root --- sample/Makefile => Makefile | 14 ++--- README.md | 4 +- {sample/bin => bin}/ccc | 8 +-- docs/docker-layered-indexing.md | 40 +++++++----- sample/.gitignore | 1 - sample/README.md | 104 -------------------------------- sample/docker-compose.yml | 43 ------------- tests/test_docker_setup.py | 36 +++++------ 8 files changed, 54 insertions(+), 196 deletions(-) rename sample/Makefile => Makefile (86%) rename {sample/bin => bin}/ccc (96%) mode change 100755 => 100644 delete mode 100644 sample/.gitignore delete mode 100644 sample/README.md delete mode 100644 sample/docker-compose.yml diff --git a/sample/Makefile b/Makefile similarity index 86% rename from sample/Makefile rename to Makefile index 3a6e984..cad69df 100644 --- a/sample/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ IMAGE ?= cocoindex-code:local-layered -COMPOSE ?= docker compose -f docker-compose.yml +COMPOSE ?= docker compose -f docker/docker-compose.yml CCC_VARIANT ?= slim +CCC_WRAPPER ?= bin/ccc .PHONY: build build-local build-pypi up restart ps logs down reset install-ccc-wrapper @@ -9,17 +10,17 @@ build: build-local build-local: docker build \ -t "$(IMAGE)" \ - -f ../docker/Dockerfile \ + -f docker/Dockerfile \ --build-arg CCC_VARIANT="$(CCC_VARIANT)" \ --build-arg CCC_INSTALL_SPEC=/ccc-src \ - .. + . build-pypi: docker build \ -t "$(IMAGE)" \ - -f ../docker/Dockerfile \ + -f docker/Dockerfile \ --build-arg CCC_VARIANT="$(CCC_VARIANT)" \ - .. + . 
ps: docker ps --filter 'name=cocoindex-code-local-daemon' @@ -43,9 +44,8 @@ reset: down "$${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime}" \ 2>/dev/null || true docker network rm "$${COCOINDEX_CODE_DOCKER_NETWORK:-cocoindex-code-local}" 2>/dev/null || true - rm -rf data install-ccc-wrapper: mkdir -p "$(HOME)/.local/bin" - cp "bin/ccc" "$(HOME)/.local/bin/ccc" + cp "$(CCC_WRAPPER)" "$(HOME)/.local/bin/ccc" chmod +x "$(HOME)/.local/bin/ccc" diff --git a/README.md b/README.md index e58e908..ddf30bb 100644 --- a/README.md +++ b/README.md @@ -247,9 +247,9 @@ The rest of this section uses `:latest` — substitute `:full` in the `image:` / Build the branch-local image and install/use the sidecar wrapper: ```bash -cd sample +cd /path/to/cocoindex-code make build -make install-ccc-wrapper # optional; otherwise call sample/bin/ccc directly +make install-ccc-wrapper # optional; otherwise call bin/ccc directly ``` Authorize and index exactly one repo: diff --git a/sample/bin/ccc b/bin/ccc old mode 100755 new mode 100644 similarity index 96% rename from sample/bin/ccc rename to bin/ccc index d045bd6..e521358 --- a/sample/bin/ccc +++ b/bin/ccc @@ -3,11 +3,9 @@ set -euo pipefail image="${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered}" central_container="${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon}" -script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -sample_dir="$(cd "$script_dir/.." 
&& pwd)" -data_dir="${COCOINDEX_CODE_SAMPLE_DATA_DIR:-$sample_dir/data}" -registry="$data_dir/authorized-repos.tsv" host_settings_dir="${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}" +data_dir="${COCOINDEX_CODE_WRAPPER_DATA_DIR:-$host_settings_dir/docker-sidecar}" +registry="$data_dir/authorized-repos.tsv" state_volume="${COCOINDEX_CODE_STATE_VOLUME:-cocoindex-code-local-state}" runtime_volume="${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime}" network="${COCOINDEX_CODE_DOCKER_NETWORK:-cocoindex-code-local}" @@ -84,7 +82,7 @@ common_dir_for_authorized_root() { ensure_image_exists() { if ! docker image inspect "$image" >/dev/null 2>&1; then echo "Docker image '$image' does not exist." >&2 - echo "Build it with: cd '$sample_dir' && make build" >&2 + echo "Build it from the cocoindex-code checkout with: make build-local" >&2 exit 1 fi } diff --git a/docs/docker-layered-indexing.md b/docs/docker-layered-indexing.md index aa98358..ff42bd1 100644 --- a/docs/docker-layered-indexing.md +++ b/docs/docker-layered-indexing.md @@ -12,28 +12,36 @@ The intended Docker architecture is: Do not mount `$HOME` or a broad source tree just to make indexing work. -## Repo-Scoped Sample +## Repo-Scoped Wrapper Build the branch-local image: ```bash -cd sample +cd /path/to/cocoindex-code make build ``` +Install the wrapper as `ccc`: + +```bash +make install-ccc-wrapper +``` + +Or run it directly from the checkout with `/path/to/cocoindex-code/bin/ccc`. + Authorize one repo and register its base ref: ```bash cd /path/to/repo -/path/to/cocoindex-code/sample/bin/ccc init --base main +ccc init --base main ``` Then index and search: ```bash -/path/to/cocoindex-code/sample/bin/ccc index -/path/to/cocoindex-code/sample/bin/ccc search "query planner" -/path/to/cocoindex-code/sample/bin/ccc overlay status +ccc index +ccc search "query planner" +ccc overlay status ``` The wrapper refuses to run outside an authorized repo. 
Running `ccc init` from another repo authorizes that repo separately. Source access is granted only to the short-lived sidecar for that repo. @@ -42,8 +50,8 @@ Linked worktrees must also be authorized explicitly: ```bash cd /path/to/repo.worktrees/feature-1 -/path/to/cocoindex-code/sample/bin/ccc init --base main -/path/to/cocoindex-code/sample/bin/ccc index +ccc init --base main +ccc index ``` When linked worktrees share the same Git common directory, they can share daemon layer state while each sidecar still mounts only the initialized checkout. @@ -85,10 +93,10 @@ Indexing runs in the sidecar because it is the process with Git/source access. T ## State -Host-side sample metadata: +Host-side wrapper metadata: ```text -sample/data/authorized-repos.tsv +$HOME/.cocoindex_code/docker-sidecar/authorized-repos.tsv ``` Docker named volumes: @@ -104,10 +112,10 @@ Host user settings: |---|---|---| | `${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}` | `/home/coco/.cocoindex_code` | Global `ccc` settings shared with the Docker daemon and sidecars | -Reset sample Docker state: +Reset Docker state: ```bash -cd sample +cd /path/to/cocoindex-code make reset ``` @@ -121,7 +129,7 @@ make reset | `COCOINDEX_CODE_STATE_VOLUME` | Shared daemon state named volume. Default: `cocoindex-code-local-state`. | | `COCOINDEX_CODE_RUNTIME_VOLUME` | Shared runtime named volume. Default: `cocoindex-code-local-runtime`. | | `COCOINDEX_CODE_HOST_SETTINGS_DIR` | Host user settings directory mounted into daemon and sidecars. Default: `$HOME/.cocoindex_code`. | -| `COCOINDEX_CODE_SAMPLE_DATA_DIR` | Host-side allowlist directory. Default: `sample/data`. | +| `COCOINDEX_CODE_WRAPPER_DATA_DIR` | Host-side allowlist directory. Default: `$HOME/.cocoindex_code/docker-sidecar`. | | `PUID`, `PGID` | Linux-only ownership mapping. 
| Internal sidecar/daemon variables: @@ -139,7 +147,7 @@ Internal sidecar/daemon variables: Check the central daemon: ```bash -cd sample +cd /path/to/cocoindex-code make ps make logs ``` @@ -148,8 +156,8 @@ Check through a repo-authorized sidecar: ```bash cd /path/to/repo -/path/to/cocoindex-code/sample/bin/ccc daemon status -/path/to/cocoindex-code/sample/bin/ccc overlay status +ccc daemon status +ccc overlay status ``` Inspect named volume contents: diff --git a/sample/.gitignore b/sample/.gitignore deleted file mode 100644 index 8fce603..0000000 --- a/sample/.gitignore +++ /dev/null @@ -1 +0,0 @@ -data/ diff --git a/sample/README.md b/sample/README.md deleted file mode 100644 index 57a1d24..0000000 --- a/sample/README.md +++ /dev/null @@ -1,104 +0,0 @@ -# Repo-Scoped Docker Sample - -This sample runs CocoIndex Code in Docker without mounting your home directory or a broad source tree. - -The wrapper grants access on demand: - -1. `ccc init` must be run inside a Git repository. -2. The wrapper records that Git root as authorized. -3. It starts one central daemon container with only shared state/runtime volumes. -4. Each `ccc` invocation runs a short-lived sidecar with only that repository mounted at `/workspace`. -5. Later commands only run when your current directory is inside an authorized repo. - -Build the image from this branch: - -```bash -cd sample -make build -``` - -`make build` is the local-source build and is equivalent to `make build-local`. -Use `make build-pypi` to build the image using the package install path instead. 
-Set `CCC_VARIANT=full` if you want the full image with local embedding support: - -```bash -CCC_VARIANT=full make build-local -``` - -Initialize and authorize one repo: - -```bash -cd /path/to/repo -/path/to/cocoindex-code/sample/bin/ccc init --base main -``` - -Index and search from the same repo: - -```bash -/path/to/cocoindex-code/sample/bin/ccc index -/path/to/cocoindex-code/sample/bin/ccc search "query" -/path/to/cocoindex-code/sample/bin/ccc overlay status -``` - -Install the wrapper globally if desired: - -```bash -cd /path/to/cocoindex-code/sample -make install-ccc-wrapper -``` - -Then use it as: - -```bash -cd /path/to/repo -ccc init --base main -ccc index -``` - -Linked worktrees must be authorized separately by running `ccc init` from that worktree. They share layer state when they share the same Git common directory, but each sidecar only receives access to the worktree you initialized. - -```bash -cd /path/to/repo.worktrees/feature-1 -ccc init --base main -ccc index -``` - -State is stored under `sample/data/`: - -- `authorized-repos.tsv`: host-side allowlist written by the wrapper - -Shared Docker state uses named volumes: - -- `cocoindex-code-local-state`: central daemon layer/index/config state mounted at `/var/cocoindex` -- `cocoindex-code-local-runtime`: daemon PID/log runtime files mounted at `/var/run/cocoindex_code` - -User settings are shared from your host account: - -- `${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}` is mounted into both the daemon and sidecars -- inside containers it is read as `COCOINDEX_CODE_DIR=/home/coco/.cocoindex_code` -- `ccc init` therefore writes global settings to your normal host path, for example `/Users/you/.cocoindex_code/global_settings.yml` - -Useful overrides: - -| Variable | Default | -|---|---| -| `COCOINDEX_CODE_IMAGE` | `cocoindex-code:local-layered` | -| `COCOINDEX_CODE_DAEMON_CONTAINER` | `cocoindex-code-local-daemon` | -| `COCOINDEX_CODE_DOCKER_NETWORK` | `cocoindex-code-local` | -| 
`COCOINDEX_CODE_HOST_SETTINGS_DIR` | `$HOME/.cocoindex_code` | -| `COCOINDEX_CODE_WORKSPACE_DIR` | `/workspace` | -| `COCOINDEX_CODE_CONTAINER_SETTINGS_DIR` | `/home/coco/.cocoindex_code` | -| `COCOINDEX_CODE_CONTAINER_STATE_ROOT` | `/var/cocoindex` | -| `COCOINDEX_CODE_RUNTIME_DIR` | `/var/run/cocoindex_code` | -| `COCOINDEX_CODE_DAEMON_PORT` | `8765` | -| `COCOINDEX_CODE_DAEMON_LISTEN` | `0.0.0.0:$COCOINDEX_CODE_DAEMON_PORT` | -| `COCOINDEX_CODE_DAEMON_CONNECT` | `$COCOINDEX_CODE_DAEMON_CONTAINER:$COCOINDEX_CODE_DAEMON_PORT` | - -Sidecars talk to the central daemon over the private Docker network `cocoindex-code-local`. The daemon listens on `COCOINDEX_CODE_DAEMON_TCP=0.0.0.0:8765` inside that network; no host port is published. - -Stop the central daemon container: - -```bash -cd sample -make down -``` diff --git a/sample/docker-compose.yml b/sample/docker-compose.yml deleted file mode 100644 index f66c33c..0000000 --- a/sample/docker-compose.yml +++ /dev/null @@ -1,43 +0,0 @@ -# Central daemon compose file for the sidecar Docker model. -# -# This container does not mount source code. It only owns shared state/runtime -# volumes. Repo access happens through short-lived sidecars started by -# `sample/bin/ccc` after `ccc init` authorizes a specific Git repo. 
- -services: - cocoindex-code-daemon: - image: ${COCOINDEX_CODE_IMAGE:-cocoindex-code:local-layered} - container_name: ${COCOINDEX_CODE_DAEMON_CONTAINER:-cocoindex-code-local-daemon} - volumes: - - ${COCOINDEX_CODE_HOST_SETTINGS_DIR:-${HOME}/.cocoindex_code}:/home/coco/.cocoindex_code - - cocoindex-code-local-state:/var/cocoindex - - cocoindex-code-local-runtime:/var/run/cocoindex_code - environment: - COCOINDEX_CODE_DAEMON_TCP: 0.0.0.0:8765 - COCOINDEX_CODE_DIR: /home/coco/.cocoindex_code - COCOINDEX_CODE_STATE_DIR: /var/cocoindex/state - COCOINDEX_CODE_RUNTIME_DIR: /var/run/cocoindex_code - COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db - COCOINDEX_CODE_DAEMON_SUPERVISED: "1" - PUID: ${PUID:-} - PGID: ${PGID:-} - healthcheck: - test: - [ - "CMD-SHELL", - "ccc daemon status >/dev/null 2>&1 || test -S /var/run/cocoindex_code/daemon.sock", - ] - interval: 10s - timeout: 5s - retries: 12 - start_period: 10s - networks: - - cocoindex-code-local - -volumes: - cocoindex-code-local-state: - cocoindex-code-local-runtime: - -networks: - cocoindex-code-local: - name: cocoindex-code-local diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index 8a864f4..1a6769c 100644 --- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -85,8 +85,8 @@ def test_docker_sidecar_docs_describe_repo_scoped_architecture() -> None: assert "COCOINDEX_CODE_SIDECAR=1" in content -def test_sample_compose_uses_daemon_without_source_mount() -> None: - content = (REPO_ROOT / "sample" / "docker-compose.yml").read_text() +def test_official_compose_uses_daemon_without_source_mount() -> None: + content = (REPO_ROOT / "docker" / "docker-compose.yml").read_text() assert ":/workspace" not in content assert "ports:" not in content @@ -100,8 +100,8 @@ def test_sample_compose_uses_daemon_without_source_mount() -> None: assert "cocoindex-code-local-runtime:/var/run/cocoindex_code" in content -def test_sample_wrapper_mounts_only_authorized_repo_sidecar() -> None: - 
content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() +def test_wrapper_mounts_only_authorized_repo_sidecar() -> None: + content = (REPO_ROOT / "bin" / "ccc").read_text() assert 'record_authorization "$root" "$common_dir"' in content assert '--volume "$root:$workspace_dir"' in content @@ -120,13 +120,17 @@ def test_sample_wrapper_mounts_only_authorized_repo_sidecar() -> None: assert 'exec docker "${run_args[@]}"' in content -def test_sample_wrapper_defaults_settings_dir_to_host_home() -> None: - content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() +def test_wrapper_defaults_settings_dir_to_host_home() -> None: + content = (REPO_ROOT / "bin" / "ccc").read_text() assert ( 'host_settings_dir="${COCOINDEX_CODE_HOST_SETTINGS_DIR:-$HOME/.cocoindex_code}"' in content ) + assert ( + 'data_dir="${COCOINDEX_CODE_WRAPPER_DATA_DIR:-$host_settings_dir/docker-sidecar}"' + in content + ) assert 'workspace_dir="${COCOINDEX_CODE_WORKSPACE_DIR:-/workspace}"' in content assert ( 'container_settings_dir="${COCOINDEX_CODE_CONTAINER_SETTINGS_DIR:-' @@ -140,8 +144,8 @@ def test_sample_wrapper_defaults_settings_dir_to_host_home() -> None: assert 'mkdir -p "$host_settings_dir"' in content -def test_sample_wrapper_authorization_handles_nested_repos_and_worktrees() -> None: - content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() +def test_wrapper_authorization_handles_nested_repos_and_worktrees() -> None: + content = (REPO_ROOT / "bin" / "ccc").read_text() assert 'if (( ${#root} > ${#best} )); then' in content assert 'git_common_dir_for()' in content @@ -150,27 +154,23 @@ def test_sample_wrapper_authorization_handles_nested_repos_and_worktrees() -> No assert '--volume "$common_dir:$common_dir:ro"' in content -def test_sample_wrapper_refuses_unauthorized_paths_and_requires_git_for_init() -> None: - content = (REPO_ROOT / "sample" / "bin" / "ccc").read_text() +def test_wrapper_refuses_unauthorized_paths_and_requires_git_for_init() -> None: + content = (REPO_ROOT / "bin" / 
"ccc").read_text() assert "ccc init must be run inside a Git repository for Docker authorization." in content assert "This path has not been authorized for Docker-backed ccc access:" in content assert "Run ccc init from the Git repo root or a subdirectory first." in content -def test_sample_gitignore_excludes_runtime_authorization_state() -> None: - content = (REPO_ROOT / "sample" / ".gitignore").read_text() - - assert "data/" in content - - -def test_sample_makefile_has_default_image_and_reset_target() -> None: - content = (REPO_ROOT / "sample" / "Makefile").read_text() +def test_makefile_has_default_image_and_reset_target() -> None: + content = (REPO_ROOT / "Makefile").read_text() assert "IMAGE ?= cocoindex-code:local-layered" in content assert "CCC_VARIANT ?= slim" in content + assert "CCC_WRAPPER ?= bin/ccc" in content assert "build: build-local" in content assert "build-local:" in content + assert "-f docker/Dockerfile" in content assert "--build-arg CCC_INSTALL_SPEC=/ccc-src" in content assert "build-pypi:" in content assert "reset: down" in content From 93826dc9247ee020c61c42a0142de7be8e63ed79 Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 11:25:22 +0200 Subject: [PATCH 06/15] Make sidecar wrapper executable --- bin/ccc | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/ccc diff --git a/bin/ccc b/bin/ccc old mode 100644 new mode 100755 From 73ff3e720b5d637769d5f0fac8f8f344dd9084b5 Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 17:43:10 +0200 Subject: [PATCH 07/15] Fix Docker sidecar state reuse --- Makefile | 10 +++ bin/ccc | 41 ++++++++- docker/docker-compose.yml | 4 + pyproject.toml | 1 + src/cocoindex_code/cli.py | 46 +++++++++- src/cocoindex_code/layered_project.py | 29 +++++-- src/cocoindex_code/layers/layer_stack.py | 5 +- src/cocoindex_code/sidecar.py | 86 ++++++++++++++++++- .../version_control/__init__.py | 8 +- src/cocoindex_code/version_control/git.py | 70 
++++++++++++++- tests/test_cli_helpers.py | 79 ++++++++++++++++- tests/test_docker_setup.py | 18 ++++ tests/test_git_layers.py | 36 ++++++++ uv.lock | 46 ++++++++++ 14 files changed, 460 insertions(+), 19 deletions(-) diff --git a/Makefile b/Makefile index cad69df..b0e656b 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,9 @@ IMAGE ?= cocoindex-code:local-layered COMPOSE ?= docker compose -f docker/docker-compose.yml CCC_VARIANT ?= slim CCC_WRAPPER ?= bin/ccc +DAEMON_CONTAINER ?= cocoindex-code-local-daemon +STATE_VOLUME ?= cocoindex-code-local-state +RUNTIME_VOLUME ?= cocoindex-code-local-runtime .PHONY: build build-local build-pypi up restart ps logs down reset install-ccc-wrapper @@ -26,6 +29,13 @@ ps: docker ps --filter 'name=cocoindex-code-local-daemon' up: + docker volume inspect "$(STATE_VOLUME)" >/dev/null 2>&1 || docker volume create "$(STATE_VOLUME)" >/dev/null + docker volume inspect "$(RUNTIME_VOLUME)" >/dev/null 2>&1 || docker volume create "$(RUNTIME_VOLUME)" >/dev/null + @if docker inspect "$(DAEMON_CONTAINER)" >/dev/null 2>&1 && \ + [ -z "$$(docker inspect -f '{{ index .Config.Labels "com.docker.compose.project" }}' "$(DAEMON_CONTAINER)" 2>/dev/null)" ]; then \ + echo "Removing non-Compose daemon container $(DAEMON_CONTAINER) before compose up"; \ + docker rm -f "$(DAEMON_CONTAINER)" >/dev/null; \ + fi COCOINDEX_CODE_IMAGE="$(IMAGE)" $(COMPOSE) up -d restart: diff --git a/bin/ccc b/bin/ccc index e521358..e8c1c37 100755 --- a/bin/ccc +++ b/bin/ccc @@ -87,10 +87,49 @@ ensure_image_exists() { fi } +container_has_env() { + local expected="$1" + docker inspect -f '{{range .Config.Env}}{{println .}}{{end}}' "$central_container" \ + 2>/dev/null | grep -Fx -- "$expected" >/dev/null +} + +container_has_volume_mount() { + local destination="$1" + local volume_name="$2" + docker inspect -f '{{range .Mounts}}{{.Destination}}{{"\t"}}{{.Type}}{{"\t"}}{{.Name}}{{"\n"}}{{end}}' \ + "$central_container" 2>/dev/null | awk -F '\t' \ + -v destination="$destination" -v 
volume_name="$volume_name" \ + '$1 == destination && $2 == "volume" && $3 == volume_name { found = 1 } END { exit !found }' +} + +container_has_bind_mount() { + local destination="$1" + docker inspect -f '{{range .Mounts}}{{.Destination}}{{"\t"}}{{.Type}}{{"\n"}}{{end}}' \ + "$central_container" 2>/dev/null | awk -F '\t' \ + -v destination="$destination" \ + '$1 == destination && $2 == "bind" { found = 1 } END { exit !found }' +} + +daemon_container_matches_expected() { + [[ "$(docker inspect -f '{{.Config.Image}}' "$central_container" 2>/dev/null || true)" == "$image" ]] || return 1 + container_has_bind_mount "$container_settings_dir" || return 1 + container_has_volume_mount "$container_state_root" "$state_volume" || return 1 + container_has_volume_mount "$container_runtime_dir" "$runtime_volume" || return 1 + container_has_env "COCOINDEX_CODE_DAEMON_TCP=$daemon_listen_addr" || return 1 + container_has_env "COCOINDEX_CODE_DIR=$container_settings_dir" || return 1 + container_has_env "COCOINDEX_CODE_STATE_DIR=$container_state_dir" || return 1 + container_has_env "COCOINDEX_CODE_RUNTIME_DIR=$container_runtime_dir" || return 1 + container_has_env "COCOINDEX_CODE_DB_PATH_MAPPING=$container_db_path_mapping" || return 1 +} + ensure_central_daemon() { docker network inspect "$network" >/dev/null 2>&1 || docker network create "$network" >/dev/null if [[ "$(docker inspect -f '{{.State.Running}}' "$central_container" 2>/dev/null || true)" == "true" ]]; then - return + if daemon_container_matches_expected; then + return + fi + echo "Recreating incompatible cocoindex-code daemon container: $central_container" >&2 + docker rm -f "$central_container" >/dev/null fi if docker inspect "$central_container" >/dev/null 2>&1; then docker rm "$central_container" >/dev/null diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 003dc11..6fb1ed9 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -36,7 +36,11 @@ services: volumes: 
cocoindex-code-local-state: + name: ${COCOINDEX_CODE_STATE_VOLUME:-cocoindex-code-local-state} + external: true cocoindex-code-local-runtime: + name: ${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime} + external: true networks: cocoindex-code-local: diff --git a/pyproject.toml b/pyproject.toml index e429ea2..f3f0c6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "pygit2>=1.19.0", "pyyaml>=6.0", "questionary>=2.0.0", + "botocore>=1.34.0", ] [project.optional-dependencies] diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 4b79bdc..3b8e936 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -19,6 +19,7 @@ ProjectStatusResponse, SearchResponse, ) + from .sidecar import SidecarIndexReport from .settings import ( DEFAULT_ST_MODEL, @@ -190,6 +191,38 @@ def print_index_stats(status: ProjectStatusResponse) -> None: _typer.echo(f" {lang}: {count} chunks") +def _short_hash(value: str | None) -> str: + if value is None: + return "-" + return value[:12] + + +def print_layered_index_report(report: SidecarIndexReport) -> None: + _typer.echo("\nLayered index:") + _typer.echo(f" Repo ID: {report.repo_id or '-'}") + _typer.echo(f" Project: {format_path_for_display(report.project_root)}") + _typer.echo(f" Worktree: {format_path_for_display(report.cwd)}") + _typer.echo(f" Branch: {report.branch or '-'}") + _typer.echo(f" Base: {report.base_ref or '-'} @ {_short_hash(report.base_commit)}") + _typer.echo(f" Head: {_short_hash(report.head_commit)}") + _typer.echo(" Layers:") + for layer in report.layers: + action = "built" if layer.built else "reused" + diff = f"diff={layer.affected_count} paths" + if layer.tombstoned_count: + diff += f", tombstones={layer.tombstoned_count}" + previous = layer.previous_commit or layer.merge_base + _typer.echo( + " " + f"{layer.kind:<6} {action:<6} " + f"id={layer.layer_id} " + f"ref={layer.ref_name or '-'} " + f"prev={_short_hash(previous)} " + 
f"commit={_short_hash(layer.commit)} " + f"{diff}" + ) + + def print_search_results(response: SearchResponse) -> None: """Print formatted search results.""" if not response.success: @@ -211,7 +244,7 @@ def _run_index_with_progress( *, cwd: str | None = None, base_ref: str | None = None, -) -> None: +) -> SidecarIndexReport | None: """Run indexing with streaming progress display. Exits on failure.""" from rich.console import Console as _Console from rich.live import Live as _Live @@ -223,6 +256,7 @@ def _run_index_with_progress( err_console = _Console(stderr=True) last_progress_line: str | None = None + sidecar_report: SidecarIndexReport | None = None with _Live(_Spinner("dots", "Indexing..."), console=err_console, transient=True) as live: @@ -241,7 +275,7 @@ def _on_progress(progress: IndexingProgress) -> None: try: if sidecar_enabled(): - asyncio.run( + sidecar_report = asyncio.run( run_sidecar_index( project_root=Path(project_root), cwd=Path(cwd) if cwd is not None else Path(project_root), @@ -273,6 +307,7 @@ def _on_progress(progress: IndexingProgress) -> None: if not resp.success: _typer.echo(f"Indexing failed: {resp.message}", err=True) raise _typer.Exit(code=1) + return sidecar_report def _search_with_wait_spinner( @@ -660,12 +695,17 @@ def index( ) -> None: """Create/update index for the codebase.""" from . 
import client as _client + from .sidecar import sidecar_enabled project_root_path = require_project_root_from(cwd.resolve() if cwd is not None else None) project_root = str(project_root_path) request_cwd = str(cwd.resolve()) if cwd is not None else None print_project_header(project_root) - _run_index_with_progress(project_root, cwd=request_cwd, base_ref=base_ref) + sidecar_report = _run_index_with_progress(project_root, cwd=request_cwd, base_ref=base_ref) + if sidecar_enabled(): + if sidecar_report is not None: + print_layered_index_report(sidecar_report) + return print_index_stats(_client.project_status(project_root)) diff --git a/src/cocoindex_code/layered_project.py b/src/cocoindex_code/layered_project.py index bc48658..9dce993 100644 --- a/src/cocoindex_code/layered_project.py +++ b/src/cocoindex_code/layered_project.py @@ -18,7 +18,7 @@ ) from .settings import load_project_settings from .shared import Embedder -from .version_control import resolve_worktree +from .version_control import remote_tracking_ref_for_local_branch, resolve_worktree def _sha_short(value: str) -> str: @@ -158,9 +158,16 @@ async def ensure_layer_ids( self, on_progress: Callable[[IndexingProgress], None] | None = None, ) -> list[str]: + layers = await self.ensure_layer_results(on_progress=on_progress) + return [layer.layer.id for layer in layers] + + async def ensure_layer_results( + self, + on_progress: Callable[[IndexingProgress], None] | None = None, + ) -> list[LayerBuildResult]: layers = await self._ensure_layers(on_progress=on_progress) self._last_layers = layers - return [layer.layer.id for layer in layers] + return layers def get_status(self) -> ProjectStatusResponse: total_chunks = 0 @@ -208,10 +215,20 @@ async def _ensure_layers( worktree = resolve_worktree(self.cwd, base_ref=self.base_ref, index_config_hash=config_hash) if self.base_ref is None: stored_base_ref = self.store.get_overlay_base_ref(worktree.repository.id) - if stored_base_ref is not None and stored_base_ref != 
worktree.branch.base_ref: - worktree = resolve_worktree( - self.cwd, base_ref=stored_base_ref, index_config_hash=config_hash - ) + if stored_base_ref is not None: + remote_base_ref = remote_tracking_ref_for_local_branch(self.cwd, stored_base_ref) + if remote_base_ref is not None and remote_base_ref != stored_base_ref: + worktree = resolve_worktree( + self.cwd, base_ref=remote_base_ref, index_config_hash=config_hash + ) + self.store.upsert_overlay_policy( + repo_id=worktree.repository.id, base_ref=remote_base_ref + ) + stored_base_ref = remote_base_ref + if stored_base_ref != worktree.branch.base_ref: + worktree = resolve_worktree( + self.cwd, base_ref=stored_base_ref, index_config_hash=config_hash + ) return await self._stack.ensure( worktree=worktree, config_hash=config_hash, diff --git a/src/cocoindex_code/layers/layer_stack.py b/src/cocoindex_code/layers/layer_stack.py index cf0855f..a2c1996 100644 --- a/src/cocoindex_code/layers/layer_stack.py +++ b/src/cocoindex_code/layers/layer_stack.py @@ -39,6 +39,7 @@ class LayerBuildResult: layer: Layer manifest: LayerManifest runtime: LayerRuntime + built: bool = False @property def record(self) -> Layer: @@ -255,11 +256,13 @@ async def _ensure_layer( ) -> LayerBuildResult: paths = LayerPaths.for_layer(self.state_dir, worktree.repository.id, layer_id) existing = self.store.get_layer(layer_id) + built = False if ( existing is None or existing.status != "ready" or not paths.target_sqlite.exists() ): + built = True shutil.rmtree(paths.root, ignore_errors=True) paths.source.mkdir(parents=True, exist_ok=True) paths.db_dir.mkdir(parents=True, exist_ok=True) @@ -294,7 +297,7 @@ async def _ensure_layer( if manifest is None: raise RuntimeError(f"Layer manifest missing after build: {layer_id}") runtime = await self._runtime(layer) - return LayerBuildResult(layer=layer, manifest=manifest, runtime=runtime) + return LayerBuildResult(layer=layer, manifest=manifest, runtime=runtime, built=built) def _require_layer(self, layer_id: str) 
-> Layer: layer = self.store.get_layer(layer_id) diff --git a/src/cocoindex_code/sidecar.py b/src/cocoindex_code/sidecar.py index 9d25891..38edc5b 100644 --- a/src/cocoindex_code/sidecar.py +++ b/src/cocoindex_code/sidecar.py @@ -2,6 +2,7 @@ import os from collections.abc import Callable +from dataclasses import dataclass from pathlib import Path from ._daemon_paths import daemon_state_dir @@ -9,6 +10,7 @@ from .embedder_params import resolve_embedder_params from .layer_store import LayerStore from .layered_project import LayeredProject +from .layers import LayerBuildResult from .protocol import IndexingProgress from .settings import load_project_settings, load_user_settings from .shared import create_embedder @@ -18,6 +20,67 @@ def sidecar_enabled() -> bool: return os.environ.get("COCOINDEX_CODE_SIDECAR") == "1" +@dataclass(frozen=True) +class SidecarLayerSummary: + layer_id: str + kind: str + ref_name: str | None + commit: str | None + previous_commit: str | None + merge_base: str | None + base_layer_id: str | None + status: str + built: bool + affected_count: int + tombstoned_count: int + + +@dataclass(frozen=True) +class SidecarIndexReport: + project_root: Path + cwd: Path + repo_id: str | None + branch: str | None + base_ref: str | None + base_commit: str | None + head_commit: str | None + layers: tuple[SidecarLayerSummary, ...] 
+ + +def _summarize_layers( + *, project_root: Path, cwd: Path, layers: list[LayerBuildResult] +) -> SidecarIndexReport: + summaries = tuple( + SidecarLayerSummary( + layer_id=layer.layer.id, + kind=layer.layer.kind.value, + ref_name=layer.layer.ref_name, + commit=layer.layer.commit_hash, + previous_commit=layer.layer.base_commit_hash, + merge_base=layer.layer.merge_base_hash, + base_layer_id=layer.layer.base_layer_id, + status=layer.layer.status, + built=layer.built, + affected_count=len(layer.manifest.affected_paths), + tombstoned_count=len(layer.manifest.tombstoned_paths), + ) + for layer in layers + ) + base = next((layer for layer in summaries if layer.kind == "base"), None) + top = summaries[0] if summaries else None + branch = next((layer.ref_name for layer in summaries if layer.kind != "base"), None) + return SidecarIndexReport( + project_root=project_root, + cwd=cwd, + repo_id=layers[0].layer.repo_id if layers else None, + branch=branch or (top.ref_name if top is not None else None), + base_ref=base.ref_name if base is not None else None, + base_commit=base.commit if base is not None else None, + head_commit=top.commit if top is not None else None, + layers=summaries, + ) + + async def ensure_sidecar_layer_ids( *, project_root: Path, @@ -55,10 +118,27 @@ async def run_sidecar_index( cwd: Path, base_ref: str | None, on_progress: Callable[[IndexingProgress], None] | None = None, -) -> None: - await ensure_sidecar_layer_ids( +) -> SidecarIndexReport: + user_settings = load_user_settings() + for key, value in user_settings.envs.items(): + os.environ[key] = value + params = resolve_embedder_params(user_settings.embedding) + project_settings = load_project_settings(project_root) + state_dir = daemon_state_dir() + project = LayeredProject( project_root=project_root, cwd=cwd, base_ref=base_ref, - on_progress=on_progress, + state_dir=state_dir, + store=LayerStore(state_dir / "daemon.db"), + embedder=create_embedder(user_settings.embedding, 
indexing_params=params.indexing), + indexing_params=params.indexing, + query_params=params.query, + chunker_registry=_resolve_chunker_registry(project_settings.chunkers), + project_cache={}, ) + try: + layers = await project.ensure_layer_results(on_progress=on_progress) + return _summarize_layers(project_root=project_root, cwd=cwd, layers=layers) + finally: + project.close() diff --git a/src/cocoindex_code/version_control/__init__.py b/src/cocoindex_code/version_control/__init__.py index d197183..1f80f47 100644 --- a/src/cocoindex_code/version_control/__init__.py +++ b/src/cocoindex_code/version_control/__init__.py @@ -1,6 +1,11 @@ from .branch import Branch from .change_set import ChangeSet, GitStatusEntry -from .git import GitContextError, normalize_remote_url, resolve_worktree +from .git import ( + GitContextError, + normalize_remote_url, + remote_tracking_ref_for_local_branch, + resolve_worktree, +) from .repository import Repository from .worktree import Worktree @@ -12,5 +17,6 @@ "Repository", "Worktree", "normalize_remote_url", + "remote_tracking_ref_for_local_branch", "resolve_worktree", ] diff --git a/src/cocoindex_code/version_control/git.py b/src/cocoindex_code/version_control/git.py index 10c7099..3226a0e 100644 --- a/src/cocoindex_code/version_control/git.py +++ b/src/cocoindex_code/version_control/git.py @@ -121,7 +121,7 @@ def _dirty_snapshot_hash(repo_root: Path, entries: tuple[GitStatusEntry, ...]) - def _resolve_base_ref(repo: pygit2.Repository, requested: str | None) -> str: - candidates = [requested] if requested else ["origin/main", "main", "master", "HEAD"] + candidates = [requested] if requested else _default_base_ref_candidates(repo) for candidate in candidates: if candidate is None: continue @@ -130,7 +130,73 @@ def _resolve_base_ref(repo: pygit2.Repository, requested: str | None) -> str: return candidate except (KeyError, ValueError, pygit2.GitError): continue - raise GitContextError("No usable base ref found") + if requested: + raise 
GitContextError(f"No usable base ref found for {requested}") + raise GitContextError( + "No usable default base ref found. Configure an upstream branch or run " + "`ccc init --base `." + ) + + +def _shorten_ref_name(ref_name: str) -> str: + for prefix in ("refs/remotes/", "refs/heads/"): + if ref_name.startswith(prefix): + return ref_name.removeprefix(prefix) + return ref_name + + +def _branch_upstream_ref(repo: pygit2.Repository, branch_name: str) -> str | None: + try: + branch = repo.branches.local.get(branch_name) + except (KeyError, ValueError, pygit2.GitError): + return None + if branch is None: + return None + try: + upstream = branch.upstream + except (KeyError, ValueError, pygit2.GitError): + return None + if upstream is None: + return None + return _shorten_ref_name(upstream.name) + + +def _current_branch_upstream_ref(repo: pygit2.Repository) -> str | None: + try: + return _branch_upstream_ref(repo, repo.head.shorthand) + except (KeyError, ValueError, pygit2.GitError): + return None + + +def _remote_head_refs(repo: pygit2.Repository) -> list[str]: + refs: list[str] = [] + for ref_name in sorted(repo.references): + if not ref_name.startswith("refs/remotes/") or not ref_name.endswith("/HEAD"): + continue + try: + ref = repo.lookup_reference(ref_name) + refs.append(_shorten_ref_name(ref.resolve().name)) + except (KeyError, ValueError, pygit2.GitError): + continue + return refs + + +def _default_base_ref_candidates(repo: pygit2.Repository) -> list[str]: + candidates: list[str] = [] + upstream = _current_branch_upstream_ref(repo) + if upstream is not None: + candidates.append(upstream) + candidates.extend(_remote_head_refs(repo)) + return list(dict.fromkeys(candidates)) + + +def remote_tracking_ref_for_local_branch( + cwd: str | os.PathLike[str] | Path, + branch_name: str, +) -> str | None: + """Return the configured upstream ref for a local branch, if any.""" + repo = _open_repo(Path(cwd).resolve()) + return _branch_upstream_ref(repo, branch_name) def 
_git_common_dir(repo: pygit2.Repository) -> Path: diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py index fc2c7bf..e1274ea 100644 --- a/tests/test_cli_helpers.py +++ b/tests/test_cli_helpers.py @@ -14,6 +14,47 @@ resolve_default_path, ) from cocoindex_code.protocol import SearchResponse +from cocoindex_code.sidecar import SidecarIndexReport, SidecarLayerSummary + + +def _sample_sidecar_report(project_root: Path) -> SidecarIndexReport: + return SidecarIndexReport( + project_root=project_root, + cwd=project_root, + repo_id="repo-123", + branch="feature", + base_ref="origin/main", + base_commit="abcdef1234567890", + head_commit="fedcba9876543210", + layers=( + SidecarLayerSummary( + layer_id="branch-layer", + kind="branch", + ref_name="feature", + commit="fedcba9876543210", + previous_commit="abcdef1234567890", + merge_base="abcdef1234567890", + base_layer_id="base-layer", + status="ready", + built=True, + affected_count=12, + tombstoned_count=1, + ), + SidecarLayerSummary( + layer_id="base-layer", + kind="base", + ref_name="origin/main", + commit="abcdef1234567890", + previous_commit=None, + merge_base=None, + base_layer_id=None, + status="ready", + built=False, + affected_count=0, + tombstoned_count=0, + ), + ), + ) def test_require_project_root_success(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: @@ -308,8 +349,9 @@ def test_run_index_with_progress_uses_sidecar_indexer( import cocoindex_code.client as client import cocoindex_code.sidecar as sidecar - async def fake_run_sidecar_index(**kwargs: object) -> None: + async def fake_run_sidecar_index(**kwargs: object) -> SidecarIndexReport: captured["index_kwargs"] = kwargs + return _sample_sidecar_report(tmp_path / "repo") def fail_client_index(*_args: object, **_kwargs: object) -> object: raise AssertionError("daemon index should not run in sidecar mode") @@ -319,12 +361,14 @@ def fail_client_index(*_args: object, **_kwargs: object) -> object: monkeypatch.setattr(sidecar, 
"run_sidecar_index", fake_run_sidecar_index) monkeypatch.setattr(client, "index", fail_client_index) - cli._run_index_with_progress( + report = cli._run_index_with_progress( str(tmp_path / "repo"), cwd=str(tmp_path / "repo" / "src"), base_ref="main", ) + assert report is not None + assert report.repo_id == "repo-123" kwargs = captured["index_kwargs"] assert isinstance(kwargs, dict) assert kwargs["project_root"] == tmp_path / "repo" @@ -334,6 +378,37 @@ def fail_client_index(*_args: object, **_kwargs: object) -> object: assert "Indexing failed" not in capsys.readouterr().err +def test_index_command_skips_daemon_project_status_in_sidecar_mode( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + import cocoindex_code.client as client + import cocoindex_code.sidecar as sidecar + + project_root = tmp_path / "repo" + + def fail_project_status(_project_root: str) -> object: + raise AssertionError("sidecar index must not ask daemon for non-mounted project status") + + monkeypatch.setattr(cli, "require_project_root_from", lambda _cwd: project_root) + monkeypatch.setattr( + cli, + "_run_index_with_progress", + lambda *_args, **_kwargs: _sample_sidecar_report(project_root), + ) + monkeypatch.setattr(sidecar, "sidecar_enabled", lambda: True) + monkeypatch.setattr(client, "project_status", fail_project_status) + + cli.index(cwd=None, base_ref=None) + + out = capsys.readouterr().out + assert f"Project: {project_root}" in out + assert "Layered index:" in out + assert "Repo ID: repo-123" in out + assert "branch built" in out + assert "diff=12 paths, tombstones=1" in out + assert "Index stats:" not in out + + # --------------------------------------------------------------------------- # ccc init — auto-populate indexing_params / query_params from curated table # --------------------------------------------------------------------------- diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index 1a6769c..3de6eea 100644 
--- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -55,6 +55,9 @@ def test_docker_compose_uses_sidecar_daemon_model() -> None: assert "COCOINDEX_CODE_DB_PATH_MAPPING: /workspace=/var/cocoindex/db" in content assert "cocoindex-code-local-state:/var/cocoindex" in content assert "cocoindex-code-local-runtime:/var/run/cocoindex_code" in content + assert "name: ${COCOINDEX_CODE_STATE_VOLUME:-cocoindex-code-local-state}" in content + assert "name: ${COCOINDEX_CODE_RUNTIME_VOLUME:-cocoindex-code-local-runtime}" in content + assert "external: true" in content assert "ccc daemon status" in content assert "daemon.sock" in content @@ -120,6 +123,18 @@ def test_wrapper_mounts_only_authorized_repo_sidecar() -> None: assert 'exec docker "${run_args[@]}"' in content +def test_wrapper_recreates_incompatible_daemon_container() -> None: + content = (REPO_ROOT / "bin" / "ccc").read_text() + + assert "daemon_container_matches_expected()" in content + assert "container_has_volume_mount" in content + assert "container_has_env" in content + assert '{{.Destination}}{{"\\t"}}{{.Type}}{{"\\t"}}{{.Name}}' in content + assert '{{println .Destination "\\t" .Type' not in content + assert "Recreating incompatible cocoindex-code daemon container" in content + assert 'docker rm -f "$central_container"' in content + + def test_wrapper_defaults_settings_dir_to_host_home() -> None: content = (REPO_ROOT / "bin" / "ccc").read_text() @@ -168,11 +183,14 @@ def test_makefile_has_default_image_and_reset_target() -> None: assert "IMAGE ?= cocoindex-code:local-layered" in content assert "CCC_VARIANT ?= slim" in content assert "CCC_WRAPPER ?= bin/ccc" in content + assert "DAEMON_CONTAINER ?= cocoindex-code-local-daemon" in content assert "build: build-local" in content assert "build-local:" in content assert "-f docker/Dockerfile" in content assert "--build-arg CCC_INSTALL_SPEC=/ccc-src" in content assert "build-pypi:" in content + assert "Removing non-Compose daemon container" in content + 
assert "docker volume create" in content assert "reset: down" in content assert "docker volume rm" in content diff --git a/tests/test_git_layers.py b/tests/test_git_layers.py index 189a3e1..ba56b87 100644 --- a/tests/test_git_layers.py +++ b/tests/test_git_layers.py @@ -86,6 +86,42 @@ def test_resolve_worktree_context_worktree_id_uses_name_and_branch(tmp_path: Pat assert first_ctx.repo_root != second_ctx.repo_root +def test_resolve_worktree_context_uses_configured_branch_upstream( + tmp_path: Path, +) -> None: + repo = _init_repo(tmp_path / "repo") + _git(repo, "branch", "-m", "master") + origin_master = _git(repo, "rev-parse", "HEAD") + _git(repo, "update-ref", "refs/remotes/origin/master", origin_master) + _git(repo, "branch", "--set-upstream-to=origin/master", "master") + (repo / "main.py").write_text("def changed() -> str:\n return 'changed'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "local master advanced") + + ctx = resolve_worktree_context(repo, base_ref=None, index_config_hash="cfg") + + assert ctx.branch.base_ref == "origin/master" + assert ctx.branch.base_commit == origin_master + assert ctx.branch.head_commit != ctx.branch.base_commit + + +def test_resolve_worktree_context_uses_remote_head_when_no_branch_upstream( + tmp_path: Path, +) -> None: + repo = _init_repo(tmp_path / "repo") + origin_default = _git(repo, "rev-parse", "HEAD") + _git(repo, "update-ref", "refs/remotes/upstream/default", origin_default) + _git(repo, "symbolic-ref", "refs/remotes/upstream/HEAD", "refs/remotes/upstream/default") + (repo / "main.py").write_text("def changed() -> str:\n return 'changed'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "local branch advanced") + + ctx = resolve_worktree_context(repo, base_ref=None, index_config_hash="cfg") + + assert ctx.branch.base_ref == "upstream/default" + assert ctx.branch.base_commit == origin_default + + def test_layer_store_persists_ready_layers_and_manifests(tmp_path: Path) -> None: store = 
LayerStore(tmp_path / "daemon.db") record = store.upsert_layer( diff --git a/uv.lock b/uv.lock index d9944bf..b66c73e 100644 --- a/uv.lock +++ b/uv.lock @@ -170,6 +170,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, ] +[[package]] +name = "botocore" +version = "1.43.11" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/e9/fa/4bec16fa5a4cde7b593e549238bfeb8ed1bdba9d427888a18c460a1f2352/botocore-1.43.11.tar.gz", hash = "sha256:d7d479cc2809ec2728f2898521003adfb79bfe6a4615c59dfd222ec52b0cee6b", size = 15364020, upload-time = "2026-05-19T19:39:58.317Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/9a/9f1d955c2eebefb6bd20de740ae7a05e7b015c63f0f01dba338dcf29cc68/botocore-1.43.11-py3-none-any.whl", hash = "sha256:0108b5604df5a26918936c845e1e761866ee9ea8d1c1f9358ed3c69afdc37436", size = 15043467, upload-time = "2026-05-19T19:39:53.176Z" }, +] + [[package]] name = "certifi" version = "2026.1.4" @@ -373,6 +387,7 @@ sentence-transformers = [ name = "cocoindex-code" source = { editable = "." 
} dependencies = [ + { name = "botocore" }, { name = "cocoindex", extra = ["litellm"] }, { name = "einops" }, { name = "mcp" }, @@ -418,6 +433,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "botocore", specifier = ">=1.34.0" }, { name = "cocoindex", extras = ["litellm"], specifier = ">=1.0.6,<1.1.0" }, { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'dev'", specifier = ">=1.0.6,<1.1.0" }, { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'embeddings-local'", specifier = ">=1.0.6,<1.1.0" }, @@ -1054,6 +1070,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" }, ] +[[package]] +name = "jmespath" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, +] + [[package]] name = "joblib" version = "1.5.3" @@ -2216,6 +2241,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = 
"2025-09-09T10:57:00.695Z" }, ] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.1" @@ -2780,6 +2817,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, ] +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + [[package]] name = "sniffio" version = "1.3.1" From 5e24eea7232a87fa4ccd919a641fb765482f7845 Mon Sep 17 
00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 18:08:36 +0200 Subject: [PATCH 08/15] Fix Docker sidecar linked worktrees --- README.md | 6 ++++-- bin/ccc | 1 + docs/docker-layered-indexing.md | 9 ++++++++- tests/test_docker_setup.py | 3 +++ 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index ddf30bb..0449ff1 100644 --- a/README.md +++ b/README.md @@ -262,8 +262,10 @@ ccc search "authentication logic" ``` `ccc init` records the current Git root as authorized. Later commands refuse to -run outside an authorized repo. Sidecars mount only the authorized repo at -`/workspace` and talk to the central daemon over a private Docker network. +run outside an authorized repo. Sidecars mount only the authorized repo, at +`/workspace` and at the same absolute path it has on the host so libgit2 can +resolve linked-worktree metadata. They talk to the central daemon over a private +Docker network. ### Trusted-workspace compose: `docker compose up -d` diff --git a/bin/ccc b/bin/ccc index e8c1c37..5c34e6f 100755 --- a/bin/ccc +++ b/bin/ccc @@ -168,6 +168,7 @@ run_sidecar() { run --rm -i --network "$network" --volume "$root:$workspace_dir" + --volume "$root:$root" --volume "$host_settings_dir:$container_settings_dir" --volume "$state_volume:$container_state_root" --volume "$runtime_volume:$container_runtime_dir" diff --git a/docs/docker-layered-indexing.md b/docs/docker-layered-indexing.md index ff42bd1..3a2b35a 100644 --- a/docs/docker-layered-indexing.md +++ b/docs/docker-layered-indexing.md @@ -7,7 +7,9 @@ The intended Docker architecture is: - one central daemon container with no source-code mount - Docker named volumes for daemon state, runtime files, config, caches, and layer databases - short-lived sidecar containers for repo work -- each sidecar mounts exactly one authorized Git checkout at `/workspace` +- each sidecar mounts exactly one authorized Git checkout at `/workspace` and at + the same absolute path it has on the 
host, so libgit2 can resolve + linked-worktree metadata without exposing a broader source tree - sidecars talk to the central daemon over a private Docker network Do not mount `$HOME` or a broad source tree just to make indexing work. @@ -78,6 +80,7 @@ Sidecar container: ```text mounts: /authorized/repo -> /workspace + /authorized/repo -> /authorized/repo $HOME/.cocoindex_code -> /home/coco/.cocoindex_code cocoindex-code-local-state -> /var/cocoindex cocoindex-code-local-runtime -> /var/run/cocoindex_code @@ -89,6 +92,10 @@ source access: only the authorized repo ``` +The second repo bind mount is the same authorized checkout, not a parent +directory. It exists so linked-worktree `.git` metadata that contains absolute +host paths still resolves inside the sidecar. + Indexing runs in the sidecar because it is the process with Git/source access. The resulting layer metadata and layer databases are written to shared daemon state. Search sends the resolved layer IDs to the central daemon, and the daemon serves the query from shared layer databases without mounting the repository. 
## State diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index 3de6eea..f4f16f4 100644 --- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -84,6 +84,8 @@ def test_docker_sidecar_docs_describe_repo_scoped_architecture() -> None: assert "one central daemon container with no source-code mount" in content assert "short-lived sidecar containers" in content assert "Do not mount `$HOME` or a broad source tree" in content + assert "same absolute path it has on the host" in content + assert "linked-worktree metadata" in content assert "COCOINDEX_CODE_DAEMON_TCP" in content assert "COCOINDEX_CODE_SIDECAR=1" in content @@ -108,6 +110,7 @@ def test_wrapper_mounts_only_authorized_repo_sidecar() -> None: assert 'record_authorization "$root" "$common_dir"' in content assert '--volume "$root:$workspace_dir"' in content + assert '--volume "$root:$root"' in content assert '--volume "$host_settings_dir:$container_settings_dir"' in content assert '--volume "$state_volume:$container_state_root"' in content assert '--volume "$runtime_volume:$container_runtime_dir"' in content From 955b75a4f34f8508a3f339d7b0ed02e0294e6e7e Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 18:16:09 +0200 Subject: [PATCH 09/15] Show per-layer index counts --- src/cocoindex_code/cli.py | 15 ++++++++++ src/cocoindex_code/layers/layer_stack.py | 18 ++++++++++-- src/cocoindex_code/sidecar.py | 37 +++++++++++++++--------- tests/test_cli_helpers.py | 18 +++++++++++- 4 files changed, 72 insertions(+), 16 deletions(-) diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 3b8e936..20737f4 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -211,6 +211,19 @@ def print_layered_index_report(report: SidecarIndexReport) -> None: diff = f"diff={layer.affected_count} paths" if layer.tombstoned_count: diff += f", tombstones={layer.tombstoned_count}" + index = "" + if layer.indexed_file_count is not None and 
layer.indexed_chunk_count is not None: + index = f" index={layer.indexed_file_count} files, {layer.indexed_chunk_count} chunks" + progress = "" + if layer.progress is not None: + progress = ( + f" progress={layer.progress.num_execution_starts} listed," + f" {layer.progress.num_adds} added," + f" {layer.progress.num_unchanged} unchanged," + f" {layer.progress.num_reprocesses} reprocessed," + f" {layer.progress.num_deletes} deleted," + f" {layer.progress.num_errors} errors" + ) previous = layer.previous_commit or layer.merge_base _typer.echo( " " @@ -220,6 +233,8 @@ def print_layered_index_report(report: SidecarIndexReport) -> None: f"prev={_short_hash(previous)} " f"commit={_short_hash(layer.commit)} " f"{diff}" + f"{index}" + f"{progress}" ) diff --git a/src/cocoindex_code/layers/layer_stack.py b/src/cocoindex_code/layers/layer_stack.py index a2c1996..48eb3a3 100644 --- a/src/cocoindex_code/layers/layer_stack.py +++ b/src/cocoindex_code/layers/layer_stack.py @@ -40,6 +40,7 @@ class LayerBuildResult: manifest: LayerManifest runtime: LayerRuntime built: bool = False + progress: IndexingProgress | None = None @property def record(self) -> Layer: @@ -257,6 +258,7 @@ async def _ensure_layer( paths = LayerPaths.for_layer(self.state_dir, worktree.repository.id, layer_id) existing = self.store.get_layer(layer_id) built = False + progress: IndexingProgress | None = None if ( existing is None or existing.status != "ready" @@ -284,7 +286,13 @@ async def _ensure_layer( materialize(paths.source) layer = self._require_layer(layer_id) runtime = await self._runtime(layer) - await runtime.run_index(on_progress=on_progress) + def _on_progress(snapshot: IndexingProgress) -> None: + nonlocal progress + progress = snapshot + if on_progress is not None: + on_progress(snapshot) + + await runtime.run_index(on_progress=_on_progress) self.store.replace_manifest( layer_id, affected_paths=affected_paths, @@ -297,7 +305,13 @@ async def _ensure_layer( if manifest is None: raise 
RuntimeError(f"Layer manifest missing after build: {layer_id}") runtime = await self._runtime(layer) - return LayerBuildResult(layer=layer, manifest=manifest, runtime=runtime, built=built) + return LayerBuildResult( + layer=layer, + manifest=manifest, + runtime=runtime, + built=built, + progress=progress, + ) def _require_layer(self, layer_id: str) -> Layer: layer = self.store.get_layer(layer_id) diff --git a/src/cocoindex_code/sidecar.py b/src/cocoindex_code/sidecar.py index 38edc5b..e60eddb 100644 --- a/src/cocoindex_code/sidecar.py +++ b/src/cocoindex_code/sidecar.py @@ -33,6 +33,9 @@ class SidecarLayerSummary: built: bool affected_count: int tombstoned_count: int + indexed_file_count: int | None = None + indexed_chunk_count: int | None = None + progress: IndexingProgress | None = None @dataclass(frozen=True) @@ -51,19 +54,7 @@ def _summarize_layers( *, project_root: Path, cwd: Path, layers: list[LayerBuildResult] ) -> SidecarIndexReport: summaries = tuple( - SidecarLayerSummary( - layer_id=layer.layer.id, - kind=layer.layer.kind.value, - ref_name=layer.layer.ref_name, - commit=layer.layer.commit_hash, - previous_commit=layer.layer.base_commit_hash, - merge_base=layer.layer.merge_base_hash, - base_layer_id=layer.layer.base_layer_id, - status=layer.layer.status, - built=layer.built, - affected_count=len(layer.manifest.affected_paths), - tombstoned_count=len(layer.manifest.tombstoned_paths), - ) + _summarize_layer(layer) for layer in layers ) base = next((layer for layer in summaries if layer.kind == "base"), None) @@ -81,6 +72,26 @@ def _summarize_layers( ) +def _summarize_layer(layer: LayerBuildResult) -> SidecarLayerSummary: + status = layer.runtime.project.get_status() + return SidecarLayerSummary( + layer_id=layer.layer.id, + kind=layer.layer.kind.value, + ref_name=layer.layer.ref_name, + commit=layer.layer.commit_hash, + previous_commit=layer.layer.base_commit_hash, + merge_base=layer.layer.merge_base_hash, + base_layer_id=layer.layer.base_layer_id, + 
status=layer.layer.status, + built=layer.built, + affected_count=len(layer.manifest.affected_paths), + tombstoned_count=len(layer.manifest.tombstoned_paths), + indexed_file_count=status.total_files if status.index_exists else None, + indexed_chunk_count=status.total_chunks if status.index_exists else None, + progress=layer.progress, + ) + + async def ensure_sidecar_layer_ids( *, project_root: Path, diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py index e1274ea..6d88aee 100644 --- a/tests/test_cli_helpers.py +++ b/tests/test_cli_helpers.py @@ -13,7 +13,7 @@ require_project_root, resolve_default_path, ) -from cocoindex_code.protocol import SearchResponse +from cocoindex_code.protocol import IndexingProgress, SearchResponse from cocoindex_code.sidecar import SidecarIndexReport, SidecarLayerSummary @@ -39,6 +39,16 @@ def _sample_sidecar_report(project_root: Path) -> SidecarIndexReport: built=True, affected_count=12, tombstoned_count=1, + indexed_file_count=8, + indexed_chunk_count=34, + progress=IndexingProgress( + num_execution_starts=8, + num_unchanged=2, + num_adds=5, + num_deletes=1, + num_reprocesses=0, + num_errors=0, + ), ), SidecarLayerSummary( layer_id="base-layer", @@ -52,6 +62,8 @@ def _sample_sidecar_report(project_root: Path) -> SidecarIndexReport: built=False, affected_count=0, tombstoned_count=0, + indexed_file_count=120, + indexed_chunk_count=610, ), ), ) @@ -406,6 +418,10 @@ def fail_project_status(_project_root: str) -> object: assert "Repo ID: repo-123" in out assert "branch built" in out assert "diff=12 paths, tombstones=1" in out + assert "index=8 files, 34 chunks" in out + assert "8 listed, 5 added, 2 unchanged, 0 reprocessed, 1 deleted, 0 errors" in out + assert "base reused" in out + assert "index=120 files, 610 chunks" in out assert "Index stats:" not in out From 5a5a09690850ea40f82daf0454e3366d5ca11654 Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 18:45:52 +0200 Subject: [PATCH 10/15] Improve layered 
index reporting --- src/cocoindex_code/cli.py | 104 +++++++++++++++++++++++----------- src/cocoindex_code/project.py | 12 ++++ src/cocoindex_code/sidecar.py | 29 ++++++++-- tests/test_cli_helpers.py | 21 +++++-- tests/test_sidecar.py | 33 +++++++++++ 5 files changed, 156 insertions(+), 43 deletions(-) diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 20737f4..83cd685 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -19,7 +19,7 @@ ProjectStatusResponse, SearchResponse, ) - from .sidecar import SidecarIndexReport + from .sidecar import SidecarIndexReport, SidecarLayerSummary from .settings import ( DEFAULT_ST_MODEL, @@ -197,45 +197,83 @@ def _short_hash(value: str | None) -> str: return value[:12] +def _format_number(value: int | None) -> str: + if value is None: + return "unknown" + return f"{value:,}" + + +def _format_file_chunk_counts(file_count: int | None, chunk_count: int | None) -> str: + if file_count is None or chunk_count is None: + return "unknown" + return f"{_format_number(file_count)} files, {_format_number(chunk_count)} chunks" + + +def _format_layer_scope(layer: SidecarLayerSummary) -> str: + previous = layer.previous_commit or layer.merge_base + if layer.kind == "base": + return f"full snapshot of {layer.ref_name or 'base'} at {_short_hash(layer.commit)}" + if layer.kind == "branch": + return ( + f"{layer.ref_name or 'branch'} changes from {_short_hash(previous)} " + f"to {_short_hash(layer.commit)}" + ) + if layer.kind == "dirty": + return f"uncommitted worktree changes on {_short_hash(layer.commit)}" + return f"{layer.ref_name or layer.kind} at {_short_hash(layer.commit)}" + + +def _format_layer_changes(layer: SidecarLayerSummary) -> str: + if layer.kind == "base": + return "full base snapshot" + message = f"{_format_number(layer.affected_count)} changed paths" + if layer.tombstoned_count: + message += f", {_format_number(layer.tombstoned_count)} deleted" + return message + + +def 
_format_layer_build_work(layer: SidecarLayerSummary) -> str: + if layer.progress is None: + return "skipped, reused existing ready layer" + progress = layer.progress + return ( + f"{_format_number(progress.num_execution_starts)} files listed; " + f"{_format_number(progress.num_adds)} added, " + f"{_format_number(progress.num_unchanged)} unchanged, " + f"{_format_number(progress.num_reprocesses)} reprocessed, " + f"{_format_number(progress.num_deletes)} deleted, " + f"{_format_number(progress.num_errors)} errors" + ) + + def print_layered_index_report(report: SidecarIndexReport) -> None: - _typer.echo("\nLayered index:") + _typer.echo("\nLayered index updated:") + _typer.echo( + " Mode: Git layered index " + "(base snapshot + branch delta + dirty changes if present)" + ) _typer.echo(f" Repo ID: {report.repo_id or '-'}") - _typer.echo(f" Project: {format_path_for_display(report.project_root)}") + _typer.echo(f" Source: {format_path_for_display(report.project_root)}") _typer.echo(f" Worktree: {format_path_for_display(report.cwd)}") _typer.echo(f" Branch: {report.branch or '-'}") - _typer.echo(f" Base: {report.base_ref or '-'} @ {_short_hash(report.base_commit)}") - _typer.echo(f" Head: {_short_hash(report.head_commit)}") - _typer.echo(" Layers:") + _typer.echo(f" Base snapshot: {report.base_ref or '-'} @ {_short_hash(report.base_commit)}") + _typer.echo(f" Head commit: {_short_hash(report.head_commit)}") + _typer.echo( + " Total searchable content: " + f"{_format_file_chunk_counts(report.effective_file_count, report.effective_chunk_count)}" + ) + _typer.echo(" Search layers, top to bottom:") for layer in report.layers: - action = "built" if layer.built else "reused" - diff = f"diff={layer.affected_count} paths" - if layer.tombstoned_count: - diff += f", tombstones={layer.tombstoned_count}" - index = "" - if layer.indexed_file_count is not None and layer.indexed_chunk_count is not None: - index = f" index={layer.indexed_file_count} files, {layer.indexed_chunk_count} 
chunks" - progress = "" - if layer.progress is not None: - progress = ( - f" progress={layer.progress.num_execution_starts} listed," - f" {layer.progress.num_adds} added," - f" {layer.progress.num_unchanged} unchanged," - f" {layer.progress.num_reprocesses} reprocessed," - f" {layer.progress.num_deletes} deleted," - f" {layer.progress.num_errors} errors" - ) - previous = layer.previous_commit or layer.merge_base + action = "built now" if layer.built else "reused" + _typer.echo(f" {layer.kind:<6} {action}") + _typer.echo(f" Layer ID: {layer.layer_id}") + _typer.echo(f" Covers: {_format_layer_scope(layer)}") + _typer.echo(f" Source changes: {_format_layer_changes(layer)}") _typer.echo( - " " - f"{layer.kind:<6} {action:<6} " - f"id={layer.layer_id} " - f"ref={layer.ref_name or '-'} " - f"prev={_short_hash(previous)} " - f"commit={_short_hash(layer.commit)} " - f"{diff}" - f"{index}" - f"{progress}" + " Searchable in this layer: " + f"{_format_file_chunk_counts(layer.indexed_file_count, layer.indexed_chunk_count)}" ) + _typer.echo(f" Build work: {_format_layer_build_work(layer)}") def print_search_results(response: SearchResponse) -> None: diff --git a/src/cocoindex_code/project.py b/src/cocoindex_code/project.py index aa31966..6b47ac0 100644 --- a/src/cocoindex_code/project.py +++ b/src/cocoindex_code/project.py @@ -266,6 +266,18 @@ def get_status(self) -> ProjectStatusResponse: index_exists=index_exists, ) + def get_indexed_file_chunk_counts(self) -> dict[str, int]: + """Return indexed chunk counts by file path.""" + db = self._env.get_context(SQLITE_DB) + try: + with db.readonly() as conn: + rows = conn.execute( + "SELECT file_path, COUNT(*) FROM code_chunks_vec GROUP BY file_path" + ).fetchall() + except sqlite3.OperationalError: + return {} + return {str(file_path): int(count) for file_path, count in rows} + # ------------------------------------------------------------------ # Properties # ------------------------------------------------------------------ diff 
--git a/src/cocoindex_code/sidecar.py b/src/cocoindex_code/sidecar.py index e60eddb..592aee4 100644 --- a/src/cocoindex_code/sidecar.py +++ b/src/cocoindex_code/sidecar.py @@ -48,15 +48,15 @@ class SidecarIndexReport: base_commit: str | None head_commit: str | None layers: tuple[SidecarLayerSummary, ...] + effective_file_count: int | None = None + effective_chunk_count: int | None = None def _summarize_layers( *, project_root: Path, cwd: Path, layers: list[LayerBuildResult] ) -> SidecarIndexReport: - summaries = tuple( - _summarize_layer(layer) - for layer in layers - ) + summaries = tuple(_summarize_layer(layer) for layer in layers) + effective_file_count, effective_chunk_count = _effective_index_counts(layers) base = next((layer for layer in summaries if layer.kind == "base"), None) top = summaries[0] if summaries else None branch = next((layer.ref_name for layer in summaries if layer.kind != "base"), None) @@ -69,6 +69,8 @@ def _summarize_layers( base_commit=base.commit if base is not None else None, head_commit=top.commit if top is not None else None, layers=summaries, + effective_file_count=effective_file_count, + effective_chunk_count=effective_chunk_count, ) @@ -92,6 +94,25 @@ def _summarize_layer(layer: LayerBuildResult) -> SidecarLayerSummary: ) +def _effective_index_counts(layers: list[LayerBuildResult]) -> tuple[int | None, int | None]: + if not layers: + return None, None + + lower_layer_shadowed_paths: set[str] = set() + file_count = 0 + chunk_count = 0 + for layer in layers: + file_chunks = layer.runtime.project.get_indexed_file_chunk_counts() + for file_path, chunks in file_chunks.items(): + if file_path in lower_layer_shadowed_paths: + continue + file_count += 1 + chunk_count += chunks + lower_layer_shadowed_paths.update(layer.manifest.affected_paths) + lower_layer_shadowed_paths.update(layer.manifest.tombstoned_paths) + return file_count, chunk_count + + async def ensure_sidecar_layer_ids( *, project_root: Path, diff --git 
a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py index 6d88aee..7f67606 100644 --- a/tests/test_cli_helpers.py +++ b/tests/test_cli_helpers.py @@ -26,6 +26,8 @@ def _sample_sidecar_report(project_root: Path) -> SidecarIndexReport: base_ref="origin/main", base_commit="abcdef1234567890", head_commit="fedcba9876543210", + effective_file_count=123, + effective_chunk_count=620, layers=( SidecarLayerSummary( layer_id="branch-layer", @@ -414,14 +416,21 @@ def fail_project_status(_project_root: str) -> object: out = capsys.readouterr().out assert f"Project: {project_root}" in out - assert "Layered index:" in out + assert "Layered index updated:" in out + assert "Mode: Git layered index" in out assert "Repo ID: repo-123" in out - assert "branch built" in out - assert "diff=12 paths, tombstones=1" in out - assert "index=8 files, 34 chunks" in out - assert "8 listed, 5 added, 2 unchanged, 0 reprocessed, 1 deleted, 0 errors" in out + assert "Source:" in out + assert "Total searchable content: 123 files, 620 chunks" in out + assert "Search layers, top to bottom:" in out + assert "branch built now" in out + assert "Covers: feature changes from abcdef123456 to fedcba987654" in out + assert "Source changes: 12 changed paths, 1 deleted" in out + assert "Searchable in this layer: 8 files, 34 chunks" in out + assert "8 files listed; 5 added, 2 unchanged, 0 reprocessed, 1 deleted, 0 errors" in out assert "base reused" in out - assert "index=120 files, 610 chunks" in out + assert "Covers: full snapshot of origin/main at abcdef123456" in out + assert "Searchable in this layer: 120 files, 610 chunks" in out + assert "Build work: skipped, reused existing ready layer" in out assert "Index stats:" not in out diff --git a/tests/test_sidecar.py b/tests/test_sidecar.py index 8884d09..9ac11d0 100644 --- a/tests/test_sidecar.py +++ b/tests/test_sidecar.py @@ -129,3 +129,36 @@ def test_sidecar_enabled_requires_exact_one(monkeypatch: pytest.MonkeyPatch) -> 
monkeypatch.setenv("COCOINDEX_CODE_SIDECAR", "1") assert sidecar.sidecar_enabled() is True + + +def test_effective_index_counts_apply_layer_shadowing() -> None: + def fake_layer( + *, + affected_paths: set[str], + tombstoned_paths: set[str], + file_chunks: dict[str, int], + ) -> Any: + return SimpleNamespace( + manifest=SimpleNamespace( + affected_paths=frozenset(affected_paths), + tombstoned_paths=frozenset(tombstoned_paths), + ), + runtime=SimpleNamespace( + project=SimpleNamespace( + get_indexed_file_chunk_counts=lambda: file_chunks, + ) + ), + ) + + branch = fake_layer( + affected_paths={"a.py", "d.py", "e.py"}, + tombstoned_paths={"b.py"}, + file_chunks={"a.py": 4, "d.py": 1}, + ) + base = fake_layer( + affected_paths=set(), + tombstoned_paths=set(), + file_chunks={"a.py": 2, "b.py": 3, "c.py": 1, "e.py": 7}, + ) + + assert sidecar._effective_index_counts([branch, base]) == (3, 6) From 4ccbf5cff32690d0c30284a6c068efbe16a89832 Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 20:22:04 +0200 Subject: [PATCH 11/15] Reuse nearest indexed Git ancestor --- src/cocoindex_code/layers/layer_stack.py | 117 +++++++++++++++++++--- src/cocoindex_code/version_control/git.py | 26 +++++ tests/test_git_layers.py | 91 +++++++++++++++++ 3 files changed, 221 insertions(+), 13 deletions(-) diff --git a/src/cocoindex_code/layers/layer_stack.py b/src/cocoindex_code/layers/layer_stack.py index 48eb3a3..d38f1c2 100644 --- a/src/cocoindex_code/layers/layer_stack.py +++ b/src/cocoindex_code/layers/layer_stack.py @@ -13,6 +13,7 @@ from cocoindex_code.shared import Embedder from cocoindex_code.version_control import Worktree from cocoindex_code.version_control.git import ( + ancestor_distances, branch_changes, materialize_commit, materialize_paths_from_commit, @@ -98,11 +99,17 @@ async def ensure( last_seen_path=worktree.path, ) base = await self._ensure_base(worktree, config_hash, on_progress) - layers: list[LayerBuildResult] = [base] - branch = await 
self._ensure_branch(worktree, base.layer.id, config_hash, on_progress) + parent_layers = await self._nearest_indexed_ancestor_chain( + worktree=worktree, + config_hash=config_hash, + ) + if parent_layers is None: + parent_layers = [base] + branch = await self._ensure_branch(worktree, parent_layers[0], config_hash, on_progress) + layers = parent_layers if branch is not None: - layers.insert(0, branch) - dirty = await self._ensure_dirty(worktree, base.layer.id, config_hash, on_progress) + layers = [branch, *parent_layers] + dirty = await self._ensure_dirty(worktree, layers[0], config_hash, on_progress) if dirty is not None: layers.insert(0, dirty) for layer in layers: @@ -149,12 +156,15 @@ async def _ensure_base( async def _ensure_branch( self, worktree: Worktree, - base_layer_id: str, + parent: LayerBuildResult, config_hash: str, on_progress: Callable[[IndexingProgress], None] | None, ) -> LayerBuildResult | None: + parent_commit = parent.layer.commit_hash + if parent_commit is None: + raise RuntimeError(f"Parent layer has no commit: {parent.layer.id}") changes = branch_changes( - worktree.repository.root, worktree.branch.merge_base, worktree.branch.head_commit + worktree.repository.root, parent_commit, worktree.branch.head_commit ) if changes.is_empty: return None @@ -165,8 +175,8 @@ async def _ensure_branch( worktree.repository.id, worktree.branch.name, worktree.branch.head_commit, - worktree.branch.merge_base, - base_layer_id, + parent_commit, + parent.layer.id, config_hash, ] ) @@ -177,9 +187,9 @@ async def _ensure_branch( kind=LayerKind.BRANCH, ref_name=worktree.branch.name, commit=worktree.branch.head_commit, - base_commit=worktree.branch.merge_base, + base_commit=parent_commit, merge_base=worktree.branch.merge_base, - base_layer_id=base_layer_id, + base_layer_id=parent.layer.id, worktree_id=None, config_hash=config_hash, expires_at=time.time() + _BRANCH_TTL_SECONDS, @@ -194,15 +204,96 @@ async def _ensure_branch( on_progress=on_progress, ) + async def 
_nearest_indexed_ancestor_chain( + self, + *, + worktree: Worktree, + config_hash: str, + ) -> list[LayerBuildResult] | None: + candidates = [ + layer + for layer in self.store.list_layers(repo_id=worktree.repository.id) + if self._is_reusable_commit_layer(layer, config_hash=config_hash) + ] + distances = ancestor_distances( + worktree.repository.root, + head=worktree.branch.head_commit, + candidate_commits=(layer.commit_hash for layer in candidates if layer.commit_hash), + ) + candidates.sort( + key=lambda layer: ( + distances.get(layer.commit_hash or "", 1_000_000_000), + self._layer_change_count(layer), + -layer.last_accessed_at, + layer.id, + ) + ) + for layer in candidates: + if layer.commit_hash not in distances: + continue + chain = await self._existing_layer_chain(layer, config_hash=config_hash) + if chain is not None: + return chain + return None + + def _is_reusable_commit_layer(self, layer: Layer, *, config_hash: str) -> bool: + return ( + layer.kind in {LayerKind.BASE, LayerKind.BRANCH} + and layer.status == "ready" + and layer.commit_hash is not None + and layer.config_hash == config_hash + and layer.paths.target_sqlite.exists() + and self.store.get_manifest(layer.id) is not None + ) + + def _layer_change_count(self, layer: Layer) -> int: + manifest = self.store.get_manifest(layer.id) + if manifest is None: + return 1_000_000_000 + return len(manifest.affected_paths) + len(manifest.tombstoned_paths) + + async def _existing_layer_chain( + self, layer: Layer, *, config_hash: str + ) -> list[LayerBuildResult] | None: + chain: list[LayerBuildResult] = [] + seen: set[str] = set() + current: Layer | None = layer + while current is not None: + if current.id in seen or not self._is_reusable_commit_layer( + current, config_hash=config_hash + ): + return None + seen.add(current.id) + manifest = self.store.get_manifest(current.id) + if manifest is None: + return None + chain.append( + LayerBuildResult( + layer=current, + manifest=manifest, + runtime=await 
self._runtime(current), + ) + ) + if current.base_layer_id is None: + break + parent = self.store.get_layer(current.base_layer_id) + if parent is None or current.base_commit_hash != parent.commit_hash: + return None + current = parent + return chain if chain and chain[-1].layer.kind == LayerKind.BASE else None + async def _ensure_dirty( self, worktree: Worktree, - base_layer_id: str, + parent: LayerBuildResult, config_hash: str, on_progress: Callable[[IndexingProgress], None] | None, ) -> LayerBuildResult | None: if worktree.dirty.snapshot_hash is None: return None + parent_commit = parent.layer.commit_hash + if parent_commit is None: + raise RuntimeError(f"Parent layer has no commit: {parent.layer.id}") layer_id = _sha_short( "\0".join( [ @@ -222,9 +313,9 @@ async def _ensure_dirty( kind=LayerKind.DIRTY, ref_name=worktree.branch.name, commit=worktree.branch.head_commit, - base_commit=worktree.branch.merge_base, + base_commit=parent_commit, merge_base=worktree.branch.merge_base, - base_layer_id=base_layer_id, + base_layer_id=parent.layer.id, worktree_id=worktree.id, config_hash=config_hash, expires_at=time.time() + _DIRTY_TTL_SECONDS, diff --git a/src/cocoindex_code/version_control/git.py b/src/cocoindex_code/version_control/git.py index 3226a0e..02bbbe3 100644 --- a/src/cocoindex_code/version_control/git.py +++ b/src/cocoindex_code/version_control/git.py @@ -3,6 +3,7 @@ import hashlib import os import tarfile +from collections.abc import Iterable from pathlib import Path from typing import Any, cast from urllib.parse import urlparse @@ -330,6 +331,31 @@ def branch_changes(repo_root: Path, base: str, head: str) -> ChangeSet: ) +def ancestor_distances( + repo_root: Path, + *, + head: str, + candidate_commits: Iterable[str], +) -> dict[str, int]: + """Return candidate commits that are ancestors of *head*, mapped to distance.""" + repo = _open_repo(repo_root) + try: + head_id = repo.revparse_single(head).id + except (KeyError, ValueError, pygit2.GitError) as e: + 
raise GitContextError(f"Cannot resolve head commit {head}") from e + + distances: dict[str, int] = {} + for candidate in dict.fromkeys(candidate_commits): + try: + candidate_id = repo.revparse_single(candidate).id + ahead, behind = repo.ahead_behind(head_id, candidate_id) + except (KeyError, ValueError, pygit2.GitError): + continue + if behind == 0: + distances[candidate] = ahead + return distances + + def materialize_commit(repo_root: Path, commit: str, source_dir: Path) -> None: repo = _open_repo(repo_root) obj = repo.revparse_single(commit) diff --git a/tests/test_git_layers.py b/tests/test_git_layers.py index ba56b87..8a282fc 100644 --- a/tests/test_git_layers.py +++ b/tests/test_git_layers.py @@ -206,3 +206,94 @@ async def test_layered_project_creates_base_and_branch_manifests( manifest = project.store.get_manifest(branch_layer.layer_id) assert manifest is not None assert manifest.affected_paths == frozenset({"extra.py", "main.py"}) + + +@pytest.mark.asyncio +async def test_layered_project_builds_from_nearest_indexed_ancestor( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from typing import Any + + import cocoindex_code.layers.layer_stack as layer_stack + from cocoindex_code.protocol import IndexingProgress + + class FakeRuntime: + def __init__(self, layer: Any) -> None: + self.layer = layer + self.project = object() + + async def run_index(self, on_progress: object = None) -> None: + self.layer.paths.target_sqlite.parent.mkdir(parents=True, exist_ok=True) + self.layer.paths.target_sqlite.touch() + if on_progress is not None: + on_progress(IndexingProgress(1, 0, 1, 0, 0, 0)) + + async def fake_runtime_create(**kwargs: Any) -> FakeRuntime: + return FakeRuntime(kwargs["layer"]) + + monkeypatch.setattr( + layer_stack.LayerRuntime, + "create", + staticmethod(fake_runtime_create), + ) + + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + 
(repo / "main.py").write_text("def base_function() -> str:\n return 'master'\n") + (repo / "master.py").write_text("def master_only() -> str:\n return 'master'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "master") + master_head = _git(repo, "rev-parse", "HEAD") + + state_dir = daemon_state_dir() + store = LayerStore(state_dir / "daemon.db") + + def make_project() -> LayeredProject: + return LayeredProject( + project_root=repo, + cwd=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + embedder=object(), + indexing_params={}, + query_params={}, + chunker_registry={}, + project_cache={}, + ) + + master_project = make_project() + try: + master_layers = await master_project.ensure_layer_results() + finally: + master_project.close() + master_layer = next(layer for layer in master_layers if layer.layer.kind == LayerKind.BRANCH) + assert master_layer.layer.commit_hash == master_head + + _git(repo, "checkout", "-b", "feature") + (repo / "feature.py").write_text("def feature_only() -> str:\n return 'feature'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "feature") + feature_head = _git(repo, "rev-parse", "HEAD") + + feature_project = make_project() + try: + feature_layers = await feature_project.ensure_layer_results() + finally: + feature_project.close() + + assert [layer.layer.kind for layer in feature_layers] == [ + LayerKind.BRANCH, + LayerKind.BRANCH, + LayerKind.BASE, + ] + feature_layer, reused_master_layer, _base_layer = feature_layers + assert feature_layer.built is True + assert reused_master_layer.built is False + assert reused_master_layer.layer.id == master_layer.layer.id + assert feature_layer.layer.commit_hash == feature_head + assert feature_layer.layer.base_commit_hash == master_head + assert feature_layer.layer.base_layer_id == master_layer.layer.id + assert feature_layer.manifest.affected_paths == frozenset({"feature.py"}) + assert feature_layer.manifest.tombstoned_paths == frozenset() From 
0d3cc46068b3116396899e0c15736ef70ef27dfd Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 20:38:47 +0200 Subject: [PATCH 12/15] Harden Git layer reuse tests --- src/cocoindex_code/layers/layer_stack.py | 4 +- tests/test_git_layers.py | 465 +++++++++++++++++++++-- 2 files changed, 435 insertions(+), 34 deletions(-) diff --git a/src/cocoindex_code/layers/layer_stack.py b/src/cocoindex_code/layers/layer_stack.py index d38f1c2..7b56129 100644 --- a/src/cocoindex_code/layers/layer_stack.py +++ b/src/cocoindex_code/layers/layer_stack.py @@ -98,12 +98,12 @@ async def ensure( branch_name=worktree.branch.name, last_seen_path=worktree.path, ) - base = await self._ensure_base(worktree, config_hash, on_progress) parent_layers = await self._nearest_indexed_ancestor_chain( worktree=worktree, config_hash=config_hash, ) if parent_layers is None: + base = await self._ensure_base(worktree, config_hash, on_progress) parent_layers = [base] branch = await self._ensure_branch(worktree, parent_layers[0], config_hash, on_progress) layers = parent_layers @@ -302,6 +302,8 @@ async def _ensure_dirty( worktree.id, worktree.branch.name, worktree.branch.head_commit, + parent_commit, + parent.layer.id, worktree.dirty.snapshot_hash, config_hash, ] diff --git a/tests/test_git_layers.py b/tests/test_git_layers.py index 8a282fc..51ef4b4 100644 --- a/tests/test_git_layers.py +++ b/tests/test_git_layers.py @@ -3,6 +3,7 @@ import shutil import subprocess from pathlib import Path +from typing import Any import pytest @@ -10,6 +11,7 @@ from cocoindex_code.git_context import normalize_remote_url, resolve_worktree_context from cocoindex_code.layer_store import LayerKind, LayerStore from cocoindex_code.layered_project import LayeredProject +from cocoindex_code.protocol import IndexingProgress from cocoindex_code.settings import default_project_settings, save_project_settings @@ -30,6 +32,100 @@ def _init_repo(path: Path) -> Path: return path +class _FakeRuntime: + def __init__(self, 
layer: Any) -> None: + self.layer = layer + self.project = object() + + async def run_index(self, on_progress: Any = None) -> None: + self.layer.paths.target_sqlite.parent.mkdir(parents=True, exist_ok=True) + self.layer.paths.target_sqlite.touch() + if on_progress is not None: + on_progress(IndexingProgress(1, 0, 1, 0, 0, 0)) + + +async def _fake_runtime_create(**kwargs: Any) -> _FakeRuntime: + return _FakeRuntime(kwargs["layer"]) + + +def _install_fake_layer_runtime(monkeypatch: pytest.MonkeyPatch) -> None: + import cocoindex_code.layers.layer_stack as layer_stack + + monkeypatch.setattr( + layer_stack.LayerRuntime, + "create", + staticmethod(_fake_runtime_create), + ) + + +def _fake_layered_project( + *, + repo: Path, + base_ref: str, + state_dir: Path, + store: LayerStore, +) -> LayeredProject: + return LayeredProject( + project_root=repo, + cwd=repo, + base_ref=base_ref, + state_dir=state_dir, + store=store, + embedder=object(), + indexing_params={}, + query_params={}, + chunker_registry={}, + project_cache={}, + ) + + +def _touch_layer_target(layer: Any) -> None: + layer.paths.target_sqlite.parent.mkdir(parents=True, exist_ok=True) + layer.paths.target_sqlite.touch() + + +def _upsert_ready_layer( + *, + store: LayerStore, + state_dir: Path, + repo_id: str, + layer_id: str, + kind: LayerKind, + ref_name: str, + commit: str, + base_commit: str | None, + base_layer_id: str | None, + config_hash: str, + affected_paths: list[str], + tombstoned_paths: list[str] | None = None, +) -> Any: + root = state_dir / "manual-layers" / layer_id + layer = store.upsert_layer( + layer_id=layer_id, + repo_id=repo_id, + kind=kind, + ref_name=ref_name, + commit=commit, + base_commit=base_commit, + base_layer_id=base_layer_id, + source_dir=root / "src", + db_dir=root / "db", + status="building", + config_hash=config_hash, + ) + store.replace_manifest( + layer_id, + affected_paths=affected_paths, + tombstoned_paths=tombstoned_paths or [], + expires_at=None, + ) + 
_touch_layer_target(layer) + store.mark_layer_ready(layer_id) + ready = store.get_layer(layer_id) + assert ready is not None + return ready + + def test_daemon_state_dir_defaults_to_xdg_data_home( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: @@ -122,6 +218,29 @@ def test_resolve_worktree_context_uses_remote_head_when_no_branch_upstream( assert ctx.branch.base_commit == origin_default +def test_ancestor_distances_only_returns_commits_reachable_from_head(tmp_path: Path) -> None: + from cocoindex_code.version_control.git import ancestor_distances + + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + _git(repo, "checkout", "-b", "other") + (repo / "other.py").write_text("def other() -> str:\n return 'other'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "other") + other_head = _git(repo, "rev-parse", "HEAD") + _git(repo, "checkout", "main") + (repo / "main.py").write_text("def main() -> str:\n return 'main'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "main") + main_head = _git(repo, "rev-parse", "HEAD") + + assert ancestor_distances( + repo, + head=main_head, + candidate_commits=[base_commit, other_head, main_head, "missing"], + ) == {base_commit: 1, main_head: 0} + + def test_layer_store_persists_ready_layers_and_manifests(tmp_path: Path) -> None: store = LayerStore(tmp_path / "daemon.db") record = store.upsert_layer( @@ -212,31 +331,7 @@ async def test_layered_project_creates_base_and_branch_manifests( async def test_layered_project_builds_from_nearest_indexed_ancestor( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: - from typing import Any - - import cocoindex_code.layers.layer_stack as layer_stack - from cocoindex_code.protocol import IndexingProgress - - class FakeRuntime: - def __init__(self, layer: Any) -> None: - self.layer = layer - self.project = object() - - async def run_index(self, on_progress: object = None) -> None: - 
self.layer.paths.target_sqlite.parent.mkdir(parents=True, exist_ok=True) - self.layer.paths.target_sqlite.touch() - if on_progress is not None: - on_progress(IndexingProgress(1, 0, 1, 0, 0, 0)) - - async def fake_runtime_create(**kwargs: Any) -> FakeRuntime: - return FakeRuntime(kwargs["layer"]) - - monkeypatch.setattr( - layer_stack.LayerRuntime, - "create", - staticmethod(fake_runtime_create), - ) - + _install_fake_layer_runtime(monkeypatch) monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) repo = _init_repo(tmp_path / "repo") base_commit = _git(repo, "rev-parse", "HEAD") @@ -250,17 +345,11 @@ async def fake_runtime_create(**kwargs: Any) -> FakeRuntime: store = LayerStore(state_dir / "daemon.db") def make_project() -> LayeredProject: - return LayeredProject( - project_root=repo, - cwd=repo, + return _fake_layered_project( + repo=repo, base_ref=base_commit, state_dir=state_dir, store=store, - embedder=object(), - indexing_params={}, - query_params={}, - chunker_registry={}, - project_cache={}, ) master_project = make_project() @@ -297,3 +386,313 @@ def make_project() -> LayeredProject: assert feature_layer.layer.base_layer_id == master_layer.layer.id assert feature_layer.manifest.affected_paths == frozenset({"feature.py"}) assert feature_layer.manifest.tombstoned_paths == frozenset() + + +@pytest.mark.asyncio +async def test_layered_project_does_not_build_configured_base_when_ancestor_chain_exists( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _install_fake_layer_runtime(monkeypatch) + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + (repo / "master.py").write_text("def master_only() -> str:\n return 'master'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "master") + master_head = _git(repo, "rev-parse", "HEAD") + + state_dir = daemon_state_dir() + store = LayerStore(state_dir / "daemon.db") + 
master_project = _fake_layered_project( + repo=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + ) + try: + master_layers = await master_project.ensure_layer_results() + finally: + master_project.close() + master_layer = next(layer for layer in master_layers if layer.layer.kind == LayerKind.BRANCH) + + _git(repo, "checkout", "-b", "feature") + (repo / "feature.py").write_text("def feature_only() -> str:\n return 'feature'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "feature") + + feature_project = _fake_layered_project( + repo=repo, + base_ref=master_head, + state_dir=state_dir, + store=store, + ) + try: + feature_layers = await feature_project.ensure_layer_results() + finally: + feature_project.close() + + feature_layer, reused_master_layer, _base_layer = feature_layers + assert feature_layer.manifest.affected_paths == frozenset({"feature.py"}) + assert reused_master_layer.layer.id == master_layer.layer.id + assert not any( + layer.kind == LayerKind.BASE and layer.commit_hash == master_head + for layer in store.list_layers(repo_id=master_layer.layer.repo_id) + ) + + +@pytest.mark.asyncio +async def test_layered_project_ignores_unusable_indexed_ancestor_layers( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.layered_project import build_index_config_hash + from cocoindex_code.version_control import resolve_worktree + + _install_fake_layer_runtime(monkeypatch) + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + (repo / "master.py").write_text("def master_only() -> str:\n return 'master'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "master") + master_head = _git(repo, "rev-parse", "HEAD") + _git(repo, "checkout", "-b", "feature") + (repo / "feature.py").write_text("def feature_only() -> str:\n return 'feature'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", 
"feature") + + state_dir = daemon_state_dir() + store = LayerStore(state_dir / "daemon.db") + config_hash = build_index_config_hash(repo, indexing_params={}, query_params={}) + worktree = resolve_worktree(repo, base_ref=base_commit, index_config_hash=config_hash) + invalid_layer = store.upsert_layer( + layer_id="missing-target-db", + repo_id=worktree.repository.id, + kind=LayerKind.BRANCH, + ref_name="master", + commit=master_head, + base_commit=base_commit, + base_layer_id="missing-base", + source_dir=state_dir / "invalid" / "src", + db_dir=state_dir / "invalid" / "db", + status="building", + config_hash=config_hash, + ) + store.replace_manifest( + invalid_layer.id, + affected_paths=["master.py"], + tombstoned_paths=[], + expires_at=None, + ) + store.mark_layer_ready(invalid_layer.id) + + project = _fake_layered_project( + repo=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + ) + try: + layers = await project.ensure_layer_results() + finally: + project.close() + + branch_layer = layers[0] + assert branch_layer.layer.base_commit_hash == base_commit + assert branch_layer.manifest.affected_paths == frozenset({"feature.py", "master.py"}) + + +@pytest.mark.asyncio +async def test_layered_project_ignores_indexed_ancestor_with_broken_parent_chain( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.layered_project import build_index_config_hash + from cocoindex_code.version_control import resolve_worktree + + _install_fake_layer_runtime(monkeypatch) + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + (repo / "master.py").write_text("def master_only() -> str:\n return 'master'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "master") + master_head = _git(repo, "rev-parse", "HEAD") + _git(repo, "checkout", "-b", "feature") + (repo / "feature.py").write_text("def feature_only() -> str:\n return 
'feature'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "feature") + + state_dir = daemon_state_dir() + store = LayerStore(state_dir / "daemon.db") + config_hash = build_index_config_hash(repo, indexing_params={}, query_params={}) + worktree = resolve_worktree(repo, base_ref=base_commit, index_config_hash=config_hash) + broken_layer = _upsert_ready_layer( + store=store, + state_dir=state_dir, + repo_id=worktree.repository.id, + layer_id="broken-parent-chain", + kind=LayerKind.BRANCH, + ref_name="master", + commit=master_head, + base_commit=base_commit, + base_layer_id="missing-base", + config_hash=config_hash, + affected_paths=["master.py"], + ) + assert broken_layer.paths.target_sqlite.exists() + + project = _fake_layered_project( + repo=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + ) + try: + layers = await project.ensure_layer_results() + finally: + project.close() + + branch_layer = layers[0] + assert branch_layer.layer.base_commit_hash == base_commit + assert branch_layer.manifest.affected_paths == frozenset({"feature.py", "master.py"}) + + +@pytest.mark.asyncio +async def test_dirty_layer_identity_includes_selected_parent_layer( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _install_fake_layer_runtime(monkeypatch) + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + (repo / "master.py").write_text("def master_only() -> str:\n return 'master'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "master") + master_head = _git(repo, "rev-parse", "HEAD") + _git(repo, "checkout", "-b", "feature") + (repo / "feature.py").write_text("def feature_only() -> str:\n return 'feature'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "feature") + (repo / "dirty.py").write_text("def dirty() -> str:\n return 'dirty'\n") + + state_dir = daemon_state_dir() + store = LayerStore(state_dir / "daemon.db") 
+ feature_project = _fake_layered_project( + repo=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + ) + try: + original_layers = await feature_project.ensure_layer_results() + finally: + feature_project.close() + old_dirty, old_branch, base_layer = original_layers + assert old_dirty.layer.kind == LayerKind.DIRTY + old_branch.layer.paths.target_sqlite.unlink() + + master_layer = _upsert_ready_layer( + store=store, + state_dir=state_dir, + repo_id=base_layer.layer.repo_id, + layer_id="manual-master-layer", + kind=LayerKind.BRANCH, + ref_name="master", + commit=master_head, + base_commit=base_commit, + base_layer_id=base_layer.layer.id, + config_hash=base_layer.layer.config_hash or "", + affected_paths=["master.py"], + ) + + refreshed_project = _fake_layered_project( + repo=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + ) + try: + refreshed_layers = await refreshed_project.ensure_layer_results() + finally: + refreshed_project.close() + + new_dirty, new_branch, reused_master, _reused_base = refreshed_layers + assert new_dirty.layer.kind == LayerKind.DIRTY + assert new_dirty.layer.id != old_dirty.layer.id + assert new_dirty.layer.base_layer_id == new_branch.layer.id + assert new_branch.layer.base_layer_id == master_layer.id + assert reused_master.layer.id == master_layer.id + + +@pytest.mark.asyncio +async def test_layered_project_prefers_smaller_layer_at_same_commit( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.layered_project import build_index_config_hash + from cocoindex_code.version_control import resolve_worktree + + _install_fake_layer_runtime(monkeypatch) + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + repo = _init_repo(tmp_path / "repo") + base_commit = _git(repo, "rev-parse", "HEAD") + (repo / "main.py").write_text("def changed() -> str:\n return 'changed'\n") + _git(repo, "add", ".") + _git(repo, "commit", "-m", "head") + head_commit = _git(repo, 
"rev-parse", "HEAD") + + state_dir = daemon_state_dir() + store = LayerStore(state_dir / "daemon.db") + config_hash = build_index_config_hash(repo, indexing_params={}, query_params={}) + worktree = resolve_worktree(repo, base_ref=base_commit, index_config_hash=config_hash) + base_layer = _upsert_ready_layer( + store=store, + state_dir=state_dir, + repo_id=worktree.repository.id, + layer_id="manual-base-layer", + kind=LayerKind.BASE, + ref_name="base", + commit=base_commit, + base_commit=None, + base_layer_id=None, + config_hash=config_hash, + affected_paths=[], + ) + _upsert_ready_layer( + store=store, + state_dir=state_dir, + repo_id=worktree.repository.id, + layer_id="large-layer-at-head", + kind=LayerKind.BRANCH, + ref_name="feature", + commit=head_commit, + base_commit=base_commit, + base_layer_id=base_layer.id, + config_hash=config_hash, + affected_paths=[f"file_{i}.py" for i in range(20)], + ) + small_layer = _upsert_ready_layer( + store=store, + state_dir=state_dir, + repo_id=worktree.repository.id, + layer_id="small-layer-at-head", + kind=LayerKind.BRANCH, + ref_name="feature", + commit=head_commit, + base_commit=base_commit, + base_layer_id=base_layer.id, + config_hash=config_hash, + affected_paths=["main.py"], + ) + + project = _fake_layered_project( + repo=repo, + base_ref=base_commit, + state_dir=state_dir, + store=store, + ) + try: + layers = await project.ensure_layer_results() + finally: + project.close() + + assert layers[0].layer.id == small_layer.id + assert layers[0].built is False From 30782afd949c4866774bc46878bc094e846bdcde Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Wed, 20 May 2026 20:50:36 +0200 Subject: [PATCH 13/15] Name Docker sidecars by repo and branch --- bin/ccc | 84 +++++++++++++++++++++++++++++++++ docs/docker-layered-indexing.md | 14 ++++++ tests/test_docker_setup.py | 25 ++++++++++ 3 files changed, 123 insertions(+) diff --git a/bin/ccc b/bin/ccc index 5c34e6f..d954453 100755 --- a/bin/ccc +++ b/bin/ccc @@ -26,6 +26,30 
@@ canonical_path() { python3 -c 'import pathlib, sys; print(pathlib.Path(sys.argv[1]).resolve())' "$1" } +slugify() { + python3 - "$1" "${2:-63}" <<'PY' +import re +import sys +import unicodedata + +value = unicodedata.normalize("NFKD", sys.argv[1].strip().lower()) +value = value.encode("ascii", "ignore").decode() +value = re.sub(r"[^a-z0-9]+", "-", value).strip("-") +max_len = int(sys.argv[2]) +value = value[:max_len].rstrip("-") +print(value or "unknown") +PY +} + +short_hash() { + python3 - "$1" <<'PY' +import hashlib +import sys + +print(hashlib.sha256(sys.argv[1].encode()).hexdigest()[:12]) +PY +} + git_root_for() { git -C "$1" rev-parse --show-toplevel 2>/dev/null | xargs -I{} python3 -c 'import pathlib, sys; print(pathlib.Path(sys.argv[1]).resolve())' "{}" } @@ -41,6 +65,55 @@ git_common_dir_for() { fi } +git_repo_name_for() { + local root="$1" + local remote_url + remote_url="$(git -C "$root" config --get remote.origin.url 2>/dev/null || true)" + python3 - "$root" "$remote_url" <<'PY' +import pathlib +import sys + +root = pathlib.Path(sys.argv[1]) +remote_url = sys.argv[2].strip().removesuffix(".git") +if remote_url: + remote_url = remote_url.replace(":", "/") + print(remote_url.rstrip("/").rsplit("/", 1)[-1] or root.name) +else: + print(root.name) +PY +} + +git_branch_for() { + local root="$1" + local branch + branch="$(git -C "$root" branch --show-current 2>/dev/null || true)" + if [[ -n "$branch" ]]; then + echo "$branch" + return + fi + branch="$(git -C "$root" rev-parse --short HEAD 2>/dev/null || true)" + if [[ -n "$branch" ]]; then + echo "detached-$branch" + else + echo "unknown" + fi +} + +sidecar_container_name() { + local root="$1" + local host_cwd="$2" + local command_name="${3:-cmd}" + local repo_name branch_name repo_slug branch_slug command_slug suffix + repo_name="$(git_repo_name_for "$root")" + branch_name="$(git_branch_for "$root")" + repo_slug="$(slugify "$repo_name" 40)" + branch_slug="$(slugify "$branch_name" 96)" + 
command_slug="$(slugify "$command_name" 24)" + suffix="$(short_hash "$root|$branch_name|$command_name|$host_cwd|$$|$RANDOM")" + printf 'cocoindex-code-sidecar-%s-%s-%s-%s\n' \ + "$repo_slug" "$branch_slug" "$command_slug" "$suffix" +} + is_path_within() { local child="$1" local parent="$2" @@ -163,10 +236,21 @@ run_sidecar() { local common_dir="$2" local host_cwd="$3" shift 3 + local sidecar_command="${1:-cmd}" + local sidecar_container repo_name branch_name + sidecar_container="$(sidecar_container_name "$root" "$host_cwd" "$sidecar_command")" + repo_name="$(git_repo_name_for "$root")" + branch_name="$(git_branch_for "$root")" local run_args=( run --rm -i + --name "$sidecar_container" --network "$network" + --label "io.cocoindex.code.role=sidecar" + --label "io.cocoindex.code.repo=$repo_name" + --label "io.cocoindex.code.branch=$branch_name" + --label "io.cocoindex.code.command=$sidecar_command" + --label "io.cocoindex.code.worktree=$root" --volume "$root:$workspace_dir" --volume "$root:$root" --volume "$host_settings_dir:$container_settings_dir" diff --git a/docs/docker-layered-indexing.md b/docs/docker-layered-indexing.md index 3a2b35a..e55a0ec 100644 --- a/docs/docker-layered-indexing.md +++ b/docs/docker-layered-indexing.md @@ -78,6 +78,14 @@ source access: Sidecar container: ```text +name: + cocoindex-code-sidecar---- +labels: + io.cocoindex.code.role=sidecar + io.cocoindex.code.repo= + io.cocoindex.code.branch= + io.cocoindex.code.command= + io.cocoindex.code.worktree= mounts: /authorized/repo -> /workspace /authorized/repo -> /authorized/repo @@ -96,6 +104,12 @@ The second repo bind mount is the same authorized checkout, not a parent directory. It exists so linked-worktree `.git` metadata that contains absolute host paths still resolves inside the sidecar. 
+Sidecar names use lowercase Docker-safe slugs, so a repo/branch such as +`fever2` and `feature/PLATFORM-5958-example` appears in `docker ps` as a name +like `cocoindex-code-sidecar-fever2-feature-platform-5958-example-index-`. +The labels keep the unsanitized repo, branch, command, and worktree values for +inspection and filtering. + Indexing runs in the sidecar because it is the process with Git/source access. The resulting layer metadata and layer databases are written to shared daemon state. Search sends the resolved layer IDs to the central daemon, and the daemon serves the query from shared layer databases without mounting the repository. ## State diff --git a/tests/test_docker_setup.py b/tests/test_docker_setup.py index f4f16f4..593b150 100644 --- a/tests/test_docker_setup.py +++ b/tests/test_docker_setup.py @@ -86,6 +86,8 @@ def test_docker_sidecar_docs_describe_repo_scoped_architecture() -> None: assert "Do not mount `$HOME` or a broad source tree" in content assert "same absolute path it has on the host" in content assert "linked-worktree metadata" in content + assert "cocoindex-code-sidecar----" in content + assert "io.cocoindex.code.branch=" in content assert "COCOINDEX_CODE_DAEMON_TCP" in content assert "COCOINDEX_CODE_SIDECAR=1" in content @@ -109,6 +111,7 @@ def test_wrapper_mounts_only_authorized_repo_sidecar() -> None: content = (REPO_ROOT / "bin" / "ccc").read_text() assert 'record_authorization "$root" "$common_dir"' in content + assert '--name "$sidecar_container"' in content assert '--volume "$root:$workspace_dir"' in content assert '--volume "$root:$root"' in content assert '--volume "$host_settings_dir:$container_settings_dir"' in content @@ -126,6 +129,28 @@ def test_wrapper_mounts_only_authorized_repo_sidecar() -> None: assert 'exec docker "${run_args[@]}"' in content +def test_wrapper_names_sidecars_by_repo_branch_command() -> None: + content = (REPO_ROOT / "bin" / "ccc").read_text() + + assert "sidecar_container_name()" in content + 
assert "git_repo_name_for()" in content + assert "git_branch_for()" in content + assert "slugify()" in content + assert 'cocoindex-code-sidecar-%s-%s-%s-%s' in content + assert 'repo_slug="$(slugify "$repo_name" 40)"' in content + assert 'branch_slug="$(slugify "$branch_name" 96)"' in content + assert 'command_slug="$(slugify "$command_name" 24)"' in content + assert ( + 'suffix="$(short_hash "$root|$branch_name|$command_name|$host_cwd|$$|$RANDOM")"' + in content + ) + assert '--label "io.cocoindex.code.role=sidecar"' in content + assert '--label "io.cocoindex.code.repo=$repo_name"' in content + assert '--label "io.cocoindex.code.branch=$branch_name"' in content + assert '--label "io.cocoindex.code.command=$sidecar_command"' in content + assert '--label "io.cocoindex.code.worktree=$root"' in content + + def test_wrapper_recreates_incompatible_daemon_container() -> None: content = (REPO_ROOT / "bin" / "ccc").read_text() From 9feca57e761dd595614e13f18c4653e5bae77f82 Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Tue, 2 Jun 2026 13:01:17 +0200 Subject: [PATCH 14/15] fix: harden git layer resource handling --- src/cocoindex_code/_daemon_paths.py | 2 + src/cocoindex_code/cli.py | 11 +- src/cocoindex_code/client.py | 45 ++++++- src/cocoindex_code/daemon.py | 40 +++++- src/cocoindex_code/layers/layer_store.py | 15 ++- src/cocoindex_code/protocol.py | 7 + src/cocoindex_code/version_control/git.py | 102 +++++++++++++-- tests/test_client.py | 42 ++++++ tests/test_git_layers.py | 151 ++++++++++++++++++++++ tests/test_protocol.py | 24 ++++ 10 files changed, 413 insertions(+), 26 deletions(-) diff --git a/src/cocoindex_code/_daemon_paths.py b/src/cocoindex_code/_daemon_paths.py index 82b8e3f..cfa6914 100644 --- a/src/cocoindex_code/_daemon_paths.py +++ b/src/cocoindex_code/_daemon_paths.py @@ -46,6 +46,8 @@ def daemon_state_dir() -> Path: xdg_data_home = os.environ.get("XDG_DATA_HOME") if xdg_data_home: return Path(xdg_data_home) / "cocoindex-code" + if 
os.environ.get("COCOINDEX_CODE_DIR"): + return user_settings_dir() / "state" return Path.home() / ".local" / "share" / "cocoindex-code" diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 83cd685..c546e3a 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -1107,8 +1107,15 @@ def overlay_prune() -> None: resp = _client.overlay_prune() if not resp.pruned_layer_ids: _typer.echo("No expired layers pruned.") - return - _typer.echo(f"Pruned {len(resp.pruned_layer_ids)} layer(s).") + else: + _typer.echo(f"Pruned {len(resp.pruned_layer_ids)} layer(s).") + if resp.failures: + for failure in resp.failures: + _typer.echo( + f"Failed to prune {failure.layer_id} at {failure.path}: {failure.message}", + err=True, + ) + raise _typer.Exit(code=1) # --- Daemon subcommands --- diff --git a/src/cocoindex_code/client.py b/src/cocoindex_code/client.py index 80d23bc..edcca68 100644 --- a/src/cocoindex_code/client.py +++ b/src/cocoindex_code/client.py @@ -468,14 +468,17 @@ def start_daemon() -> subprocess.Popen[bytes]: ccc_path = _find_ccc_executable() if ccc_path: cmd = [ccc_path, "run-daemon"] + env = os.environ.copy() else: cmd = [sys.executable, "-m", "cocoindex_code.cli", "run-daemon"] + env = _daemon_subprocess_env() log_fd = open(log_path, "w") if sys.platform == "win32": _create_no_window = 0x08000000 proc = subprocess.Popen( cmd, + env=env, stdout=log_fd, stderr=log_fd, stdin=subprocess.DEVNULL, @@ -484,6 +487,7 @@ def start_daemon() -> subprocess.Popen[bytes]: else: proc = subprocess.Popen( cmd, + env=env, start_new_session=True, stdout=log_fd, stderr=log_fd, @@ -493,17 +497,56 @@ def start_daemon() -> subprocess.Popen[bytes]: return proc +def _daemon_subprocess_env() -> dict[str, str]: + """Environment for launching the daemon through ``python -m``.""" + env = os.environ.copy() + source_root = str(Path(__file__).resolve().parents[1]) + pythonpath = env.get("PYTHONPATH") + if pythonpath: + paths = pythonpath.split(os.pathsep) + if 
source_root not in paths: + env["PYTHONPATH"] = os.pathsep.join([source_root, *paths]) + else: + env["PYTHONPATH"] = source_root + return env + + def _find_ccc_executable() -> str | None: """Find the ccc executable in PATH or the same directory as python.""" python_dir = Path(sys.executable).parent names = ["ccc.exe", "ccc"] if sys.platform == "win32" else ["ccc"] for name in names: ccc = python_dir / name - if ccc.exists(): + if _is_usable_ccc_executable(ccc): return str(ccc) return None +def _is_usable_ccc_executable(path: Path) -> bool: + """Return whether a local ccc launcher is executable in this environment.""" + if not path.exists(): + return False + if sys.platform == "win32": + return True + + try: + with path.open("rb") as f: + first_line = f.readline(256) + except OSError: + return False + + if not first_line.startswith(b"#!"): + return True + + shebang = first_line[2:].strip().decode("utf-8", errors="ignore") + if not shebang: + return False + interpreter = shebang.split(maxsplit=1)[0] + if interpreter.startswith("/") and not Path(interpreter).exists(): + return False + return True + + def _pid_alive(pid: int) -> bool: """Return True if *pid* is still running.""" if sys.platform == "win32": diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index f5fa3ef..de4bc3b 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -48,6 +48,7 @@ IndexStreamResponse, IndexWaitingNotice, OverlayLayerInfo, + OverlayPruneFailure, OverlayPruneRequest, OverlayPruneResponse, OverlayStatusRequest, @@ -237,6 +238,12 @@ def close_all(self) -> None: self._layer_project_cache.clear() gc.collect() + def close_layer_projects(self, layer_ids: list[str] | tuple[str, ...]) -> None: + for layer_id in layer_ids: + project = self._layer_project_cache.pop(layer_id, None) + if project is not None: + project.close() + def list_projects(self) -> list[DaemonProjectInfo]: """List all loaded projects with their indexing state.""" return [ @@ 
-703,12 +710,33 @@ async def _dispatch( ) if isinstance(req, OverlayPruneRequest): - pruned = registry.layer_store.prune_expired() - for layer in pruned: - import shutil - - shutil.rmtree(layer.source_dir.parent, ignore_errors=True) - return OverlayPruneResponse(pruned_layer_ids=[layer.layer_id for layer in pruned]) + import shutil + + expired = registry.layer_store.list_expired_layers() + successful_layer_ids: list[str] = [] + failures: list[OverlayPruneFailure] = [] + registry.close_layer_projects(tuple(layer.id for layer in expired)) + for layer in expired: + layer_root = layer.source_dir.parent + try: + shutil.rmtree(layer_root) + except FileNotFoundError: + successful_layer_ids.append(layer.id) + except OSError as e: + failures.append( + OverlayPruneFailure( + layer_id=layer.id, + path=str(layer_root), + message=str(e), + ) + ) + else: + successful_layer_ids.append(layer.id) + registry.layer_store.delete_layers(tuple(successful_layer_ids)) + return OverlayPruneResponse( + pruned_layer_ids=successful_layer_ids, + failures=failures, + ) if isinstance(req, DoctorRequest): return _handle_doctor(req, registry) diff --git a/src/cocoindex_code/layers/layer_store.py b/src/cocoindex_code/layers/layer_store.py index baf3379..2b95e38 100644 --- a/src/cocoindex_code/layers/layer_store.py +++ b/src/cocoindex_code/layers/layer_store.py @@ -373,7 +373,7 @@ def list_layers(self, *, repo_id: str | None = None) -> list[Layer]: ).fetchall() return [self._row_to_layer(row) for row in rows] - def prune_expired(self, now: float | None = None) -> list[Layer]: + def list_expired_layers(self, now: float | None = None) -> list[Layer]: cutoff = time.time() if now is None else now with self._connect() as conn: rows = conn.execute( @@ -386,12 +386,21 @@ def prune_expired(self, now: float | None = None) -> list[Layer]: """, (cutoff,), ).fetchall() - layer_ids = [row["layer_id"] for row in rows] + return [self._row_to_layer(row) for row in rows] + + def delete_layers(self, layer_ids: 
list[str] | tuple[str, ...]) -> None: + if not layer_ids: + return + with self._connect() as conn: conn.executemany( "DELETE FROM overlay_manifests WHERE layer_id = ?", [(i,) for i in layer_ids] ) conn.executemany("DELETE FROM layers WHERE layer_id = ?", [(i,) for i in layer_ids]) - return [self._row_to_layer(row) for row in rows] + + def prune_expired(self, now: float | None = None) -> list[Layer]: + layers = self.list_expired_layers(now) + self.delete_layers(tuple(layer.id for layer in layers)) + return layers LayerRecord = Layer diff --git a/src/cocoindex_code/protocol.py b/src/cocoindex_code/protocol.py index 7425d46..e88b383 100644 --- a/src/cocoindex_code/protocol.py +++ b/src/cocoindex_code/protocol.py @@ -211,8 +211,15 @@ class OverlayStatusResponse(_msgspec.Struct, tag="overlay_status"): layers: list[OverlayLayerInfo] +class OverlayPruneFailure(_msgspec.Struct): + layer_id: str + path: str + message: str + + class OverlayPruneResponse(_msgspec.Struct, tag="overlay_prune"): pruned_layer_ids: list[str] + failures: list[OverlayPruneFailure] = [] class ErrorResponse(_msgspec.Struct, tag="error"): diff --git a/src/cocoindex_code/version_control/git.py b/src/cocoindex_code/version_control/git.py index 02bbbe3..c390418 100644 --- a/src/cocoindex_code/version_control/git.py +++ b/src/cocoindex_code/version_control/git.py @@ -2,9 +2,8 @@ import hashlib import os -import tarfile from collections.abc import Iterable -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import Any, cast from urllib.parse import urlparse @@ -24,6 +23,24 @@ def _sha_short(value: str) -> str: return hashlib.sha256(value.encode()).hexdigest()[:24] +def _safe_repo_path(path: str) -> PurePosixPath | None: + parsed = PurePosixPath(path) + if parsed.is_absolute() or not parsed.parts or any(part == ".." 
for part in parsed.parts): + return None + return parsed + + +def _safe_destination(source_dir: Path, repo_path: str) -> Path | None: + safe_path = _safe_repo_path(repo_path) + if safe_path is None: + return None + root = source_dir.resolve() + destination = (root / Path(*safe_path.parts)).resolve() + if destination != root and not destination.is_relative_to(root): + return None + return destination + + def _open_repo(cwd: Path) -> pygit2.Repository: try: discovered = pygit2.discover_repository(str(cwd)) @@ -108,6 +125,7 @@ def _status_entries(repo: pygit2.Repository) -> tuple[GitStatusEntry, ...]: def _dirty_snapshot_hash(repo_root: Path, entries: tuple[GitStatusEntry, ...]) -> str | None: if not entries: return None + repo = _open_repo(repo_root) digest = hashlib.sha256() for entry in sorted(entries, key=lambda e: (e.path, e.original_path or "")): digest.update(entry.index_status.encode()) @@ -115,9 +133,21 @@ def _dirty_snapshot_hash(repo_root: Path, entries: tuple[GitStatusEntry, ...]) - digest.update(entry.path.encode()) if entry.original_path is not None: digest.update(entry.original_path.encode()) - path = repo_root / entry.path - if path.is_file(): - digest.update(hashlib.sha256(path.read_bytes()).digest()) + safe_path = _safe_repo_path(entry.path) + if safe_path is None: + continue + path = repo_root / Path(*safe_path.parts) + try: + stat = path.lstat() + except OSError: + continue + digest.update(str(stat.st_mode).encode()) + digest.update(str(stat.st_size).encode()) + if not path.is_symlink() and path.is_file(): + try: + digest.update(str(repo.hashfile(str(path))).encode()) + except (OSError, ValueError, pygit2.GitError): + continue return digest.hexdigest()[:24] @@ -359,11 +389,31 @@ def ancestor_distances( def materialize_commit(repo_root: Path, commit: str, source_dir: Path) -> None: repo = _open_repo(repo_root) obj = repo.revparse_single(commit) - with tarfile.open(source_dir / ".archive.tar", mode="w") as archive: - repo.write_archive(obj, archive) 
- with tarfile.open(source_dir / ".archive.tar", mode="r:") as archive: - archive.extractall(source_dir) - (source_dir / ".archive.tar").unlink(missing_ok=True) + source_dir.mkdir(parents=True, exist_ok=True) + + def _walk_tree(tree: pygit2.Tree, prefix: PurePosixPath) -> None: + for entry in tree: + if entry.name is None: + continue + entry_path = prefix / entry.name + if entry.filemode == pygit2.enums.FileMode.TREE: + child = repo[entry.id] + if isinstance(child, pygit2.Tree): + _walk_tree(child, entry_path) + continue + if entry.filemode not in { + pygit2.enums.FileMode.BLOB, + pygit2.enums.FileMode.BLOB_EXECUTABLE, + }: + continue + blob = repo[entry.id] + destination = _safe_destination(source_dir, entry_path.as_posix()) + if destination is None: + continue + _write_file(destination, cast(Any, blob).data) + + commit_obj = cast(Any, obj) + _walk_tree(commit_obj.tree, PurePosixPath()) def _write_file(path: Path, data: bytes) -> None: @@ -377,19 +427,43 @@ def materialize_paths_from_commit( repo = _open_repo(repo_root) commit_obj = repo.revparse_single(commit) for path in paths: + destination = _safe_destination(source_dir, path) + if destination is None: + continue try: entry = commit_obj.tree[path] + if entry.filemode not in { + pygit2.enums.FileMode.BLOB, + pygit2.enums.FileMode.BLOB_EXECUTABLE, + }: + continue blob = repo[entry.id] data = cast(Any, blob).data except (KeyError, ValueError, pygit2.GitError): continue - _write_file(source_dir / path, data) + _write_file(destination, data) def materialize_paths_from_worktree( repo_root: Path, paths: tuple[str, ...], source_dir: Path ) -> None: + root = repo_root.resolve() for path in paths: - source = repo_root / path - if source.is_file(): - _write_file(source_dir / path, source.read_bytes()) + safe_path = _safe_repo_path(path) + destination = _safe_destination(source_dir, path) + if safe_path is None or destination is None: + continue + source = repo_root / Path(*safe_path.parts) + try: + source.lstat() + 
except OSError: + continue + if source.is_symlink() or not source.is_file(): + continue + resolved_source = source.resolve() + if resolved_source != root and not resolved_source.is_relative_to(root): + continue + destination.parent.mkdir(parents=True, exist_ok=True) + with source.open("rb") as src, destination.open("wb") as dst: + while chunk := src.read(1024 * 1024): + dst.write(chunk) diff --git a/tests/test_client.py b/tests/test_client.py index 3480079..dcc427f 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -2,6 +2,8 @@ from __future__ import annotations +import os +import sys import tempfile from pathlib import Path @@ -93,6 +95,46 @@ def test_print_warning_prefixes_message(capsys: pytest.CaptureFixture[str]) -> N assert err.startswith("Warning: something happened") +def test_find_ccc_executable_ignores_stale_script_shebang( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + bin_dir = tmp_path / "bin" + bin_dir.mkdir() + python = bin_dir / "python" + python.write_text("") + stale_ccc = bin_dir / "ccc" + stale_ccc.write_text("#!/missing/python\n") + stale_ccc.chmod(0o755) + monkeypatch.setattr(sys, "executable", str(python)) + + assert client._find_ccc_executable() is None + + +def test_start_daemon_fallback_preserves_source_import_path( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + captured: dict[str, object] = {} + + class _FakePopen: + def __init__(self, cmd: list[str], **kwargs: object) -> None: + captured["cmd"] = cmd + captured["env"] = kwargs["env"] + + monkeypatch.setenv("COCOINDEX_CODE_DIR", str(tmp_path)) + monkeypatch.setattr(client, "_find_ccc_executable", lambda: None) + monkeypatch.setattr(client.subprocess, "Popen", _FakePopen) + + client.start_daemon() + + env = captured["env"] + assert isinstance(env, dict) + pythonpath = env.get("PYTHONPATH") + assert isinstance(pythonpath, str) + assert str(Path(client.__file__).resolve().parents[1]) in pythonpath.split(os.pathsep) + + def 
test_print_handshake_warnings_no_warnings_prints_nothing( capsys: pytest.CaptureFixture[str], monkeypatch: pytest.MonkeyPatch ) -> None: diff --git a/tests/test_git_layers.py b/tests/test_git_layers.py index 51ef4b4..9aba900 100644 --- a/tests/test_git_layers.py +++ b/tests/test_git_layers.py @@ -130,11 +130,22 @@ def test_daemon_state_dir_defaults_to_xdg_data_home( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: monkeypatch.delenv("COCOINDEX_CODE_STATE_DIR", raising=False) + monkeypatch.delenv("COCOINDEX_CODE_DIR", raising=False) monkeypatch.setenv("XDG_DATA_HOME", str(tmp_path / "xdg")) assert daemon_state_dir() == tmp_path / "xdg" / "cocoindex-code" +def test_daemon_state_dir_uses_isolated_code_dir_without_xdg( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.delenv("COCOINDEX_CODE_STATE_DIR", raising=False) + monkeypatch.delenv("XDG_DATA_HOME", raising=False) + monkeypatch.setenv("COCOINDEX_CODE_DIR", str(tmp_path / "code")) + + assert daemon_state_dir() == tmp_path / "code" / "state" + + def test_normalize_remote_url_equates_common_github_forms() -> None: assert normalize_remote_url("git@github.com:Example/Repo.git") == normalize_remote_url( "https://github.com/example/repo" @@ -241,6 +252,90 @@ def test_ancestor_distances_only_returns_commits_reachable_from_head(tmp_path: P ) == {base_commit: 1, main_head: 0} +def test_dirty_snapshot_hash_streams_content_without_reading_file_into_memory( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.version_control.change_set import GitStatusEntry + from cocoindex_code.version_control.git import _dirty_snapshot_hash + + repo = _init_repo(tmp_path / "repo") + dirty_path = repo / "dirty.py" + dirty_path.write_text("def dirty() -> str:\n return 'dirty'\n") + + def _forbid_read_bytes(self: Path) -> bytes: + raise AssertionError(f"read_bytes should not be used for dirty hashing: {self}") + + monkeypatch.setattr(Path, "read_bytes", _forbid_read_bytes) + + 
digest = _dirty_snapshot_hash( + repo, + ( + GitStatusEntry( + index_status=" ", + worktree_status="?", + path="dirty.py", + ), + ), + ) + + assert digest is not None + assert len(digest) == 24 + + +def test_resolve_worktree_context_excludes_gitignored_untracked_files(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + (repo / ".gitignore").write_text("ignored.py\n") + (repo / "ignored.py").write_text("IGNORED = True\n") + (repo / "visible.py").write_text("VISIBLE = True\n") + + ctx = resolve_worktree_context(repo, base_ref="main", index_config_hash="cfg") + + assert "visible.py" in ctx.dirty.affected_paths + assert "ignored.py" not in ctx.dirty.affected_paths + + +def test_dirty_git_rename_tombstones_old_path_and_affects_new_path(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + _git(repo, "mv", "main.py", "renamed.py") + + ctx = resolve_worktree_context(repo, base_ref="main", index_config_hash="cfg") + + assert "renamed.py" in ctx.dirty.affected_paths + assert "main.py" in ctx.dirty.tombstoned_paths + assert "main.py" not in ctx.dirty.affected_paths + assert "renamed.py" not in ctx.dirty.tombstoned_paths + + +def test_materialize_paths_from_worktree_rejects_symlinks(tmp_path: Path) -> None: + from cocoindex_code.version_control.git import materialize_paths_from_worktree + + repo = _init_repo(tmp_path / "repo") + outside = tmp_path / "outside.txt" + outside.write_text("secret\n") + (repo / "link.py").symlink_to(outside) + source_dir = tmp_path / "layer-src" + + materialize_paths_from_worktree(repo, ("link.py",), source_dir) + + assert not (source_dir / "link.py").exists() + + +def test_materialize_commit_skips_git_symlinks(tmp_path: Path) -> None: + from cocoindex_code.version_control.git import materialize_commit + + repo = _init_repo(tmp_path / "repo") + (repo / "link.py").symlink_to("main.py") + _git(repo, "add", "link.py") + _git(repo, "commit", "-m", "add symlink") + commit = _git(repo, "rev-parse", "HEAD") + source_dir = 
tmp_path / "layer-src" + + materialize_commit(repo, commit, source_dir) + + assert (source_dir / "main.py").is_file() + assert not (source_dir / "link.py").exists() + + def test_layer_store_persists_ready_layers_and_manifests(tmp_path: Path) -> None: store = LayerStore(tmp_path / "daemon.db") record = store.upsert_layer( @@ -696,3 +791,59 @@ async def test_layered_project_prefers_smaller_layer_at_same_commit( assert layers[0].layer.id == small_layer.id assert layers[0].built is False + + +@pytest.mark.asyncio +async def test_overlay_prune_closes_cached_layer_projects_before_deleting( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.daemon import ProjectRegistry, _dispatch + from cocoindex_code.protocol import OverlayPruneRequest, OverlayPruneResponse + + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + registry = ProjectRegistry(embedder=object()) + layer_root = registry.state_dir / "repos" / "repo" / "layers" / "expired" + layer = registry.layer_store.upsert_layer( + layer_id="expired", + repo_id="repo", + kind=LayerKind.DIRTY, + ref_name="feature", + commit="abc", + base_commit="base", + base_layer_id="base-layer", + source_dir=layer_root / "src", + db_dir=layer_root / "db", + status="ready", + ) + layer.paths.source.mkdir(parents=True) + layer.paths.db_dir.mkdir(parents=True) + registry.layer_store.replace_manifest( + "expired", + affected_paths=["dirty.py"], + tombstoned_paths=[], + expires_at=0.0, + ) + + class _CachedProject: + closed = False + + def close(self) -> None: + self.closed = True + + cached_project = _CachedProject() + registry._layer_project_cache["expired"] = cached_project # noqa: SLF001 + + resp = await _dispatch( + OverlayPruneRequest(), + registry, + start_time=0.0, + on_shutdown=lambda: None, + settings_env_names=[], + ) + + assert isinstance(resp, OverlayPruneResponse) + assert resp.pruned_layer_ids == ["expired"] + assert resp.failures == [] + assert cached_project.closed is True 
+ assert "expired" not in registry._layer_project_cache # noqa: SLF001 + assert not layer_root.exists() diff --git a/tests/test_protocol.py b/tests/test_protocol.py index 91c69b3..23b7bb8 100644 --- a/tests/test_protocol.py +++ b/tests/test_protocol.py @@ -18,6 +18,8 @@ IndexRequest, IndexResponse, IndexWaitingNotice, + OverlayPruneFailure, + OverlayPruneResponse, ProjectStatusRequest, ProjectStatusResponse, RemoveProjectRequest, @@ -208,6 +210,27 @@ def test_encode_decode_daemon_env_response() -> None: assert decoded.settings_env_names == ["GEMINI_API_KEY"] +def test_encode_decode_overlay_prune_response_with_failures() -> None: + resp = OverlayPruneResponse( + pruned_layer_ids=["old-dirty"], + failures=[ + OverlayPruneFailure( + layer_id="locked-branch", + path="/tmp/layer", + message="permission denied", + ), + ], + ) + data = encode_response(resp) + decoded = decode_response(data) + assert isinstance(decoded, OverlayPruneResponse) + assert decoded.pruned_layer_ids == ["old-dirty"] + assert len(decoded.failures) == 1 + assert decoded.failures[0].layer_id == "locked-branch" + assert decoded.failures[0].path == "/tmp/layer" + assert decoded.failures[0].message == "permission denied" + + def test_all_request_types_round_trip() -> None: requests: list[Request] = [ HandshakeRequest(version="1.0.0"), @@ -314,6 +337,7 @@ def test_all_response_types_round_trip() -> None: result=DoctorCheckResult(name="test", ok=True, details=[], errors=[]), ), DaemonEnvResponse(env_names=["HOME"], settings_env_names=[]), + OverlayPruneResponse(pruned_layer_ids=[]), ErrorResponse(message="err"), ] for resp in responses: From d6b432cd3ce281fb19f107f9247592c21221a7de Mon Sep 17 00:00:00 2001 From: Rudimar Ronsoni Date: Fri, 5 Jun 2026 23:34:16 +0200 Subject: [PATCH 15/15] fix: harden layer cache lifecycle --- src/cocoindex_code/daemon.py | 12 +- src/cocoindex_code/layered_project.py | 5 + src/cocoindex_code/layers/layer_stack.py | 6 +- tests/test_git_layers.py | 137 +++++++++++++++++++++++ 
4 files changed, 155 insertions(+), 5 deletions(-) diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index de4bc3b..cf5012d 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -206,6 +206,7 @@ async def get_project( query_params=self.query_params, chunker_registry=chunker_registry, project_cache=self._layer_project_cache, + owns_project_cache=False, ) self._projects[cache_key] = project return self._projects[cache_key] @@ -216,12 +217,15 @@ def remove_project(self, project_root: str) -> bool: prefix = f"{Path(project_root).resolve()}\0" keys = [key for key in self._projects if key.startswith(prefix) or key == project_root] - project = None + removed_projects: list[Project | LayeredProject] = [] for key in keys: project = self._projects.pop(key, None) - if project is not None: - project.close() - del project + if project is not None: + removed_projects.append(project) + if removed_projects: + for project in removed_projects: + project.close() + del removed_projects gc.collect() return True return False diff --git a/src/cocoindex_code/layered_project.py b/src/cocoindex_code/layered_project.py index 9dce993..aa4847f 100644 --- a/src/cocoindex_code/layered_project.py +++ b/src/cocoindex_code/layered_project.py @@ -61,6 +61,7 @@ def __init__( query_params: dict[str, Any], chunker_registry: dict[str, Any], project_cache: dict[str, Project], + owns_project_cache: bool = True, ) -> None: self.project_root = project_root self.cwd = cwd @@ -72,6 +73,7 @@ def __init__( self.query_params = query_params self.chunker_registry = chunker_registry self.project_cache = project_cache + self.owns_project_cache = owns_project_cache self._stack = LayerStack( project_root=project_root, state_dir=state_dir, @@ -96,8 +98,11 @@ def indexing_stats(self) -> IndexingProgress | None: return self._indexing_stats def close(self) -> None: + if not self.owns_project_cache: + return for project in self.project_cache.values(): project.close() + 
self.project_cache.clear() async def ensure_indexing_started(self) -> None: if self._initial_index_done.is_set() or self._index_lock.locked(): diff --git a/src/cocoindex_code/layers/layer_stack.py b/src/cocoindex_code/layers/layer_stack.py index 7b56129..9e07750 100644 --- a/src/cocoindex_code/layers/layer_stack.py +++ b/src/cocoindex_code/layers/layer_stack.py @@ -358,7 +358,11 @@ async def _ensure_layer( or not paths.target_sqlite.exists() ): built = True - shutil.rmtree(paths.root, ignore_errors=True) + cached_project = self.project_cache.pop(layer_id, None) + if cached_project is not None: + cached_project.close() + if paths.root.exists(): + shutil.rmtree(paths.root) paths.source.mkdir(parents=True, exist_ok=True) paths.db_dir.mkdir(parents=True, exist_ok=True) self.store.upsert_layer( diff --git a/tests/test_git_layers.py b/tests/test_git_layers.py index 9aba900..845a6bf 100644 --- a/tests/test_git_layers.py +++ b/tests/test_git_layers.py @@ -847,3 +847,140 @@ def close(self) -> None: assert cached_project.closed is True assert "expired" not in registry._layer_project_cache # noqa: SLF001 assert not layer_root.exists() + + +@pytest.mark.asyncio +async def test_layer_rebuild_closes_cached_project_before_deleting_layer_root( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.layered_project import build_index_config_hash + from cocoindex_code.layers.layer_paths import LayerPaths + from cocoindex_code.layers.layer_stack import LayerStack + from cocoindex_code.version_control import resolve_worktree + + _install_fake_layer_runtime(monkeypatch) + repo = _init_repo(tmp_path / "repo") + state_dir = tmp_path / "state" + store = LayerStore(state_dir / "daemon.db") + config_hash = build_index_config_hash(repo, indexing_params={}, query_params={}) + worktree = resolve_worktree(repo, base_ref="main", index_config_hash=config_hash) + layer_id = "stale-layer" + paths = LayerPaths.for_layer(state_dir, worktree.repository.id, layer_id) + 
store.upsert_layer( + layer_id=layer_id, + repo_id=worktree.repository.id, + kind=LayerKind.DIRTY, + ref_name="main", + commit=worktree.branch.head_commit, + base_commit=worktree.branch.base_commit, + base_layer_id="base-layer", + source_dir=paths.source, + db_dir=paths.db_dir, + status="ready", + config_hash=config_hash, + ) + paths.root.mkdir(parents=True) + stale_file = paths.root / "stale.txt" + stale_file.write_text("stale\n") + + class _CachedProject: + closed = False + + def close(self) -> None: + self.closed = True + + cached_project = _CachedProject() + project_cache: dict[str, Any] = {layer_id: cached_project} + stack = LayerStack( + project_root=repo, + state_dir=state_dir, + store=store, + embedder=object(), + indexing_params={}, + query_params={}, + chunker_registry={}, + project_cache=project_cache, + ) + + await stack._ensure_layer( # noqa: SLF001 + worktree=worktree, + layer_id=layer_id, + kind=LayerKind.DIRTY, + ref_name="main", + commit=worktree.branch.head_commit, + base_commit=worktree.branch.base_commit, + merge_base=worktree.branch.merge_base, + base_layer_id="base-layer", + worktree_id=worktree.id, + config_hash=config_hash, + expires_at=0.0, + materialize=lambda source_dir: (source_dir / "fresh.txt").write_text("fresh\n"), + affected_paths=("fresh.txt",), + tombstoned_paths=(), + on_progress=None, + ) + + assert cached_project.closed is True + assert layer_id not in project_cache + assert not stale_file.exists() + assert (paths.source / "fresh.txt").exists() + + +def test_layered_project_close_preserves_registry_owned_project_cache(tmp_path: Path) -> None: + repo = _init_repo(tmp_path / "repo") + + class _CachedProject: + closed = False + + def close(self) -> None: + self.closed = True + + cached_project = _CachedProject() + project_cache: dict[str, Any] = {"layer": cached_project} + project = LayeredProject( + project_root=repo, + cwd=repo, + base_ref="main", + state_dir=tmp_path / "state", + store=LayerStore(tmp_path / "state" / 
"daemon.db"), + embedder=object(), + indexing_params={}, + query_params={}, + chunker_registry={}, + project_cache=project_cache, + owns_project_cache=False, + ) + + project.close() + + assert cached_project.closed is False + assert project_cache == {"layer": cached_project} + + +def test_project_registry_remove_project_closes_all_matching_project_variants( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + from cocoindex_code.daemon import ProjectRegistry + + monkeypatch.setenv("COCOINDEX_CODE_STATE_DIR", str(tmp_path / "state")) + root = tmp_path / "repo" + root.mkdir() + + class _CachedProject: + def __init__(self) -> None: + self.closed = False + + def close(self) -> None: + self.closed = True + + first = _CachedProject() + second = _CachedProject() + registry = ProjectRegistry(embedder=object()) + registry._projects[f"{root.resolve()}\0{root.resolve()}\0"] = first # noqa: SLF001 + registry._projects[f"{root.resolve()}\0{root.resolve() / 'subdir'}\0main"] = second # noqa: SLF001 + + assert registry.remove_project(str(root)) is True + + assert first.closed is True + assert second.closed is True + assert registry._projects == {} # noqa: SLF001