Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ The background daemon starts automatically on first use.
| `ccc init` | Initialize a project — creates settings files, adds `.cocoindex_code/` to `.gitignore` |
| `ccc index` | Build or update the index (auto-inits if needed). Shows streaming progress. |
| `ccc search <query>` | Semantic search across the codebase |
| `ccc grep <pattern> [path]` | Structural code search by example (no index needed) |
| `ccc status` | Show index stats (chunk count, file count, language breakdown) |
| `ccc mcp` | Run as MCP server in stdio mode |
| `ccc doctor` | Run diagnostics — checks settings, daemon, model, file matching, and index health |
Expand All @@ -200,6 +201,36 @@ ccc search --refresh database schema # update index first, then

By default, `ccc search` scopes results to your current working directory (relative to the project root). Use `--path` to override.

### Structural Search (`ccc grep`)

`ccc grep` finds code by **structure**, not text — you write a by-example pattern
and it matches the syntax tree (via cocoindex's `code_match`), so formatting,
whitespace, and intervening tokens don't matter. It runs entirely locally: no
index, daemon, or embeddings required.

```bash
ccc grep 'def \NAME(\(ARGS*\)):' # every Python function def under the cwd
ccc grep 'foo(\(ARGS*\))' src/ # calls to foo(...) anywhere under src/
ccc grep 'fn \NAME(\(A*\))' --lang rust # restrict to one language
ccc grep 'class \NAME:' --path 'tests/**' # restrict to a path glob
ccc grep 'TODO(\(A*\))' path/to/file.py # a single file
```

Metavariables use the `\` sigil: `\NAME` captures one node, `\(NAME*\)` a run of
siblings, `\_`/`\*` match anonymously. The pattern is matched per language, so a
single invocation scans every supported source file (others are skipped). Inside
an initialized project, `ccc grep` honors the project's include/exclude patterns
and `.gitignore`; otherwise it scans all supported source files under the path.

Results stream to the terminal file-by-file as each match is found (in completion
order, since files are matched in parallel) rather than all at once at the end.
Each matching file shows its matched line range; under a TTY the path is colored,
line numbers are dimmed, and the unmatched context around a match is dimmed so the
match stands out.

> **Note:** `ccc grep` relies on cocoindex's structural `code_match` feature.
> Until that feature ships in a released version of cocoindex, run `ccc grep` against a local cocoindex build.

## Docker

A Docker image is available for teams who want a reproducible, dependency-free
Expand Down
10 changes: 5 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ classifiers = [

dependencies = [
"mcp>=1.0.0",
"cocoindex[litellm]>=1.0.6,<1.1.0",
"cocoindex[litellm]>=1.0.13,<1.1.0",
"sqlite-vec>=0.1.0",
"pydantic>=2.0.0",
"numpy>=1.24.0",
Expand All @@ -39,23 +39,23 @@ dependencies = [
# `embeddings-local` is the primary feature extra: it pulls in
# `sentence-transformers` (via cocoindex) so local embeddings work without
# an API key.
embeddings-local = ["cocoindex[sentence-transformers]>=1.0.6,<1.1.0"]
embeddings-local = ["cocoindex[sentence-transformers]>=1.0.13,<1.1.0"]
# `full` is the umbrella "batteries-included" alias. Today it's just
# `embeddings-local`, but we expect to bundle more optional niceties under
# it over time — users who want everything can keep using `[full]` and pick
# up the additions automatically. The name also matches the Docker
# `:full` image variant for consistency across install paths. Contents are
# inlined rather than self-referencing `cocoindex-code[embeddings-local]`
# to avoid resolver edge cases with older pip.
full = ["cocoindex[sentence-transformers]>=1.0.6,<1.1.0"]
full = ["cocoindex[sentence-transformers]>=1.0.13,<1.1.0"]
dev = [
"pytest>=7.0.0",
"pytest-asyncio>=0.21.0",
"pytest-cov>=4.0.0",
"ruff>=0.1.0",
"mypy>=1.0.0",
"prek>=0.1.0",
"cocoindex[sentence-transformers]>=1.0.6,<1.1.0",
"cocoindex[sentence-transformers]>=1.0.13,<1.1.0",
]

[project.scripts]
Expand Down Expand Up @@ -89,7 +89,7 @@ dev = [
"mypy>=1.0.0",
"prek>=0.1.0",
"types-pyyaml>=6.0.12.20250915",
"cocoindex[sentence-transformers]>=1.0.7,<1.1.0",
"cocoindex[sentence-transformers]>=1.0.13,<1.1.0",
]

[tool.ruff]
Expand Down
75 changes: 75 additions & 0 deletions src/cocoindex_code/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
import functools
import os
import sys
import threading
from collections.abc import Callable
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar

import typer as _typer

if TYPE_CHECKING:
from .grep import FileMatches, GrepWarning
from .protocol import (
DoctorCheckResult,
IndexingProgress,
Expand Down Expand Up @@ -657,6 +659,79 @@ def search(
print_search_results(resp)


@app.command()
def grep(
    pattern: str = _typer.Argument(
        ...,
        help=r"By-example structural pattern; use \ for metavariables, "
        r"e.g. 'def \NAME(\(ARGS*\)):' or 'foo(\(ARGS*\))'.",
    ),
    path: str = _typer.Argument(
        ".", help="File or directory to search (default: current directory)."
    ),
    lang: list[str] = _typer.Option(
        [], "--lang", help="Only match files of these languages (e.g. python, rust, cpp)."
    ),
    path_glob: str | None = _typer.Option(
        None, "--path", help="Only match files whose path matches this glob (globset syntax)."
    ),
    no_color: bool = _typer.Option(False, "--no-color", help="Disable colored output."),
) -> None:
    r"""Structurally grep code by example (no index or daemon required).

    Compiles the pattern per language and matches every supported source file
    under PATH in parallel. Inside an initialized project it honors the project's
    include/exclude and .gitignore rules; otherwise it scans all supported source
    files.
    """
    # Imported lazily so the grep module is only loaded when this command runs.
    from . import grep as _grep

    search_root = Path(path)
    if not search_root.exists():
        _typer.echo(f"Error: path not found: {path}", err=True)
        raise _typer.Exit(code=1)

    # An empty --lang list means "no language restriction", which the request
    # expresses as None rather than an empty set.
    wanted_languages = frozenset(lang_name.lower() for lang_name in lang)
    request = _grep.GrepRequest(
        pattern=pattern,
        root=search_root,
        languages=wanted_languages or None,
        path_glob=path_glob,
    )

    # Color only when not disabled by flag, stdout is a terminal, and the
    # NO_COLOR environment variable is unset or empty.
    colorize = False
    if not no_color and sys.stdout.isatty():
        colorize = not os.environ.get("NO_COLOR")

    runner = _grep.Grep(request)
    files_printed = 0
    # The callback is invoked from several worker threads at once; the lock keeps
    # one file's output (and the counter update) from interleaving with another's.
    print_lock = threading.Lock()

    def _emit(item: FileMatches | GrepWarning) -> None:
        """Print one streamed result: a warning to stderr or a rendered file block."""
        nonlocal files_printed
        if isinstance(item, _grep.GrepWarning):
            with print_lock:
                _typer.echo(f"warning: {item.message}", err=True)
            return
        # Rendering happens before taking the lock so it is not serialized.
        rendered = _grep.render_file(item, color=colorize)
        with print_lock:
            if files_printed:
                _typer.echo()  # blank separator line between files
            _typer.echo(rendered)
            files_printed += 1

    runner.run(_emit)

    # Whether the pattern was unusable for every language is only known after the
    # whole run has finished, so it is reported here, just before exiting.
    if runner.unusable:
        langs = ", ".join(runner.failed_languages)
        _typer.echo(
            f"Error: the pattern did not compile for any of the languages found ({langs}).",
            err=True,
        )
        raise _typer.Exit(code=1)
    if not files_printed:
        _typer.echo("No matches found.")


@app.command()
@_catch_daemon_start_error
def status() -> None:
Expand Down
13 changes: 3 additions & 10 deletions src/cocoindex_code/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,23 +372,16 @@ async def _check_file_walk(project_root_str: str) -> DoctorCheckResult:
"""Walk project files and report counts + gitignore paths."""
from pathlib import PurePath

from cocoindex.resources.file import PatternFilePathMatcher

from .indexer import GitignoreAwareMatcher
from .settings import load_gitignore_spec, load_project_settings
from .file_walk import build_matcher
from .settings import load_project_settings

project_root = Path(project_root_str)
try:
ps = load_project_settings(project_root)
except FileNotFoundError as e:
return DoctorCheckResult(name="File Walk", ok=False, details=[], errors=[str(e)])

gitignore_spec = load_gitignore_spec(project_root)
base_matcher = PatternFilePathMatcher(
included_patterns=ps.include_patterns,
excluded_patterns=ps.exclude_patterns,
)
matcher = GitignoreAwareMatcher(base_matcher, gitignore_spec, project_root)
matcher = build_matcher(project_root, ps.include_patterns, ps.exclude_patterns)

counts_by_ext: dict[str, int] = {}
gitignore_dirs: list[str] = []
Expand Down
176 changes: 176 additions & 0 deletions src/cocoindex_code/file_walk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""Shared source-file walking: pattern + .gitignore matching, reused by the
indexer, the daemon's doctor file-walk, and ``ccc grep``.

The matcher (include/exclude globs + nested ``.gitignore`` awareness) is the
single source of truth for "which files count as part of the project". The
indexer feeds it to CocoIndex's incremental file source; the daemon and ``ccc
grep`` drive a plain :func:`os.walk` over it via :func:`iter_included_files`.
"""

from __future__ import annotations

import os
from collections.abc import Iterable, Iterator
from pathlib import Path, PurePath

from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher
from pathspec import GitIgnoreSpec

from .settings import load_gitignore_spec


def _normalize_gitignore_lines(lines: Iterable[str], directory: PurePath) -> list[str]:
"""Normalize .gitignore lines to root-relative gitignore patterns."""
if directory in (PurePath("."), PurePath("")):
prefix = ""
else:
prefix = f"{directory.as_posix().rstrip('/')}/"

normalized: list[str] = []
for raw_line in lines:
line = raw_line.rstrip("\n\r")
if not line:
continue
stripped = line.lstrip()
if not stripped or stripped.startswith("#"):
continue
if line.startswith("\\#") or line.startswith("\\!"):
line = line[1:]
negated = line.startswith("!")
if negated:
line = line[1:]
body = line.strip()
if not body:
continue
anchor = body.startswith("/")
if anchor:
body = body.lstrip("/")
pattern = f"{prefix}{body}" if prefix else body
else:
contains_slash = "/" in body
base = prefix
if contains_slash:
pattern = f"{base}{body}"
else:
if base:
pattern = f"{base}**/{body}"
else:
pattern = f"**/{body}"
if negated:
pattern = f"!{pattern}"
normalized.append(pattern)
return normalized


class GitignoreAwareMatcher(FilePathMatcher):
    """Wraps another matcher and applies .gitignore filtering.

    A path is included only if it is not ignored by the combined ``.gitignore``
    rules that apply to its directory *and* the wrapped matcher includes it.
    Specs are built lazily per directory and cached, each one extending its
    parent directory's spec with the directory's own ``.gitignore`` (if any).
    """

    def __init__(
        self,
        delegate: FilePathMatcher,
        root_spec: GitIgnoreSpec | None,
        project_root: Path,
    ) -> None:
        """Store the wrapped matcher and seed the spec cache.

        Args:
            delegate: Matcher consulted for every path that is not ignored.
            root_spec: Spec used for the root directory (``None`` for no rules).
            project_root: Directory that relative paths are resolved against
                when looking for nested ``.gitignore`` files.
        """
        self._delegate = delegate
        self._root = project_root
        # Seeding "." ends the parent recursion in `_spec_for` and means the
        # root's rules always come from `root_spec`, never from a re-read.
        self._spec_cache: dict[PurePath, GitIgnoreSpec | None] = {PurePath("."): root_spec}

    def _spec_for(self, directory: PurePath) -> GitIgnoreSpec | None:
        """Return the cached or newly built spec in effect for ``directory``.

        The result is the parent directory's spec, extended with the patterns of
        ``directory/.gitignore`` when that file exists and yields any patterns.
        """
        if directory in self._spec_cache:
            return self._spec_cache[directory]

        parent_dir = directory.parent if directory != PurePath(".") else PurePath(".")
        parent_spec = self._spec_for(parent_dir)
        spec = parent_spec

        gitignore_path = (self._root / directory) / ".gitignore"
        if gitignore_path.is_file():
            try:
                # Decode explicitly as UTF-8: the implicit default follows the
                # platform locale, which garbles non-ASCII patterns wherever
                # the locale encoding is not UTF-8.
                lines = gitignore_path.read_text(encoding="utf-8").splitlines()
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable .gitignore contributes no rules.
                lines = []
            normalized = _normalize_gitignore_lines(lines, directory)
            if normalized:
                new_spec = GitIgnoreSpec.from_lines(normalized)
                spec = new_spec if spec is None else spec + new_spec

        self._spec_cache[directory] = spec
        return spec

    def _is_ignored(self, path: PurePath, is_dir: bool) -> bool:
        """Return whether ``path`` is ignored by the spec of its directory.

        For a directory the spec looked up is the directory's own (so its own
        ``.gitignore`` is loaded), and the path is matched with a trailing ``/``.
        """
        directory = path if is_dir else path.parent
        if directory == PurePath(""):
            directory = PurePath(".")
        spec = self._spec_for(directory)
        if spec is None:
            return False
        match_path = path.as_posix()
        if is_dir and not match_path.endswith("/"):
            match_path = f"{match_path}/"
        return spec.match_file(match_path)

    def is_dir_included(self, path: PurePath) -> bool:
        """Return True if the directory is not ignored and the delegate includes it."""
        if self._is_ignored(path, True):
            return False
        return self._delegate.is_dir_included(path)

    def is_file_included(self, path: PurePath) -> bool:
        """Return True if the file is not ignored and the delegate includes it."""
        if self._is_ignored(path, False):
            return False
        return self._delegate.is_file_included(path)


def find_git_root(start: Path) -> Path | None:
"""Walk up from ``start`` to the nearest directory holding a ``.git`` entry — a
directory for a normal repo, or a *file* for a submodule or linked worktree.
Returns that directory, or ``None`` if ``start`` is not inside a git repo.

Used to anchor ``.gitignore`` resolution at the real repo root when grepping a
subdirectory that isn't inside an initialized cocoindex project."""
current = start.resolve()
while True:
if (current / ".git").exists():
return current
if current.parent == current:
return None
current = current.parent


def build_matcher(
    project_root: Path,
    included_patterns: list[str],
    excluded_patterns: list[str],
) -> FilePathMatcher:
    """Create the file matcher used to decide which project files count.

    Combines an include/exclude glob matcher with ``.gitignore`` filtering: the
    root spec is loaded from ``project_root``, and that same directory anchors
    the lookup of nested ``.gitignore`` files.
    """
    glob_matcher = PatternFilePathMatcher(
        included_patterns=included_patterns,
        excluded_patterns=excluded_patterns,
    )
    root_gitignore = load_gitignore_spec(project_root)
    return GitignoreAwareMatcher(glob_matcher, root_gitignore, project_root)


def iter_included_files(
    start: Path,
    base: Path,
    matcher: FilePathMatcher,
) -> Iterator[tuple[Path, PurePath]]:
    """Yield every file under ``start`` that ``matcher`` accepts.

    Each item is ``(absolute_path, path_relative_to_base)``. ``base`` is the
    directory the matcher's patterns are relative to (the project root);
    ``start`` is where the walk begins and may be ``base`` itself or a directory
    below it. Both must be absolute.

    Directories the matcher rejects are pruned, so nothing beneath them is
    visited. The ``base`` directory itself is never tested against the matcher.
    Output order is deterministic: within each directory files are yielded in
    sorted order, and subdirectories are then descended into in sorted order.
    """
    top = PurePath(".")
    for current_str, subdirs, files in os.walk(start):
        current = Path(current_str)
        relative_dir = PurePath(current.relative_to(base))
        below_base = relative_dir != top
        if below_base and not matcher.is_dir_included(relative_dir):
            # Emptying the list in place stops os.walk from descending further.
            del subdirs[:]
            continue
        # In-place sort: os.walk descends in the order this list ends up in.
        subdirs.sort()
        for file_name in sorted(files):
            # Joining onto "." yields the bare file name, so no special case.
            relative_file = relative_dir / file_name
            if matcher.is_file_included(relative_file):
                yield current / file_name, relative_file
Loading
Loading