diff --git a/README.md b/README.md index f2ab074..b7d8b38 100644 --- a/README.md +++ b/README.md @@ -180,6 +180,7 @@ The background daemon starts automatically on first use. | `ccc init` | Initialize a project — creates settings files, adds `.cocoindex_code/` to `.gitignore` | | `ccc index` | Build or update the index (auto-inits if needed). Shows streaming progress. | | `ccc search ` | Semantic search across the codebase | +| `ccc grep [path]` | Structural code search by example (no index needed) | | `ccc status` | Show index stats (chunk count, file count, language breakdown) | | `ccc mcp` | Run as MCP server in stdio mode | | `ccc doctor` | Run diagnostics — checks settings, daemon, model, file matching, and index health | @@ -200,6 +201,36 @@ ccc search --refresh database schema # update index first, then By default, `ccc search` scopes results to your current working directory (relative to the project root). Use `--path` to override. +### Structural Search (`ccc grep`) + +`ccc grep` finds code by **structure**, not text — you write a by-example pattern +and it matches the syntax tree (via cocoindex's `code_match`), so formatting, +whitespace, and intervening tokens don't matter. It runs entirely locally: no +index, daemon, or embeddings required. + +```bash +ccc grep 'def \NAME(\(ARGS*\)):' # every Python function def under the cwd +ccc grep 'foo(\(ARGS*\))' src/ # calls to foo(...) anywhere under src/ +ccc grep 'fn \NAME(\(A*\))' --lang rust # restrict to one language +ccc grep 'class \NAME:' --path 'tests/**' # restrict to a path glob +ccc grep 'TODO(\(A*\))' path/to/file.py # a single file +``` + +Metavariables use the `\` sigil: `\NAME` captures one node, `\(NAME*\)` a run of +siblings, `\_`/`\*` match anonymously. The pattern is matched per language, so a +single invocation scans every supported source file (others are skipped). 
Inside +an initialized project, `ccc grep` honors the project's include/exclude patterns +and `.gitignore`; otherwise it scans all supported source files under the path. + +Results stream to the terminal file-by-file as each match is found (in completion +order, since files are matched in parallel) rather than all at once at the end. +Each matching file shows its matched line range; under a TTY the path is colored, +line numbers are dimmed, and the unmatched context around a match is dimmed so the +match stands out. + +> **Note:** `ccc grep` relies on cocoindex's structural `code_match` feature. +> Until it ships in a released cocoindex, run against a local cocoindex build. + ## Docker A Docker image is available for teams who want a reproducible, dependency-free diff --git a/pyproject.toml b/pyproject.toml index bd702a8..b8858bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ dependencies = [ "mcp>=1.0.0", - "cocoindex[litellm]>=1.0.6,<1.1.0", + "cocoindex[litellm]>=1.0.13,<1.1.0", "sqlite-vec>=0.1.0", "pydantic>=2.0.0", "numpy>=1.24.0", @@ -39,7 +39,7 @@ dependencies = [ # `embeddings-local` is the primary feature extra: it pulls in # `sentence-transformers` (via cocoindex) so local embeddings work without # an API key. -embeddings-local = ["cocoindex[sentence-transformers]>=1.0.6,<1.1.0"] +embeddings-local = ["cocoindex[sentence-transformers]>=1.0.13,<1.1.0"] # `full` is the umbrella "batteries-included" alias. Today it's just # `embeddings-local`, but we expect to bundle more optional niceties under # it over time — users who want everything can keep using `[full]` and pick @@ -47,7 +47,7 @@ embeddings-local = ["cocoindex[sentence-transformers]>=1.0.6,<1.1.0"] # `:full` image variant for consistency across install paths. Contents are # inlined rather than self-referencing `cocoindex-code[embeddings-local]` # to avoid resolver edge cases with older pip. 
-full = ["cocoindex[sentence-transformers]>=1.0.6,<1.1.0"] +full = ["cocoindex[sentence-transformers]>=1.0.13,<1.1.0"] dev = [ "pytest>=7.0.0", "pytest-asyncio>=0.21.0", @@ -55,7 +55,7 @@ dev = [ "ruff>=0.1.0", "mypy>=1.0.0", "prek>=0.1.0", - "cocoindex[sentence-transformers]>=1.0.6,<1.1.0", + "cocoindex[sentence-transformers]>=1.0.13,<1.1.0", ] [project.scripts] @@ -89,7 +89,7 @@ dev = [ "mypy>=1.0.0", "prek>=0.1.0", "types-pyyaml>=6.0.12.20250915", - "cocoindex[sentence-transformers]>=1.0.7,<1.1.0", + "cocoindex[sentence-transformers]>=1.0.13,<1.1.0", ] [tool.ruff] diff --git a/src/cocoindex_code/cli.py b/src/cocoindex_code/cli.py index 90b3297..093b72e 100644 --- a/src/cocoindex_code/cli.py +++ b/src/cocoindex_code/cli.py @@ -5,6 +5,7 @@ import functools import os import sys +import threading from collections.abc import Callable from pathlib import Path from typing import TYPE_CHECKING, TypeVar @@ -12,6 +13,7 @@ import typer as _typer if TYPE_CHECKING: + from .grep import FileMatches, GrepWarning from .protocol import ( DoctorCheckResult, IndexingProgress, @@ -657,6 +659,79 @@ def search( print_search_results(resp) +@app.command() +def grep( + pattern: str = _typer.Argument( + ..., + help=r"By-example structural pattern; use \ for metavariables, " + r"e.g. 'def \NAME(\(ARGS*\)):' or 'foo(\(ARGS*\))'.", + ), + path: str = _typer.Argument( + ".", help="File or directory to search (default: current directory)." + ), + lang: list[str] = _typer.Option( + [], "--lang", help="Only match files of these languages (e.g. python, rust, cpp)." + ), + path_glob: str | None = _typer.Option( + None, "--path", help="Only match files whose path matches this glob (globset syntax)." + ), + no_color: bool = _typer.Option(False, "--no-color", help="Disable colored output."), +) -> None: + r"""Structurally grep code by example (no index or daemon required). + + Compiles the pattern per language and matches every supported source file + under PATH in parallel. 
Inside an initialized project it honors the project's + include/exclude and .gitignore rules; otherwise it scans all supported source + files. + """ + from . import grep as _grep + + target = Path(path) + if not target.exists(): + _typer.echo(f"Error: path not found: {path}", err=True) + raise _typer.Exit(code=1) + + req = _grep.GrepRequest( + pattern=pattern, + root=target, + languages=frozenset(lang_name.lower() for lang_name in lang) or None, + path_glob=path_glob, + ) + use_color = not no_color and sys.stdout.isatty() and not os.environ.get("NO_COLOR") + grep_run = _grep.Grep(req) + matched = 0 + # `run` calls `_emit` from several worker threads at once; the lock keeps one + # file's output (and the `matched` bookkeeping) from interleaving with another's. + output_lock = threading.Lock() + + def _emit(item: FileMatches | GrepWarning) -> None: + nonlocal matched + if isinstance(item, _grep.GrepWarning): + with output_lock: + _typer.echo(f"warning: {item.message}", err=True) + return + block = _grep.render_file(item, color=use_color) # render outside the lock + with output_lock: + if matched: + _typer.echo() # blank line between files + _typer.echo(block) + matched += 1 + + grep_run.run(_emit) + + # The "unusable everywhere" verdict needs the whole walk, so it's known only + # once the run is done — report it before exiting. 
+ if grep_run.unusable: + langs = ", ".join(grep_run.failed_languages) + _typer.echo( + f"Error: the pattern did not compile for any of the languages found ({langs}).", + err=True, + ) + raise _typer.Exit(code=1) + if matched == 0: + _typer.echo("No matches found.") + + @app.command() @_catch_daemon_start_error def status() -> None: diff --git a/src/cocoindex_code/daemon.py b/src/cocoindex_code/daemon.py index 35b982a..8d4fe8d 100644 --- a/src/cocoindex_code/daemon.py +++ b/src/cocoindex_code/daemon.py @@ -372,10 +372,8 @@ async def _check_file_walk(project_root_str: str) -> DoctorCheckResult: """Walk project files and report counts + gitignore paths.""" from pathlib import PurePath - from cocoindex.resources.file import PatternFilePathMatcher - - from .indexer import GitignoreAwareMatcher - from .settings import load_gitignore_spec, load_project_settings + from .file_walk import build_matcher + from .settings import load_project_settings project_root = Path(project_root_str) try: @@ -383,12 +381,7 @@ async def _check_file_walk(project_root_str: str) -> DoctorCheckResult: except FileNotFoundError as e: return DoctorCheckResult(name="File Walk", ok=False, details=[], errors=[str(e)]) - gitignore_spec = load_gitignore_spec(project_root) - base_matcher = PatternFilePathMatcher( - included_patterns=ps.include_patterns, - excluded_patterns=ps.exclude_patterns, - ) - matcher = GitignoreAwareMatcher(base_matcher, gitignore_spec, project_root) + matcher = build_matcher(project_root, ps.include_patterns, ps.exclude_patterns) counts_by_ext: dict[str, int] = {} gitignore_dirs: list[str] = [] diff --git a/src/cocoindex_code/file_walk.py b/src/cocoindex_code/file_walk.py new file mode 100644 index 0000000..62ebc3c --- /dev/null +++ b/src/cocoindex_code/file_walk.py @@ -0,0 +1,176 @@ +"""Shared source-file walking: pattern + .gitignore matching, reused by the +indexer, the daemon's doctor file-walk, and ``ccc grep``. 
+ +The matcher (include/exclude globs + nested ``.gitignore`` awareness) is the +single source of truth for "which files count as part of the project". The +indexer feeds it to CocoIndex's incremental file source; the daemon and ``ccc +grep`` drive a plain :func:`os.walk` over it via :func:`iter_included_files`. +""" + +from __future__ import annotations + +import os +from collections.abc import Iterable, Iterator +from pathlib import Path, PurePath + +from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher +from pathspec import GitIgnoreSpec + +from .settings import load_gitignore_spec + + +def _normalize_gitignore_lines(lines: Iterable[str], directory: PurePath) -> list[str]: + """Normalize .gitignore lines to root-relative gitignore patterns.""" + if directory in (PurePath("."), PurePath("")): + prefix = "" + else: + prefix = f"{directory.as_posix().rstrip('/')}/" + + normalized: list[str] = [] + for raw_line in lines: + line = raw_line.rstrip("\n\r") + if not line: + continue + stripped = line.lstrip() + if not stripped or stripped.startswith("#"): + continue + if line.startswith("\\#") or line.startswith("\\!"): + line = line[1:] + negated = line.startswith("!") + if negated: + line = line[1:] + body = line.strip() + if not body: + continue + anchor = body.startswith("/") + if anchor: + body = body.lstrip("/") + pattern = f"{prefix}{body}" if prefix else body + else: + contains_slash = "/" in body + base = prefix + if contains_slash: + pattern = f"{base}{body}" + else: + if base: + pattern = f"{base}**/{body}" + else: + pattern = f"**/{body}" + if negated: + pattern = f"!{pattern}" + normalized.append(pattern) + return normalized + + +class GitignoreAwareMatcher(FilePathMatcher): + """Wraps another matcher and applies .gitignore filtering.""" + + def __init__( + self, + delegate: FilePathMatcher, + root_spec: GitIgnoreSpec | None, + project_root: Path, + ) -> None: + self._delegate = delegate + self._root = project_root + self._spec_cache: 
dict[PurePath, GitIgnoreSpec | None] = {PurePath("."): root_spec} + + def _spec_for(self, directory: PurePath) -> GitIgnoreSpec | None: + if directory in self._spec_cache: + return self._spec_cache[directory] + + parent_dir = directory.parent if directory != PurePath(".") else PurePath(".") + parent_spec = self._spec_for(parent_dir) + spec = parent_spec + + gitignore_path = (self._root / directory) / ".gitignore" + if gitignore_path.is_file(): + try: + lines = gitignore_path.read_text().splitlines() + except (OSError, UnicodeDecodeError): + lines = [] + normalized = _normalize_gitignore_lines(lines, directory) + if normalized: + new_spec = GitIgnoreSpec.from_lines(normalized) + spec = new_spec if spec is None else spec + new_spec + + self._spec_cache[directory] = spec + return spec + + def _is_ignored(self, path: PurePath, is_dir: bool) -> bool: + directory = path if is_dir else path.parent + if directory == PurePath(""): + directory = PurePath(".") + spec = self._spec_for(directory) + if spec is None: + return False + match_path = path.as_posix() + if is_dir and not match_path.endswith("/"): + match_path = f"{match_path}/" + return spec.match_file(match_path) + + def is_dir_included(self, path: PurePath) -> bool: + if self._is_ignored(path, True): + return False + return self._delegate.is_dir_included(path) + + def is_file_included(self, path: PurePath) -> bool: + if self._is_ignored(path, False): + return False + return self._delegate.is_file_included(path) + + +def find_git_root(start: Path) -> Path | None: + """Walk up from ``start`` to the nearest directory holding a ``.git`` entry — a + directory for a normal repo, or a *file* for a submodule or linked worktree. + Returns that directory, or ``None`` if ``start`` is not inside a git repo. 
+ + Used to anchor ``.gitignore`` resolution at the real repo root when grepping a + subdirectory that isn't inside an initialized cocoindex project.""" + current = start.resolve() + while True: + if (current / ".git").exists(): + return current + if current.parent == current: + return None + current = current.parent + + +def build_matcher( + project_root: Path, + included_patterns: list[str], + excluded_patterns: list[str], +) -> FilePathMatcher: + """Build the project's file matcher: include/exclude globs plus nested + ``.gitignore`` awareness anchored at ``project_root``.""" + base_matcher = PatternFilePathMatcher( + included_patterns=included_patterns, + excluded_patterns=excluded_patterns, + ) + return GitignoreAwareMatcher(base_matcher, load_gitignore_spec(project_root), project_root) + + +def iter_included_files( + start: Path, + base: Path, + matcher: FilePathMatcher, +) -> Iterator[tuple[Path, PurePath]]: + """Walk ``start`` recursively, yielding ``(absolute_path, path_relative_to_base)`` + for every file ``matcher`` includes, pruning excluded directories. + + ``base`` anchors the relative paths the matcher sees (the project root, so + its patterns line up); ``start`` is where traversal begins and may be a + subdirectory of ``base``. Both must be absolute. Traversal is deterministic + (directories and files are visited in sorted order). 
+ """ + for dirpath_str, dirnames, filenames in os.walk(start): + dirpath = Path(dirpath_str) + rel_dir = PurePath(dirpath.relative_to(base)) + if rel_dir != PurePath(".") and not matcher.is_dir_included(rel_dir): + dirnames.clear() + continue + dirnames.sort() + for fname in sorted(filenames): + rel_path = rel_dir / fname if rel_dir != PurePath(".") else PurePath(fname) + if matcher.is_file_included(rel_path): + yield dirpath / fname, rel_path diff --git a/src/cocoindex_code/grep.py b/src/cocoindex_code/grep.py new file mode 100644 index 0000000..d769410 --- /dev/null +++ b/src/cocoindex_code/grep.py @@ -0,0 +1,411 @@ +r"""``ccc grep`` — by-example structural code search over files. + +Unlike ``ccc search`` (semantic, needs the index + daemon + embeddings), ``grep`` +runs entirely locally: it compiles a structural pattern (cocoindex ``code_match``) +once per language, walks the matching source files, and matches them in parallel. +No index or daemon is required. + +Patterns use the ``\`` sigil for metavariables, e.g. ``def \NAME(\(ARGS*\)):`` or +``foo(\(ARGS*\))``. See the cocoindex code_match design for the full syntax. +""" + +from __future__ import annotations + +import functools +from collections.abc import Callable, Iterator +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from pathlib import Path + +import click +from cocoindex.ops.code import CodeMatch, CodePattern +from cocoindex.ops.text import detect_code_language +from cocoindex.resources.file import PatternFilePathMatcher + +from .file_walk import build_matcher, find_git_root, iter_included_files +from .settings import ( + DEFAULT_EXCLUDED_PATTERNS, + DEFAULT_INCLUDED_PATTERNS, + find_project_root, + load_project_settings, +) + +# A trivial, always-valid pattern (a bare identifier) used to probe whether a +# language is structurally matchable, independent of the user's pattern. 
+_PROBE_PATTERN = "x" + + +@dataclass(frozen=True, slots=True) +class GrepWarning: + """A non-fatal problem surfaced during a grep — a file that couldn't be read, + or a supported language the pattern failed to compile for. grep keeps going; + the CLI prints these to stderr.""" + + message: str + + language: str | None = None + """Set when this is a per-language pattern-compile failure (so :class:`Grep` can + tell whether the pattern is unusable for *every* language it tried).""" + + +@dataclass(frozen=True, slots=True) +class FileMatches: + """Every match found in one file.""" + + path: str + """Display path, mirroring the root passed to grep (e.g. ``src/a.py`` or + ``/tmp/x/a.py``).""" + + source: str + """Full file content — kept so the renderer can show context around each match.""" + + matches: list[CodeMatch] + """Matches in source order (at least one).""" + + +@dataclass(frozen=True, slots=True) +class GrepRequest: + """A grep invocation.""" + + pattern: str + root: Path + """File or directory to search.""" + + languages: frozenset[str] | None = None + """Restrict to these languages (lowercased canonical names); ``None`` = all.""" + + path_glob: str | None = None + """Extra include glob (globset syntax) on the project-relative path; ``None`` = all.""" + + +@dataclass(frozen=True, slots=True) +class _Target: + path: Path + display: str + pattern: CodePattern + + +# --------------------------------------------------------------------------- +# Per-language pattern compilation +# --------------------------------------------------------------------------- + + +@functools.cache +def _is_match_supported(language: str) -> bool: + """Whether code_match can structurally match ``language``. + + Probes with a trivial always-valid pattern so the answer doesn't depend on + the user's pattern. Cached across the process. 
+ """ + try: + CodePattern(_PROBE_PATTERN, language=language) + return True + except ValueError: + return False + + +class _PatternCompiler: + r"""Compiles one pattern per language on demand, caching the result. + + code_match patterns are language-bound (compiled against a grammar's token + table), so we keep one compiled :class:`CodePattern` per language and reuse it + across every file of that language. A language maps to ``None`` (its files are + skipped) when either code_match can't match it at all, or the pattern won't + compile for it — the latter records a :class:`GrepWarning` once for that + language, so the user learns why those files were skipped instead of the whole + run aborting. + """ + + def __init__(self, pattern: str) -> None: + self._pattern = pattern + self._cache: dict[str, CodePattern | None] = {} + self.warnings: list[GrepWarning] = [] + + def for_language(self, language: str) -> CodePattern | None: + if language not in self._cache: + self._cache[language] = self._compile(language) + return self._cache[language] + + def _compile(self, language: str) -> CodePattern | None: + try: + return CodePattern(self._pattern, language=language) + except ValueError as e: + # A *supported* language that won't compile the pattern is a real + # problem to surface (once per language); an *unsupported* language is + # an expected silent skip. `_is_match_supported` tells them apart, + # independent of the user's pattern. 
+ if _is_match_supported(language): + self.warnings.append( + GrepWarning(f"pattern invalid for {language}: {e}", language=language) + ) + return None + + +# --------------------------------------------------------------------------- +# Target collection (which files to match, with which compiled pattern) +# --------------------------------------------------------------------------- + + +def _detect_language(path: Path, ext_overrides: dict[str, str]) -> str | None: + """Language for ``path``: project extension override first, then auto-detect.""" + return ext_overrides.get(path.suffix) or detect_code_language(filename=path.name) + + +def _ext_overrides(project_root: Path | None) -> dict[str, str]: + if project_root is None: + return {} + ps = load_project_settings(project_root) + return {f".{lo.ext}": lo.lang for lo in ps.language_overrides} + + +def _target_for_file( + abs_path: Path, + display: str, + ext_overrides: dict[str, str], + req: GrepRequest, + compiler: _PatternCompiler, +) -> _Target | None: + language = _detect_language(abs_path, ext_overrides) + if language is None: + return None + if req.languages is not None and language.lower() not in req.languages: + return None + cp = compiler.for_language(language) + if cp is None: + return None + return _Target(path=abs_path, display=display, pattern=cp) + + +def _resolve_file( + abs_path: Path, + display: str, + ext_overrides: dict[str, str], + req: GrepRequest, + compiler: _PatternCompiler, +) -> Iterator[_Target | GrepWarning]: + """Yield any compile warning newly raised for this file's language, then the + file's target if the pattern compiled for that language.""" + before = len(compiler.warnings) + target = _target_for_file(abs_path, display, ext_overrides, req, compiler) + yield from compiler.warnings[before:] + if target is not None: + yield target + + +def _iter_targets(req: GrepRequest, compiler: _PatternCompiler) -> Iterator[_Target | GrepWarning]: + """Lazily resolve a request into the files to 
match, yielded as they're + discovered (so matching can begin before the walk finishes), interleaved with + each new per-language compile warning. + + Single source of truth with the indexer: the same include/exclude patterns and + ``.gitignore`` rules decide which files belong to the project. Outside a project + we fall back to the default source-file patterns. + """ + root = req.root.resolve() + + if root.is_file(): + project_root = find_project_root(root.parent) + yield from _resolve_file( + root, req.root.as_posix(), _ext_overrides(project_root), req, compiler + ) + return + + project_root = find_project_root(root) + if project_root is not None: + ps = load_project_settings(project_root) + included, excluded = ps.include_patterns, ps.exclude_patterns + ext_overrides = {f".{lo.ext}": lo.lang for lo in ps.language_overrides} + base = project_root + else: + included = list(DEFAULT_INCLUDED_PATTERNS) + excluded = list(DEFAULT_EXCLUDED_PATTERNS) + ext_overrides = {} + # Anchor at the enclosing git repo so grepping a subdirectory still honors the + # repo-root (and intervening) .gitignore; fall back to the target dir itself. + base = find_git_root(root) or root + + matcher = build_matcher(base, included, excluded) + path_filter = ( + PatternFilePathMatcher(included_patterns=[req.path_glob]) if req.path_glob else None + ) + + for abs_path, rel in iter_included_files(root, base, matcher): + if path_filter is not None and not path_filter.is_file_included(rel): + continue + # Display paths mirror the root the user gave (e.g. "src/a.py", "/tmp/x/a.py"), + # rather than always being cwd-relative. 
+ display = (req.root / abs_path.relative_to(root)).as_posix() + yield from _resolve_file(abs_path, display, ext_overrides, req, compiler) + + +# --------------------------------------------------------------------------- +# Matching +# --------------------------------------------------------------------------- + + +def _match_file(target: _Target) -> FileMatches | GrepWarning | None: + """Match one file via code_match's ``match_file``: it reads the file and runs + the prefilter in Rust — skipping the parse (and the Python-side read) for files + that can't match — all with the GIL released, so a worker-thread pool scans many + files truly in parallel. + + Returns ``None`` for a file that's binary, prefilter-rejected, or has no matches + (a silent skip, like ``grep -I``); a :class:`GrepWarning` for an unreadable file. + """ + try: + fm = target.pattern.match_file(str(target.path)) + except OSError as e: + return GrepWarning(f"cannot read {target.display}: {e}") + if fm is None: + return None # binary, prefilter-rejected, or no matches + matches = sorted(fm.matches, key=lambda m: m.chunks[0].start.byte_offset if m.chunks else 0) + return FileMatches(path=target.display, source=fm.ast.source, matches=matches) + + +class Grep: + """A single grep run. :meth:`run` matches each file as it's listed and streams + results as they complete. After ``run`` is exhausted, :attr:`unusable` / + :attr:`failed_languages` report the compile verdict.""" + + def __init__(self, req: GrepRequest) -> None: + self._req = req + self._compiler = _PatternCompiler(req.pattern) + self._target_count = 0 + + def run(self, emit: Callable[[FileMatches | GrepWarning], object]) -> None: + """Match the request, calling ``emit`` with each file's matches and each + compile warning the moment it's ready — *while the walk is still running*. + + Fully synchronous, no event loop: the walk runs on the calling thread and + submits each file's match to a thread pool. 
``match_file`` reads + prefilters + + parses + matches in Rust with the GIL released, so the pool threads run + truly in parallel — with the walk and with each other — and hand each result + straight to ``emit`` as it finishes. ``run`` returns once the walk and every + match are done; afterwards :attr:`unusable` / :attr:`failed_languages` hold the + compile verdict. + + ``emit`` is called concurrently from the pool's worker threads (plus this + thread, for warnings), so a consumer that does I/O must serialize it itself. + """ + + def _match(item: _Target) -> None: + result = _match_file(item) + if result is not None: # None = binary / prefiltered / no match (skip) + emit(result) + + with ThreadPoolExecutor() as pool: + for item in _iter_targets(self._req, self._compiler): + if isinstance(item, GrepWarning): + emit(item) + else: + self._target_count += 1 + pool.submit(_match, item) + # ThreadPoolExecutor.__exit__ waits for every submitted match to finish. + + @property + def failed_languages(self) -> list[str]: + """Supported languages the pattern would not compile for (valid once + :meth:`run` is exhausted).""" + return [w.language for w in self._compiler.warnings if w.language is not None] + + @property + def unusable(self) -> bool: + """The pattern compiled for *none* of the languages actually encountered — a + supported language was found but every one rejected the pattern, so there was + nothing to match. A target exists only for a file whose language compiled, so + zero targets plus ≥1 failed language means unusable everywhere it was tried — + distinct from "no matchable files found" (no failures). 
Valid once + :meth:`run` is exhausted.""" + return bool(self.failed_languages) and self._target_count == 0 + + +# --------------------------------------------------------------------------- +# Rendering +# --------------------------------------------------------------------------- + + +def _line_char_offsets(source: str) -> list[int]: + """Char offset of the start of each line (index ``line - 1``).""" + offsets = [0] + for i, ch in enumerate(source): + if ch == "\n": + offsets.append(i + 1) + return offsets + + +def _paint(text: str, color: bool, **style: object) -> str: + if not color or not text: + return text + return click.style(text, **style) # type: ignore[arg-type] + + +def _render_code_line( + line_no: int, + text: str, + dim_pre_end: int, + dim_post_start: int, + width: int, + color: bool, +) -> str: + """One code line: dimmed ``line_no| `` gutter, then the text with the leading + (before ``dim_pre_end``) and trailing (from ``dim_post_start``) context dimmed + and the matched span shown normally.""" + pre = max(0, min(dim_pre_end, len(text))) + post = max(pre, min(dim_post_start, len(text))) + gutter = _paint(f"{line_no:>{width}}| ", color, fg="bright_black") + if not color: + return f"{gutter}{text}" + before = _paint(text[:pre], color, dim=True) + matched = text[pre:post] + after = _paint(text[post:], color, dim=True) + return f"{gutter}{before}{matched}{after}" + + +def _render_match( + src_lines: list[str], + offsets: list[int], + match: CodeMatch, + width: int, + color: bool, +) -> list[str]: + chunk = match.chunks[0] + s_off, e_off = chunk.start.char_offset, chunk.end.char_offset + s_line, e_line = chunk.start.line, chunk.end.line + out: list[str] = [] + for line_no in range(s_line, e_line + 1): + idx = line_no - 1 + text = src_lines[idx] if 0 <= idx < len(src_lines) else "" + line_start = offsets[idx] if 0 <= idx < len(offsets) else 0 + dim_pre_end = (s_off - line_start) if line_no == s_line else 0 + dim_post_start = (e_off - line_start) if line_no 
== e_line else len(text) + out.append(_render_code_line(line_no, text, dim_pre_end, dim_post_start, width, color)) + return out + + +def render_file(fm: FileMatches, *, color: bool) -> str: + """Render one file's matches: the path, then each match's line range, with + matches separated by a ``---`` line.""" + # Split on "\n" to keep line numbers aligned with the offsets below (which + # count "\n"), then drop the trailing "\r" that CRLF files leave on each line. + src_lines = [line.rstrip("\r") for line in fm.source.split("\n")] + offsets = _line_char_offsets(fm.source) + max_line = max((m.chunks[0].end.line for m in fm.matches if m.chunks), default=1) + width = len(str(max_line)) + + parts = [_paint(fm.path, color, fg="magenta", bold=True)] + emitted = False + for match in fm.matches: + if not match.chunks: + continue + if emitted: + parts.append(_paint("---", color, fg="bright_black")) + parts.extend(_render_match(src_lines, offsets, match, width, color)) + emitted = True + return "\n".join(parts) + + +def render_results(results: list[FileMatches], *, color: bool) -> str: + """Render a list of per-file matches in the ``ccc grep`` output format, files + separated by a blank line. 
The CLI streams with :func:`render_file` instead; + this is the batch form (used in tests).""" + return "\n\n".join(render_file(fm, color=color) for fm in results) diff --git a/src/cocoindex_code/indexer.py b/src/cocoindex_code/indexer.py index e028103..d81c08e 100644 --- a/src/cocoindex_code/indexer.py +++ b/src/cocoindex_code/indexer.py @@ -2,20 +2,18 @@ from __future__ import annotations -from collections.abc import Iterable -from pathlib import Path, PurePath +from pathlib import Path import cocoindex as coco from cocoindex.connectors import localfs, sqlite from cocoindex.connectors.sqlite import Vec0TableDef from cocoindex.ops.text import RecursiveSplitter, detect_code_language from cocoindex.resources.chunk import Chunk -from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher from cocoindex.resources.id import IdGenerator -from pathspec import GitIgnoreSpec from .chunking import CHUNKER_REGISTRY -from .settings import load_gitignore_spec, load_project_settings +from .file_walk import build_matcher +from .settings import load_project_settings from .shared import ( CODEBASE_DIR, EMBEDDER, @@ -33,107 +31,6 @@ splitter = RecursiveSplitter() -def _normalize_gitignore_lines(lines: Iterable[str], directory: PurePath) -> list[str]: - """Normalize .gitignore lines to root-relative gitignore patterns.""" - if directory in (PurePath("."), PurePath("")): - prefix = "" - else: - prefix = f"{directory.as_posix().rstrip('/')}/" - - normalized: list[str] = [] - for raw_line in lines: - line = raw_line.rstrip("\n\r") - if not line: - continue - stripped = line.lstrip() - if not stripped or stripped.startswith("#"): - continue - if line.startswith("\\#") or line.startswith("\\!"): - line = line[1:] - negated = line.startswith("!") - if negated: - line = line[1:] - body = line.strip() - if not body: - continue - anchor = body.startswith("/") - if anchor: - body = body.lstrip("/") - pattern = f"{prefix}{body}" if prefix else body - else: - contains_slash = 
"/" in body - base = prefix - if contains_slash: - pattern = f"{base}{body}" - else: - if base: - pattern = f"{base}**/{body}" - else: - pattern = f"**/{body}" - if negated: - pattern = f"!{pattern}" - normalized.append(pattern) - return normalized - - -class GitignoreAwareMatcher(FilePathMatcher): - """Wraps another matcher and applies .gitignore filtering.""" - - def __init__( - self, - delegate: FilePathMatcher, - root_spec: GitIgnoreSpec | None, - project_root: Path, - ) -> None: - self._delegate = delegate - self._root = project_root - self._spec_cache: dict[PurePath, GitIgnoreSpec | None] = {PurePath("."): root_spec} - - def _spec_for(self, directory: PurePath) -> GitIgnoreSpec | None: - if directory in self._spec_cache: - return self._spec_cache[directory] - - parent_dir = directory.parent if directory != PurePath(".") else PurePath(".") - parent_spec = self._spec_for(parent_dir) - spec = parent_spec - - gitignore_path = (self._root / directory) / ".gitignore" - if gitignore_path.is_file(): - try: - lines = gitignore_path.read_text().splitlines() - except (OSError, UnicodeDecodeError): - lines = [] - normalized = _normalize_gitignore_lines(lines, directory) - if normalized: - new_spec = GitIgnoreSpec.from_lines(normalized) - spec = new_spec if spec is None else spec + new_spec - - self._spec_cache[directory] = spec - return spec - - def _is_ignored(self, path: PurePath, is_dir: bool) -> bool: - directory = path if is_dir else path.parent - if directory == PurePath(""): - directory = PurePath(".") - spec = self._spec_for(directory) - if spec is None: - return False - match_path = path.as_posix() - if is_dir and not match_path.endswith("/"): - match_path = f"{match_path}/" - return spec.match_file(match_path) - - def is_dir_included(self, path: PurePath) -> bool: - if self._is_ignored(path, True): - return False - return self._delegate.is_dir_included(path) - - def is_file_included(self, path: PurePath) -> bool: - if self._is_ignored(path, False): - return 
False - return self._delegate.is_file_included(path) - - @coco.fn(memo=True) async def process_file( file: localfs.File, @@ -199,7 +96,6 @@ async def indexer_main() -> None: """Main indexing function - walks files and processes each.""" project_root = coco.use_context(CODEBASE_DIR) ps = load_project_settings(project_root) - gitignore_spec = load_gitignore_spec(project_root) table = await sqlite.mount_table_target( db=SQLITE_DB, @@ -214,11 +110,7 @@ async def indexer_main() -> None: ), ) - base_matcher = PatternFilePathMatcher( - included_patterns=ps.include_patterns, - excluded_patterns=ps.exclude_patterns, - ) - matcher: FilePathMatcher = GitignoreAwareMatcher(base_matcher, gitignore_spec, project_root) + matcher = build_matcher(project_root, ps.include_patterns, ps.exclude_patterns) files = localfs.walk_dir( CODEBASE_DIR, diff --git a/tests/test_grep.py b/tests/test_grep.py new file mode 100644 index 0000000..bb7e686 --- /dev/null +++ b/tests/test_grep.py @@ -0,0 +1,394 @@ +"""Tests for `ccc grep` — structural code search. + +These run entirely locally (no daemon, no index, no embeddings): the engine +compiles a code_match pattern per language and matches files on disk. 
+""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from typer.testing import CliRunner + +from cocoindex_code import grep as g +from cocoindex_code.cli import app + +runner = CliRunner() + + +def run_grep_obj(req: g.GrepRequest) -> tuple[list[g.FileMatches | g.GrepWarning], g.Grep]: + """Collect a Grep run's emitted items (results + warnings, completion order) and + return them alongside the finished Grep, so tests can inspect the verdict.""" + grep_run = g.Grep(req) + items: list[g.FileMatches | g.GrepWarning] = [] + grep_run.run(items.append) + return items, grep_run + + +def collect_grep(req: g.GrepRequest) -> list[g.FileMatches | g.GrepWarning]: + """Drain a Grep run into a list (compile warnings + match results + read + warnings), completion order.""" + return run_grep_obj(req)[0] + + +def run_grep(req: g.GrepRequest) -> list[g.FileMatches]: + """Just the file matches (dropping warnings), sorted by path for deterministic + assertions (the engine itself yields in completion order).""" + files = [it for it in collect_grep(req) if isinstance(it, g.FileMatches)] + files.sort(key=lambda fm: fm.path) + return files + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def codebase(tmp_path: Path) -> Path: + """A small multi-language tree (no cocoindex project marker).""" + (tmp_path / "a.py").write_text( + "import os\n\n\ndef foo(a, b):\n return a + b\n\n\ndef bar(x):\n return foo(x, 1)\n" + ) + (tmp_path / "sub").mkdir() + (tmp_path / "sub" / "b.py").write_text("def baz(y):\n return y * 2\n") + (tmp_path / "c.rs").write_text('fn main() {\n println!("hi");\n}\n') + # A .txt file that contains python-looking text but is not code. 
+ (tmp_path / "notes.txt").write_text("def foo(not real code):\n") + return tmp_path + + +# --------------------------------------------------------------------------- +# Engine unit tests +# --------------------------------------------------------------------------- + + +def test_is_match_supported() -> None: + assert g._is_match_supported("python") is True + assert g._is_match_supported("rust") is True + # Detected-but-not-structurally-matchable languages. + assert g._is_match_supported("text") is False + assert g._is_match_supported("markdown") is False + + +def test_pattern_compiler_caches_per_language() -> None: + compiler = g._PatternCompiler(r"def \NAME(\(A*\)):") + first = compiler.for_language("python") + assert first is not None + # Same object reused (the "compiled pattern map" from the requirement). + assert compiler.for_language("python") is first + + +def test_pattern_compiler_skips_unsupported_language() -> None: + compiler = g._PatternCompiler(r"def \NAME(\(A*\)):") + assert compiler.for_language("text") is None + + +def test_pattern_compiler_warns_on_malformed_pattern() -> None: + compiler = g._PatternCompiler(r"def \NAME \{{ return") # unbalanced containment + # A supported language whose pattern won't compile is skipped with a warning, + # not raised — so one bad pattern doesn't abort the whole multi-language grep. + assert compiler.for_language("python") is None + assert len(compiler.warnings) == 1 + assert compiler.warnings[0].language == "python" + assert "python" in compiler.warnings[0].message + # Re-asking the same language doesn't duplicate the warning (cached). + assert compiler.for_language("python") is None + assert len(compiler.warnings) == 1 + + +def test_grep_finds_across_files(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=codebase) + results = run_grep(req) + paths = {fm.path for fm in results} + # Both python files matched; rust and txt skipped. 
+ assert any(p.endswith("a.py") for p in paths) + assert any(p.endswith("b.py") for p in paths) + assert not any(p.endswith(".rs") for p in paths) + assert not any(p.endswith(".txt") for p in paths) + + +def test_grep_run_emits_each_result(codebase: Path) -> None: + # Grep.run calls `emit` once per result as each match finishes (no batching). + grep_run = g.Grep(g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=codebase)) + emitted: list[g.FileMatches | g.GrepWarning] = [] + grep_run.run(emitted.append) + assert len(emitted) == 2 # a.py and sub/b.py + assert all(isinstance(it, g.FileMatches) for it in emitted) # valid pattern → no warnings + assert not grep_run.unusable + assert not grep_run.unusable # valid pattern compiled + + +def test_grep_run_handles_many_files(tmp_path: Path) -> None: + # Many files exercise the WaitGroup counter / completion sentinel — every file + # must be matched exactly once, with no duplicates and no lost results. + n = 250 + for i in range(n): + (tmp_path / f"f{i:04d}.py").write_text(f"def fn{i}(a):\n return a\n") + results = run_grep(g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=tmp_path)) + assert len(results) == n + assert len({fm.path for fm in results}) == n # no duplicates + + +def test_grep_language_filter(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"\NAME(\(A*\))", root=codebase, languages=frozenset({"rust"})) + results = run_grep(req) + assert {fm.path for fm in results} and all(fm.path.endswith(".rs") for fm in results) + + +def test_grep_single_file(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=codebase / "a.py") + results = run_grep(req) + assert len(results) == 1 + assert results[0].path == (codebase / "a.py").as_posix() + # a.py defines foo and bar. 
+ assert len(results[0].matches) == 2 + + +def test_grep_path_glob(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=codebase, path_glob="sub/**") + results = run_grep(req) + assert {fm.path for fm in results} == {(codebase / "sub" / "b.py").as_posix()} + + +def test_grep_no_matches(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"nonexistent_fn(\(A*\))", root=codebase) + assert run_grep(req) == [] + + +def test_grep_binary_file_skipped(tmp_path: Path) -> None: + (tmp_path / "data.py").write_bytes(b"\xff\xfe\x00\x01 def foo(): pass") + req = g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=tmp_path) + # Non-UTF-8 content is skipped silently (no warning), rather than crashing. + assert collect_grep(req) == [] + + +def test_grep_warns_once_per_supported_language_and_is_unusable(codebase: Path) -> None: + # The fixture has two python files and one rust file. A malformed pattern warns + # once per *supported* language (python, rust) — not once per file, not for the + # unsupported .txt — leaves nothing to match, and is reported unusable. + items, grep_run = run_grep_obj(g.GrepRequest(pattern=r"def \NAME \{{ x", root=codebase)) + warnings = [it for it in items if isinstance(it, g.GrepWarning)] + assert not any(isinstance(it, g.FileMatches) for it in items) + assert len(warnings) == 2 + assert set(grep_run.failed_languages) == {"python", "rust"} + assert grep_run.unusable is True + + +def test_grep_unusable_distinct_from_no_matchable_files(tmp_path: Path) -> None: + # A valid pattern that simply finds nothing is NOT "unusable". + (tmp_path / "a.py").write_text("x = 1\n") + _, ok = run_grep_obj(g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=tmp_path)) + assert ok.failed_languages == [] + assert ok.unusable is False + + # A malformed pattern but only *unsupported* files (the pattern is never even + # compiled) → not unusable, just "no matchable files found". 
+ onlytxt = tmp_path / "txtonly" + onlytxt.mkdir() + (onlytxt / "d.txt").write_text("hello\n") + _, none_tried = run_grep_obj(g.GrepRequest(pattern=r"def \NAME \{{ x", root=onlytxt)) + assert none_tried.failed_languages == [] + assert none_tried.unusable is False + + +def test_match_file_unreadable_returns_warning(tmp_path: Path) -> None: + # Reading a directory raises IsADirectoryError (an OSError) → surfaced as a + # warning, not a silent skip and not a crash. + cp = g.CodePattern("x", language="python") + target = g._Target(path=tmp_path, display="adir", pattern=cp) + result = g._match_file(target) + assert isinstance(result, g.GrepWarning) + assert "cannot read adir" in result.message + + +# --------------------------------------------------------------------------- +# Rendering +# --------------------------------------------------------------------------- + + +def test_render_plain_format(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=codebase / "sub" / "b.py") + rendered = g.render_results(run_grep(req), color=False) + lines = rendered.split("\n") + assert lines[0] == (codebase / "sub" / "b.py").as_posix() # path header + # Gutter is "<n>| " — line number, pipe, then exactly one space before the code. + assert lines[1] == "1| def baz(y):" + + +def test_render_strips_crlf_carriage_returns(tmp_path: Path) -> None: + # CRLF files must not leak a trailing "\r" into rendered code lines (regression: + # source.split("\n") left it on every line). newline="" writes "\r\n" verbatim + # instead of letting the platform translate it. 
+ (tmp_path / "crlf.py").write_text("def baz(y):\r\n pass\r\n", newline="") + req = g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=tmp_path / "crlf.py") + rendered = g.render_results(run_grep(req), color=False) + assert "\r" not in rendered + assert "1| def baz(y):" in rendered + + +def test_render_separator_between_matches(codebase: Path) -> None: + req = g.GrepRequest(pattern=r"def \NAME(\(ARGS*\)):", root=codebase / "a.py") + rendered = g.render_results(run_grep(req), color=False) + assert "\n---\n" in rendered # two matches in one file + + +def test_render_line_number_width(tmp_path: Path) -> None: + # A list literal spanning to a 2-digit line: the gutter is right-aligned to + # width 2 (single-digit lines space-padded), with one space after the pipe. + body = "".join(f" {i},\n" for i in range(10)) + (tmp_path / "wide.py").write_text(f"data = [\n{body}]\n") # `[` on line 1, `]` on line 12 + req = g.GrepRequest(pattern=r"[\(ITEMS*\)]", root=tmp_path / "wide.py") + rendered = g.render_results(run_grep(req), color=False) + assert "\n 1| data = [" in rendered # single-digit line, padded to width 2 + assert "\n12| ]" in rendered # two-digit line + + +def test_render_color_dims_unmatched_prefix(codebase: Path) -> None: + # `foo(x, 1)` on the last line of bar — the leading " return " is dimmed. 
+ req = g.GrepRequest(pattern=r"foo(\(A*\))", root=codebase / "a.py") + rendered = g.render_results(run_grep(req), color=True) + assert "\x1b[" in rendered # ANSI present + assert "\x1b[2m" in rendered # dim attribute for unmatched context + + +# --------------------------------------------------------------------------- +# CLI end-to-end (via CliRunner — no daemon needed) +# --------------------------------------------------------------------------- + + +def test_cli_grep_basic(codebase: Path, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.chdir(codebase) + result = runner.invoke(app, ["grep", r"def \NAME(\(ARGS*\)):"], catch_exceptions=False) + assert result.exit_code == 0 + assert "a.py" in result.output + assert "sub/b.py" in result.output + assert "def foo(a, b):" in result.output + + +def test_cli_grep_explicit_path(codebase: Path) -> None: + result = runner.invoke( + app, ["grep", r"def \NAME(\(ARGS*\)):", str(codebase)], catch_exceptions=False + ) + assert result.exit_code == 0 + assert "a.py" in result.output + + +def test_cli_grep_no_matches(codebase: Path) -> None: + result = runner.invoke(app, ["grep", r"nope_fn(\(A*\))", str(codebase)], catch_exceptions=False) + assert result.exit_code == 0 + assert "No matches found." in result.output + + +def test_cli_grep_malformed_pattern(codebase: Path) -> None: + result = runner.invoke( + app, ["grep", r"def \NAME \{{ return", str(codebase)], catch_exceptions=False + ) + # Malformed for every language found (python + rust): per-language warnings + # plus an explicit error, and a non-zero exit. 
+ assert result.exit_code == 1 + assert "pattern invalid for python" in result.output # per-language warning + assert "did not compile for any of the languages found" in result.output # error + + +def test_cli_grep_path_not_found() -> None: + result = runner.invoke( + app, ["grep", r"foo(\(A*\))", "/no/such/path/xyz"], catch_exceptions=False + ) + assert result.exit_code == 1 + assert "path not found" in result.output + + +def test_cli_grep_lang_filter(codebase: Path) -> None: + result = runner.invoke( + app, ["grep", r"\NAME(\(A*\))", str(codebase), "--lang", "rust"], catch_exceptions=False + ) + assert result.exit_code == 0 + assert ".rs" in result.output + assert ".py" not in result.output + + +# --------------------------------------------------------------------------- +# Project- and gitignore-awareness +# --------------------------------------------------------------------------- + + +def test_grep_respects_project_exclude_patterns(tmp_path: Path) -> None: + """Inside an initialized project, grep honors the configured exclude patterns.""" + (tmp_path / ".cocoindex_code").mkdir() + (tmp_path / ".cocoindex_code" / "settings.yml").write_text( + "include_patterns:\n - '**/*.py'\nexclude_patterns:\n - '**/.*'\n - '**/skip'\n" + ) + (tmp_path / "keep.py").write_text("def kept(a):\n return a\n") + (tmp_path / "skip").mkdir() + (tmp_path / "skip" / "hidden.py").write_text("def hidden(a):\n return a\n") + + req = g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=tmp_path) + results = run_grep(req) + paths = {fm.path for fm in results} + assert any(p.endswith("keep.py") for p in paths) + assert not any("skip" in p for p in paths) + + +def test_grep_respects_gitignore(tmp_path: Path) -> None: + (tmp_path / ".cocoindex_code").mkdir() + (tmp_path / ".cocoindex_code" / "settings.yml").write_text("include_patterns:\n - '**/*.py'\n") + (tmp_path / ".gitignore").write_text("ignored.py\n") + (tmp_path / "kept.py").write_text("def kept(a):\n return a\n") + (tmp_path / 
"ignored.py").write_text("def ignored(a):\n return a\n") + + req = g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=tmp_path) + paths = {fm.path for fm in run_grep(req)} + assert any(p.endswith("kept.py") for p in paths) + assert not any(p.endswith("ignored.py") for p in paths) + + +def test_find_git_root(tmp_path: Path) -> None: + from cocoindex_code.file_walk import find_git_root + + # normal repo: .git is a directory + (tmp_path / ".git").mkdir() + sub = tmp_path / "a" / "b" + sub.mkdir(parents=True) + assert find_git_root(sub) == tmp_path + # outside any repo + assert find_git_root(Path(tmp_path.anchor)) is None + + # submodule / linked worktree: .git is a *file* + other = tmp_path / "other" + (other / "x").mkdir(parents=True) + (other / ".git").write_text("gitdir: /elsewhere/.git/modules/other\n") + assert find_git_root(other / "x") == other + + +def test_grep_anchors_gitignore_at_git_root_when_no_project(tmp_path: Path) -> None: + # No cocoindex project, but a git repo with a root .gitignore. Grepping a deep + # subfolder must still honor that repo-root .gitignore (anchored at the git root, + # not the subfolder). 
+ (tmp_path / ".git").mkdir() + (tmp_path / ".gitignore").write_text("ignored.py\n") + sub = tmp_path / "src" / "sub" + sub.mkdir(parents=True) + (sub / "keep.py").write_text("def kept(a):\n return a\n") + (sub / "ignored.py").write_text("def gone(a):\n return a\n") + + paths = { + Path(fm.path).name + for fm in run_grep(g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=sub)) + } + assert "keep.py" in paths + assert "ignored.py" not in paths # ignored by the git-root .gitignore, not just sub/ + + +def test_grep_language_override(tmp_path: Path) -> None: + """A project language override maps an unusual extension to a matchable language.""" + (tmp_path / ".cocoindex_code").mkdir() + (tmp_path / ".cocoindex_code" / "settings.yml").write_text( + "include_patterns:\n - '**/*.inc'\nlanguage_overrides:\n - ext: inc\n lang: python\n" + ) + (tmp_path / "snippet.inc").write_text("def included(a):\n return a\n") + + req = g.GrepRequest(pattern=r"def \NAME(\(A*\)):", root=tmp_path) + results = run_grep(req) + assert len(results) == 1 and results[0].path.endswith("snippet.inc") diff --git a/uv.lock b/uv.lock index 216ebbf..c834d58 100644 --- a/uv.lock +++ b/uv.lock @@ -336,7 +336,7 @@ wheels = [ [[package]] name = "cocoindex" -version = "1.0.7" +version = "1.0.13" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click" }, @@ -348,17 +348,17 @@ dependencies = [ { name = "typing-extensions" }, { name = "watchdog" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/65/55/1920a27d0068446c9297a9f07fcaa1b7fcab3543760219badb7f56b18301/cocoindex-1.0.7.tar.gz", hash = "sha256:30aca52897228c1e1e342776f3a6dd6d7a21a89664ecf427a7db4197f312e129", size = 470615, upload-time = "2026-05-31T01:00:38.6Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/b9/5fe83e2e0fd4adc54fb4d526c7197942861d184bd006c4a4591b229e290e/cocoindex-1.0.13.tar.gz", hash = "sha256:8e52558f02cbfe2cc5505bea214354f147a8a0385ed7b9a47a0cdca20b8673be", size = 647804, 
upload-time = "2026-06-22T16:46:17.563Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3e/0e/30caad56ea2f749685a0493aabdbad249b858ea52a8db543d0f06ad42795/cocoindex-1.0.7-cp311-abi3-macosx_10_12_x86_64.whl", hash = "sha256:49e26cef0d822e9b640f60a64a5c8024a31d6f9b1d2184f16a60ffa3214b0e1d", size = 8743244, upload-time = "2026-05-31T01:00:36.58Z" }, - { url = "https://files.pythonhosted.org/packages/51/a4/6c71852970a5ee370b67cafae6ff610c3603489352b6a119627eae20c108/cocoindex-1.0.7-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:31eb94126b5541efbbd96bb7ef27225601afdca255bde1a6e75915effa94df47", size = 8831623, upload-time = "2026-05-31T01:00:31.402Z" }, - { url = "https://files.pythonhosted.org/packages/a5/13/91c55926daf4917a8e9440aa918ba4badbe116b465c03aa5b752448494aa/cocoindex-1.0.7-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e2b329ab0672d33a9c78e40ac9a3a38dbab5c55290b297bb6fd8388b6281d411", size = 8657824, upload-time = "2026-05-31T01:00:21.754Z" }, - { url = "https://files.pythonhosted.org/packages/9c/b3/f9f494e7293ce07e40c1bdf02cd3d7eb0ace8bcc3b30c6841f9c8e9bd67b/cocoindex-1.0.7-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:988091aee99ee8c766ac6d68c1713391438d599acbf77eb515a804c23452369c", size = 8931944, upload-time = "2026-05-31T01:00:26.4Z" }, - { url = "https://files.pythonhosted.org/packages/cf/54/fc16029bb5aa165aac1af72ddde1807464106c0ed200a9a2ee22a03eeaa7/cocoindex-1.0.7-cp311-abi3-win_amd64.whl", hash = "sha256:c082881938d03fda000325c0052cbf34ad80804745221187bedfd3e960a16db4", size = 9079033, upload-time = "2026-05-31T01:00:39.982Z" }, - { url = "https://files.pythonhosted.org/packages/d0/47/86bed6dd40cc8420798d8f2c715c082a021815538baa931609ac72a94cea/cocoindex-1.0.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1afcb5d49fb8fd530dfab8cc72275a6c0f1002cc9460d8d6061202401461db3f", size = 8825330, upload-time = "2026-05-31T01:00:33.684Z" }, - { url = 
"https://files.pythonhosted.org/packages/7a/d4/d3dbfe6c9f2cb260a22fec818f64595e6dfc472e46dbabc5e28c44beb8cf/cocoindex-1.0.7-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:0d181bb278401ab7a9a5dc8ed0edf01c0de2c23938c80b27ee05e22a00a8b399", size = 8654835, upload-time = "2026-05-31T01:00:24.218Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2b/2db33aa823579df7109d2db4ed214688ea31ac129ae9b0ec70b0d21958b8/cocoindex-1.0.7-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:839c1d2f6565cd65b9a277e168abc57b711a81ffaa23397181cbe3365642ab60", size = 8927075, upload-time = "2026-05-31T01:00:28.943Z" }, - { url = "https://files.pythonhosted.org/packages/fa/c4/63214f8bf37bb9135d8cac72fd91358ec51c3881afe381273d1fc6b902c1/cocoindex-1.0.7-cp314-cp314t-win_amd64.whl", hash = "sha256:62d3c7c0be7cd60ff84b72bde78c0cf2d6dc3f90047926e57a63e7f9194d7f72", size = 9071903, upload-time = "2026-05-31T01:00:42.055Z" }, + { url = "https://files.pythonhosted.org/packages/24/0c/0cd5e797b108b50080dd17fb7f5761a23d41432b694355554609b392e191/cocoindex-1.0.13-cp311-abi3-macosx_10_12_x86_64.whl", hash = "sha256:186aef33ea46fab459188c1ef7c697334a6a7299e726d38d46b01bd5543cf6f7", size = 9109005, upload-time = "2026-06-22T16:46:15.304Z" }, + { url = "https://files.pythonhosted.org/packages/ff/6b/7b1511d1a4036e14fd599f163cd84103b96f4140135295bdf3b75966eed4/cocoindex-1.0.13-cp311-abi3-macosx_11_0_arm64.whl", hash = "sha256:e48570953f4919f3061043d56835d629836052385648d6b874432918bdbf811c", size = 9183196, upload-time = "2026-06-22T16:46:09.447Z" }, + { url = "https://files.pythonhosted.org/packages/f2/a3/4be3a6b0257e27a437edba3942812537b191fe3c662f0c514207b92e831f/cocoindex-1.0.13-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7970d7ca83d90a15f417e49c09a8f253b6d231c8f3571e0c325efc1857dc449b", size = 9059825, upload-time = "2026-06-22T16:45:59.309Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/75/1d7c019746d3dac105eec5f639328f5ccccc1e750d03f3ecb47161460dde/cocoindex-1.0.13-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b4682c8d84a4a80c1bca10731bc6ab9d2c97f6451194526a39ddd4d0ad984458", size = 9282739, upload-time = "2026-06-22T16:46:04.456Z" }, + { url = "https://files.pythonhosted.org/packages/cd/08/f40e17233c70452b94a4a3791689d43793a11cae8f06f81d8a4186628cab/cocoindex-1.0.13-cp311-abi3-win_amd64.whl", hash = "sha256:47787cd38e7e0b8f74c2ea0b74030564c0405d4795a6af1a48ccc4055150cf39", size = 9497162, upload-time = "2026-06-22T16:46:19.827Z" }, + { url = "https://files.pythonhosted.org/packages/98/06/a0f0766f84f818fc4f79509ef4de0deeacbf7e6e13b74958498fb64ba316/cocoindex-1.0.13-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:dec6e4df7d62b56221427aae1c4f067a18ecf5c8eb50d0fc09cc9d033eed3048", size = 9184066, upload-time = "2026-06-22T16:46:12.68Z" }, + { url = "https://files.pythonhosted.org/packages/1a/71/e265116df84ac5338e0ae7d8e8b6e3f177dc6f28df0a5c7308662333ba37/cocoindex-1.0.13-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:111a5bc1d49e20e06659a5323019f8b7ca195c983e0e6700c66c104973efb689", size = 9062342, upload-time = "2026-06-22T16:46:02.011Z" }, + { url = "https://files.pythonhosted.org/packages/a2/61/3bf28b73b256be57626f60bbc0e360aa5707ddf0625a55c3dd9869f0403d/cocoindex-1.0.13-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:5dbaa672014ddf68bd0ca8ae10c4f6e36381cacc1f482a14b500aa4a458633b4", size = 9279283, upload-time = "2026-06-22T16:46:06.797Z" }, + { url = "https://files.pythonhosted.org/packages/64/30/a8bc89c9c99fe783bab6e79334bc4bc780a18b7bb4d1aa5f69d31bb751c8/cocoindex-1.0.13-cp314-cp314t-win_amd64.whl", hash = "sha256:d070e450e1ed8cb9b234a447dc6171611007ed66b9ef80bcaf328ccae6577ad5", size = 9489393, upload-time = "2026-06-22T16:46:22.353Z" }, ] [package.optional-dependencies] @@ -417,10 +417,10 @@ dev = [ [package.metadata] requires-dist = [ - { name = "cocoindex", extras = 
["litellm"], specifier = ">=1.0.6,<1.1.0" }, - { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'dev'", specifier = ">=1.0.6,<1.1.0" }, - { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'embeddings-local'", specifier = ">=1.0.6,<1.1.0" }, - { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'full'", specifier = ">=1.0.6,<1.1.0" }, + { name = "cocoindex", extras = ["litellm"], specifier = ">=1.0.13,<1.1.0" }, + { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'dev'", specifier = ">=1.0.13,<1.1.0" }, + { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'embeddings-local'", specifier = ">=1.0.13,<1.1.0" }, + { name = "cocoindex", extras = ["sentence-transformers"], marker = "extra == 'full'", specifier = ">=1.0.13,<1.1.0" }, { name = "einops", specifier = ">=0.8.2" }, { name = "mcp", specifier = ">=1.0.0" }, { name = "msgspec", specifier = ">=0.19.0" }, @@ -442,7 +442,7 @@ provides-extras = ["dev", "embeddings-local", "full"] [package.metadata.requires-dev] dev = [ - { name = "cocoindex", extras = ["sentence-transformers"], specifier = ">=1.0.7,<1.1.0" }, + { name = "cocoindex", extras = ["sentence-transformers"], specifier = ">=1.0.13,<1.1.0" }, { name = "mypy", specifier = ">=1.0.0" }, { name = "prek", specifier = ">=0.1.0" }, { name = "pytest", specifier = ">=7.0.0" },