Skip to content

Commit 3c7ebb6

Browse files
authored
Feat/excluded patterns (#54)
* feat: support custom exclude patterns
* fix: support commas in custom exclude patterns
1 parent dace9da commit 3c7ebb6

4 files changed

Lines changed: 162 additions & 24 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ Use the cocoindex-code MCP server for semantic code search when:
130130
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
131131
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
132132
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
133+
| `COCOINDEX_CODE_EXCLUDED_PATTERNS` | Additional glob patterns to exclude from indexing, given as a JSON array of strings (e.g. `'["**/migration.sql", "{**/*.md,**/*.txt}"]'`) | _(none)_ |
133134

134135

135136
### Root Path Discovery

src/cocoindex_code/config.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import json
56
import os
67
from dataclasses import dataclass
78
from pathlib import Path
@@ -42,6 +43,33 @@ def _discover_codebase_root() -> Path:
4243
return root if root is not None else cwd
4344

4445

46+
def _parse_json_string_list_env(var_name: str) -> list[str]:
    """Parse an environment variable as a JSON array of strings.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The entries of the array with surrounding whitespace stripped.
        Entries that are empty after stripping are dropped. An unset or
        blank variable yields an empty list.

    Raises:
        ValueError: If the value is not valid JSON, is valid JSON but not an
            array, or is an array containing a non-string entry. Every
            message starts with ``"<var_name> must be a JSON array of
            strings"`` followed by the specific reason.
    """
    raw_value = os.environ.get(var_name, "")
    if not raw_value.strip():
        return []

    try:
        parsed = json.loads(raw_value)
    except json.JSONDecodeError as exc:
        # Include the decoder's own message (reason and position) so the user
        # can see what is wrong without inspecting the chained exception.
        raise ValueError(
            f"{var_name} must be a JSON array of strings, got invalid JSON: {exc}"
        ) from exc

    if not isinstance(parsed, list):
        raise ValueError(
            f"{var_name} must be a JSON array of strings, "
            f"got {type(parsed).__name__}"
        )

    result: list[str] = []
    for index, item in enumerate(parsed):
        if not isinstance(item, str):
            # Point at the offending entry instead of a generic message.
            raise ValueError(
                f"{var_name} must be a JSON array of strings, "
                f"but entry {index} is {item!r}"
            )
        stripped = item.strip()
        if stripped:
            result.append(stripped)

    return result
71+
72+
4573
@dataclass
4674
class Config:
4775
"""Configuration loaded from environment variables."""
@@ -52,6 +80,7 @@ class Config:
5280
device: str | None
5381
trust_remote_code: bool
5482
extra_extensions: dict[str, str | None]
83+
excluded_patterns: list[str]
5584

5685
@classmethod
5786
def from_env(cls) -> Config:
@@ -99,13 +128,19 @@ def from_env(cls) -> Config:
99128
else:
100129
extra_extensions[f".{token}"] = None
101130

131+
# Excluded file glob patterns
132+
excluded_patterns = _parse_json_string_list_env(
133+
"COCOINDEX_CODE_EXCLUDED_PATTERNS"
134+
)
135+
102136
return cls(
103137
codebase_root_path=root,
104138
embedding_model=embedding_model,
105139
index_dir=index_dir,
106140
device=device,
107141
trust_remote_code=trust_remote_code,
108142
extra_extensions=extra_extensions,
143+
excluded_patterns=excluded_patterns,
109144
)
110145

111146
@property

src/cocoindex_code/indexer.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
5353
}
5454

55-
EXCLUDED_PATTERNS = [
55+
DEFAULT_EXCLUDED_PATTERNS = [
5656
"**/.*", # Hidden directories
5757
"**/__pycache__", # Python cache
5858
"**/node_modules", # Node.js dependencies
@@ -64,6 +64,8 @@
6464
"**/.cocoindex_code", # Our own index directory
6565
]
6666

67+
EXCLUDED_PATTERNS = DEFAULT_EXCLUDED_PATTERNS + config.excluded_patterns
68+
6769
# Chunking configuration
6870
CHUNK_SIZE = 2000
6971
MIN_CHUNK_SIZE = 300

tests/test_config.py

Lines changed: 123 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,35 +6,33 @@
66
from pathlib import Path
77
from unittest.mock import patch
88

9-
from cocoindex_code.config import Config, _detect_device
9+
import pytest
1010

11+
from cocoindex_code.config import Config
1112

12-
class TestDetectDevice:
13-
"""Tests for device auto-detection."""
1413

15-
def test_returns_cuda_when_available(self) -> None:
16-
with patch.dict(os.environ, {}, clear=False):
17-
# Ensure env var is unset
18-
os.environ.pop("COCOINDEX_CODE_DEVICE", None)
19-
with patch("torch.cuda.is_available", return_value=True):
20-
assert _detect_device() == "cuda"
14+
class TestConfigDevice:
15+
"""Tests for COCOINDEX_CODE_DEVICE env var handling."""
2116

22-
def test_returns_cpu_when_cuda_unavailable(self) -> None:
23-
with patch.dict(os.environ, {}, clear=False):
17+
def test_none_by_default(self, tmp_path: Path) -> None:
18+
with patch.dict(
19+
os.environ,
20+
{"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)},
21+
):
2422
os.environ.pop("COCOINDEX_CODE_DEVICE", None)
25-
with patch("torch.cuda.is_available", return_value=False):
26-
assert _detect_device() == "cpu"
27-
28-
def test_env_var_overrides_auto_detection(self) -> None:
29-
with patch.dict(os.environ, {"COCOINDEX_CODE_DEVICE": "cpu"}):
30-
with patch("torch.cuda.is_available", return_value=True):
31-
assert _detect_device() == "cpu"
23+
config = Config.from_env()
24+
assert config.device is None
3225

33-
def test_returns_cpu_when_torch_missing(self) -> None:
34-
with patch.dict(os.environ, {}, clear=False):
35-
os.environ.pop("COCOINDEX_CODE_DEVICE", None)
36-
with patch.dict("sys.modules", {"torch": None}):
37-
assert _detect_device() == "cpu"
26+
def test_env_var_overrides_device(self, tmp_path: Path) -> None:
27+
with patch.dict(
28+
os.environ,
29+
{
30+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
31+
"COCOINDEX_CODE_DEVICE": "cpu",
32+
},
33+
):
34+
config = Config.from_env()
35+
assert config.device == "cpu"
3836

3937

4038
class TestConfigTrustRemoteCode:
@@ -158,3 +156,105 @@ def test_mixed_with_and_without_lang(self, tmp_path: Path) -> None:
158156
):
159157
config = Config.from_env()
160158
assert config.extra_extensions == {".inc": "php", ".yaml": None, ".tpl": "html"}
159+
160+
161+
class TestExcludedPatterns:
    """Tests for COCOINDEX_CODE_EXCLUDED_PATTERNS env var."""

    def _load(self, tmp_path: Path, raw_patterns: str) -> "Config":
        """Build a Config with the excluded-patterns variable set to *raw_patterns*."""
        env = {
            "COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
            "COCOINDEX_CODE_EXCLUDED_PATTERNS": raw_patterns,
        }
        with patch.dict(os.environ, env):
            return Config.from_env()

    def _assert_rejected(self, tmp_path: Path, raw_patterns: str, message: str) -> None:
        """Assert that *raw_patterns* makes Config.from_env raise a matching ValueError."""
        env = {
            "COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
            "COCOINDEX_CODE_EXCLUDED_PATTERNS": raw_patterns,
        }
        with patch.dict(os.environ, env), pytest.raises(ValueError, match=message):
            Config.from_env()

    def test_empty_by_default(self, tmp_path: Path) -> None:
        # The variable must be absent here, so it is removed inside the patch
        # context rather than set through the helper.
        with patch.dict(os.environ, {"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)}):
            os.environ.pop("COCOINDEX_CODE_EXCLUDED_PATTERNS", None)
            loaded = Config.from_env()
            assert loaded.excluded_patterns == []

    def test_parses_json_array(self, tmp_path: Path) -> None:
        loaded = self._load(tmp_path, '["**/migration.sql", "**/*.d.ts"]')
        assert loaded.excluded_patterns == ["**/migration.sql", "**/*.d.ts"]

    def test_preserves_commas_inside_globs(self, tmp_path: Path) -> None:
        loaded = self._load(tmp_path, '["{**/*.md,**/*.txt}"]')
        assert loaded.excluded_patterns == ["{**/*.md,**/*.txt}"]

    def test_trims_whitespace_and_ignores_empty_entries(self, tmp_path: Path) -> None:
        loaded = self._load(tmp_path, '[" **/migration.sql ", " ", "**/*.d.ts"]')
        assert loaded.excluded_patterns == ["**/migration.sql", "**/*.d.ts"]

    def test_empty_string_gives_empty_list(self, tmp_path: Path) -> None:
        loaded = self._load(tmp_path, "")
        assert loaded.excluded_patterns == []

    def test_rejects_invalid_json(self, tmp_path: Path) -> None:
        # A comma-separated list is not valid JSON and must be refused.
        self._assert_rejected(
            tmp_path,
            "**/migration.sql,**/*.d.ts",
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings, "
            "got invalid JSON",
        )

    def test_rejects_valid_json_non_list(self, tmp_path: Path) -> None:
        self._assert_rejected(
            tmp_path,
            "{}",
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings",
        )

    def test_rejects_non_string_entries(self, tmp_path: Path) -> None:
        self._assert_rejected(
            tmp_path,
            '["**/*.py", 1]',
            "COCOINDEX_CODE_EXCLUDED_PATTERNS must be a JSON array of strings",
        )

0 commit comments

Comments
 (0)