Skip to content

Commit e3ab9f0

Browse files
authored
Support extra extensions with optional language mapping (#32)
* feat: allow adding file extensions via COCOINDEX_CODE_EXTRA_EXTENSIONS env var Users can now specify additional file extensions to index without editing source code, e.g. COCOINDEX_CODE_EXTRA_EXTENSIONS="inc,tpl" * feat: support language mapping in COCOINDEX_CODE_EXTRA_EXTENSIONS Allow `ext:lang` format (e.g. "inc:php,yaml,tpl:html") to override language detection for extra extensions, enabling proper AST-based chunking for unrecognized extensions.
1 parent 64a3607 commit e3ab9f0

4 files changed

Lines changed: 110 additions & 2 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ Use the cocoindex-code MCP server for semantic code search when:
114114
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
115115
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
116116
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
117+
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index, comma-separated and written without the leading dot (e.g. `"inc:php,yaml,toml"`); use `ext:lang` to override language detection for an extension | _(none)_ |
117118

118119

119120
### Root Path Discovery

src/cocoindex_code/config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ class Config:
6666
device: str
6767
trust_remote_code: bool
6868
batch_size: int
69+
extra_extensions: dict[str, str | None]
6970

7071
@classmethod
7172
def from_env(cls) -> Config:
@@ -113,13 +114,27 @@ def from_env(cls) -> Config:
113114
f"COCOINDEX_CODE_BATCH_SIZE must be a positive integer, got: {batch_size}"
114115
)
115116

117+
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
118+
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
119+
extra_extensions: dict[str, str | None] = {}
120+
for token in raw_extra.split(","):
121+
token = token.strip()
122+
if not token:
123+
continue
124+
if ":" in token:
125+
ext, lang = token.split(":", 1)
126+
extra_extensions[f".{ext.strip()}"] = lang.strip() or None
127+
else:
128+
extra_extensions[f".{token}"] = None
129+
116130
return cls(
117131
codebase_root_path=root,
118132
embedding_model=embedding_model,
119133
index_dir=index_dir,
120134
device=device,
121135
trust_remote_code=trust_remote_code,
122136
batch_size=batch_size,
137+
extra_extensions=extra_extensions,
123138
)
124139

125140
@property

src/cocoindex_code/indexer.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,11 @@
88
from cocoindex.resources.file import PatternFilePathMatcher
99
from cocoindex.resources.id import IdGenerator
1010

11+
from .config import config
1112
from .shared import CODEBASE_DIR, SQLITE_DB, CodeChunk, embedder
1213

1314
# File patterns for supported languages
14-
INCLUDED_PATTERNS = [
15+
DEFAULT_INCLUDED_PATTERNS = [
1516
"**/*.py", # Python
1617
"**/*.pyi", # Python stubs
1718
"**/*.js", # JavaScript
@@ -43,6 +44,13 @@
4344
"**/*.php", # PHP
4445
]
4546

47+
INCLUDED_PATTERNS = DEFAULT_INCLUDED_PATTERNS + [f"**/*{ext}" for ext in config.extra_extensions]
48+
49+
# Language overrides from extra_extensions (e.g. ".inc" -> "php")
50+
LANGUAGE_OVERRIDES: dict[str, str] = {
51+
ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
52+
}
53+
4654
EXCLUDED_PATTERNS = [
4755
"**/.*", # Hidden directories
4856
"**/__pycache__", # Python cache
@@ -81,7 +89,12 @@ async def process_file(
8189
return
8290

8391
# Get relative path and detect language
84-
language = detect_code_language(filename=file.file_path.path.name) or "text"
92+
suffix = file.file_path.path.suffix
93+
language = (
94+
LANGUAGE_OVERRIDES.get(suffix)
95+
or detect_code_language(filename=file.file_path.path.name)
96+
or "text"
97+
)
8598

8699
# Split into chunks
87100
chunks = splitter.split(

tests/test_config.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,82 @@ def test_batch_size_raises_on_negative(self, tmp_path: Path) -> None:
138138
):
139139
with pytest.raises(ValueError, match="COCOINDEX_CODE_BATCH_SIZE"):
140140
Config.from_env()
141+
142+
143+
class TestExtraExtensions:
144+
"""Tests for COCOINDEX_CODE_EXTRA_EXTENSIONS env var."""
145+
146+
def test_empty_by_default(self, tmp_path: Path) -> None:
147+
with patch.dict(
148+
os.environ,
149+
{"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)},
150+
):
151+
os.environ.pop("COCOINDEX_CODE_EXTRA_EXTENSIONS", None)
152+
config = Config.from_env()
153+
assert config.extra_extensions == {}
154+
155+
def test_parses_comma_separated(self, tmp_path: Path) -> None:
156+
with patch.dict(
157+
os.environ,
158+
{
159+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
160+
"COCOINDEX_CODE_EXTRA_EXTENSIONS": "rb,yaml,toml",
161+
},
162+
):
163+
config = Config.from_env()
164+
assert config.extra_extensions == {".rb": None, ".yaml": None, ".toml": None}
165+
166+
def test_trims_whitespace(self, tmp_path: Path) -> None:
167+
with patch.dict(
168+
os.environ,
169+
{
170+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
171+
"COCOINDEX_CODE_EXTRA_EXTENSIONS": " rb , yaml , ",
172+
},
173+
):
174+
config = Config.from_env()
175+
assert config.extra_extensions == {".rb": None, ".yaml": None}
176+
177+
def test_empty_string_gives_empty_dict(self, tmp_path: Path) -> None:
178+
with patch.dict(
179+
os.environ,
180+
{
181+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
182+
"COCOINDEX_CODE_EXTRA_EXTENSIONS": "",
183+
},
184+
):
185+
config = Config.from_env()
186+
assert config.extra_extensions == {}
187+
188+
def test_dot_prefix_passed_through(self, tmp_path: Path) -> None:
189+
with patch.dict(
190+
os.environ,
191+
{
192+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
193+
"COCOINDEX_CODE_EXTRA_EXTENSIONS": ".rb,yaml",
194+
},
195+
):
196+
config = Config.from_env()
197+
assert config.extra_extensions == {"..rb": None, ".yaml": None}
198+
199+
def test_parses_lang_mapping(self, tmp_path: Path) -> None:
200+
with patch.dict(
201+
os.environ,
202+
{
203+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
204+
"COCOINDEX_CODE_EXTRA_EXTENSIONS": "inc:php",
205+
},
206+
):
207+
config = Config.from_env()
208+
assert config.extra_extensions == {".inc": "php"}
209+
210+
def test_mixed_with_and_without_lang(self, tmp_path: Path) -> None:
211+
with patch.dict(
212+
os.environ,
213+
{
214+
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
215+
"COCOINDEX_CODE_EXTRA_EXTENSIONS": "inc:php,yaml,tpl:html",
216+
},
217+
):
218+
config = Config.from_env()
219+
assert config.extra_extensions == {".inc": "php", ".yaml": None, ".tpl": "html"}

0 commit comments

Comments
 (0)