Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ Optionally, you can run `cocoindex-code index` to create or update the index. Wi
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index, comma-separated and written without the leading dot (e.g. `"inc:php,yaml,toml"`); use `ext:lang` to override language detection for an extension | _(none)_ |


### Root Path Discovery
Expand Down
15 changes: 15 additions & 0 deletions src/cocoindex_code/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class Config:
device: str
trust_remote_code: bool
batch_size: int
extra_extensions: dict[str, str | None]

@classmethod
def from_env(cls) -> Config:
Expand Down Expand Up @@ -113,13 +114,27 @@ def from_env(cls) -> Config:
f"COCOINDEX_CODE_BATCH_SIZE must be a positive integer, got: {batch_size}"
)

# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
extra_extensions: dict[str, str | None] = {}
for token in raw_extra.split(","):
token = token.strip()
if not token:
continue
if ":" in token:
ext, lang = token.split(":", 1)
extra_extensions[f".{ext.strip()}"] = lang.strip() or None
else:
extra_extensions[f".{token}"] = None

return cls(
codebase_root_path=root,
embedding_model=embedding_model,
index_dir=index_dir,
device=device,
trust_remote_code=trust_remote_code,
batch_size=batch_size,
extra_extensions=extra_extensions,
)

@property
Expand Down
17 changes: 15 additions & 2 deletions src/cocoindex_code/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
from cocoindex.resources.file import PatternFilePathMatcher
from cocoindex.resources.id import IdGenerator

from .config import config
from .shared import CODEBASE_DIR, SQLITE_DB, CodeChunk, embedder

# File patterns for supported languages
INCLUDED_PATTERNS = [
DEFAULT_INCLUDED_PATTERNS = [
"**/*.py", # Python
"**/*.pyi", # Python stubs
"**/*.js", # JavaScript
Expand Down Expand Up @@ -43,6 +44,13 @@
"**/*.php", # PHP
]

# Built-in patterns followed by one recursive glob per user-configured extension.
INCLUDED_PATTERNS = [
    *DEFAULT_INCLUDED_PATTERNS,
    *(f"**/*{extension}" for extension in config.extra_extensions),
]

# Extensions whose language was given explicitly (e.g. ".inc" -> "php");
# entries configured without a language are left out.
LANGUAGE_OVERRIDES: dict[str, str] = {
    extension: language
    for extension, language in config.extra_extensions.items()
    if language is not None
}

EXCLUDED_PATTERNS = [
"**/.*", # Hidden directories
"**/__pycache__", # Python cache
Expand Down Expand Up @@ -81,7 +89,12 @@ async def process_file(
return

# Get relative path and detect language
language = detect_code_language(filename=file.file_path.path.name) or "text"
suffix = file.file_path.path.suffix
language = (
LANGUAGE_OVERRIDES.get(suffix)
or detect_code_language(filename=file.file_path.path.name)
or "text"
)

# Split into chunks
chunks = splitter.split(
Expand Down
79 changes: 79 additions & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,82 @@ def test_batch_size_raises_on_negative(self, tmp_path: Path) -> None:
):
with pytest.raises(ValueError, match="COCOINDEX_CODE_BATCH_SIZE"):
Config.from_env()


class TestExtraExtensions:
    """Tests for COCOINDEX_CODE_EXTRA_EXTENSIONS env var."""

    ENV_VAR = "COCOINDEX_CODE_EXTRA_EXTENSIONS"

    @classmethod
    def _parsed(cls, root: Path, raw: str | None) -> dict[str, str | None]:
        """Build a Config from a patched environment and return its extra_extensions.

        ``raw`` is the value given to the env var; ``None`` removes the
        variable from the environment for the duration of the call.
        """
        env = {"COCOINDEX_CODE_ROOT_PATH": str(root)}
        if raw is not None:
            env[cls.ENV_VAR] = raw
        with patch.dict(os.environ, env):
            if raw is None:
                # patch.dict restores the original environment on exit.
                os.environ.pop(cls.ENV_VAR, None)
            return Config.from_env().extra_extensions

    def test_empty_by_default(self, tmp_path: Path) -> None:
        assert self._parsed(tmp_path, None) == {}

    def test_parses_comma_separated(self, tmp_path: Path) -> None:
        expected = {".rb": None, ".yaml": None, ".toml": None}
        assert self._parsed(tmp_path, "rb,yaml,toml") == expected

    def test_trims_whitespace(self, tmp_path: Path) -> None:
        expected = {".rb": None, ".yaml": None}
        assert self._parsed(tmp_path, " rb , yaml , ") == expected

    def test_empty_string_gives_empty_dict(self, tmp_path: Path) -> None:
        assert self._parsed(tmp_path, "") == {}

    def test_dot_prefix_passed_through(self, tmp_path: Path) -> None:
        # A user-supplied leading dot is kept as-is, so the stored key has two dots.
        expected = {"..rb": None, ".yaml": None}
        assert self._parsed(tmp_path, ".rb,yaml") == expected

    def test_parses_lang_mapping(self, tmp_path: Path) -> None:
        assert self._parsed(tmp_path, "inc:php") == {".inc": "php"}

    def test_mixed_with_and_without_lang(self, tmp_path: Path) -> None:
        expected = {".inc": "php", ".yaml": None, ".tpl": "html"}
        assert self._parsed(tmp_path, "inc:php,yaml,tpl:html") == expected