-
Notifications
You must be signed in to change notification settings - Fork 114
Expand file tree
/
Copy pathconfig.py
More file actions
123 lines (99 loc) · 4.02 KB
/
config.py
File metadata and controls
123 lines (99 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""Configuration management for cocoindex-code."""
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
def _find_root_with_marker(start: Path, markers: list[str]) -> Path | None:
"""Walk up from start, return first directory containing any marker."""
current = start
while True:
if any((current / m).exists() for m in markers):
return current
parent = current.parent
if parent == current:
return None
current = parent
def _discover_codebase_root() -> Path:
"""Discover the codebase root directory.
Discovery order:
1. Find nearest parent with `.cocoindex_code` directory (re-anchor to previously-indexed tree)
2. Find nearest parent with any common project root marker
3. Fall back to current working directory
"""
cwd = Path.cwd()
# First, look for existing .cocoindex_code directory
root = _find_root_with_marker(cwd, [".cocoindex_code"])
if root is not None:
return root
# Then, look for common project root markers
markers = [".git", "pyproject.toml", "package.json", "Cargo.toml", "go.mod"]
root = _find_root_with_marker(cwd, markers)
return root if root is not None else cwd
@dataclass
class Config:
"""Configuration loaded from environment variables."""
codebase_root_path: Path
embedding_model: str
index_dir: Path
device: str | None
trust_remote_code: bool
extra_extensions: dict[str, str | None]
@classmethod
def from_env(cls) -> Config:
"""Load configuration from environment variables."""
# Get root path from env or discover it
root_path_str = os.environ.get("COCOINDEX_CODE_ROOT_PATH")
if root_path_str:
root = Path(root_path_str).resolve()
else:
root = _discover_codebase_root()
# Get embedding model
# Prefix "sbert/" for SentenceTransformers models, otherwise LiteLLM.
embedding_model = os.environ.get(
"COCOINDEX_CODE_EMBEDDING_MODEL",
_DEFAULT_MODEL,
)
# Index directory is always under the root
index_dir = root / ".cocoindex_code"
# Device: auto-detect CUDA or use env override
device = os.environ.get("COCOINDEX_CODE_DEVICE")
# trust_remote_code: opt-in via env var only.
# sentence-transformers 5.x+ supports Jina models natively, so
# auto-enabling this for jinaai/ models causes failures with
# transformers 5.x (removed find_pruneable_heads_and_indices).
trust_remote_code = os.environ.get("COCOINDEX_CODE_TRUST_REMOTE_CODE", "").lower() in (
"1",
"true",
"yes",
)
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
extra_extensions: dict[str, str | None] = {}
for token in raw_extra.split(","):
token = token.strip()
if not token:
continue
if ":" in token:
ext, lang = token.split(":", 1)
extra_extensions[f".{ext.strip()}"] = lang.strip() or None
else:
extra_extensions[f".{token}"] = None
return cls(
codebase_root_path=root,
embedding_model=embedding_model,
index_dir=index_dir,
device=device,
trust_remote_code=trust_remote_code,
extra_extensions=extra_extensions,
)
@property
def target_sqlite_db_path(self) -> Path:
"""Path to the vector index SQLite database."""
return self.index_dir / "target_sqlite.db"
@property
def cocoindex_db_path(self) -> Path:
"""Path to the CocoIndex state database."""
return self.index_dir / "cocoindex.db"
# Module-level singleton — imported directly by all modules that need configuration
config: Config = Config.from_env()