Skip to content

Commit ba692b0

Browse files
authored
chore: upgrade to cocoindex 1.0.0a24 and use its embedder (#36)
1 parent e3ab9f0 commit ba692b0

7 files changed

Lines changed: 13 additions & 303 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ classifiers = [
2323

2424
dependencies = [
2525
"mcp>=1.0.0",
26-
"cocoindex[litellm]==1.0.0a23",
26+
"cocoindex[litellm]==1.0.0a24",
2727
"sentence-transformers>=2.2.0",
2828
"sqlite-vec>=0.1.0",
2929
"pydantic>=2.0.0",
@@ -82,10 +82,6 @@ python_version = "3.11"
8282
strict = true
8383
ignore_missing_imports = true
8484

85-
[[tool.mypy.overrides]]
86-
module = "cocoindex_code.embedder"
87-
warn_unused_ignores = false
88-
8985
[tool.pytest.ini_options]
9086
testpaths = ["tests"]
9187
python_files = ["test_*.py"]

src/cocoindex_code/config.py

Lines changed: 0 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
from dataclasses import dataclass
77
from pathlib import Path
88

9-
_SBERT_PREFIX = "sbert/"
109
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
1110

1211

@@ -65,7 +64,6 @@ class Config:
6564
index_dir: Path
6665
device: str
6766
trust_remote_code: bool
68-
batch_size: int
6967
extra_extensions: dict[str, str | None]
7068

7169
@classmethod
@@ -101,19 +99,6 @@ def from_env(cls) -> Config:
10199
"yes",
102100
)
103101

104-
# Batch size for local embedding model
105-
_raw_batch_size = os.environ.get("COCOINDEX_CODE_BATCH_SIZE", "16")
106-
try:
107-
batch_size = int(_raw_batch_size)
108-
except ValueError:
109-
raise ValueError(
110-
f"COCOINDEX_CODE_BATCH_SIZE must be a positive integer, got: {_raw_batch_size!r}"
111-
) from None
112-
if batch_size <= 0:
113-
raise ValueError(
114-
f"COCOINDEX_CODE_BATCH_SIZE must be a positive integer, got: {batch_size}"
115-
)
116-
117102
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
118103
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
119104
extra_extensions: dict[str, str | None] = {}
@@ -133,7 +118,6 @@ def from_env(cls) -> Config:
133118
index_dir=index_dir,
134119
device=device,
135120
trust_remote_code=trust_remote_code,
136-
batch_size=batch_size,
137121
extra_extensions=extra_extensions,
138122
)
139123

src/cocoindex_code/embedder.py

Lines changed: 0 additions & 102 deletions
This file was deleted.

src/cocoindex_code/shared.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,7 @@
1414

1515
if TYPE_CHECKING:
1616
from cocoindex.ops.litellm import LiteLLMEmbedder
17-
18-
from .embedder import LocalEmbedder
17+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
1918

2019
from .config import config
2120

@@ -24,9 +23,9 @@
2423
SBERT_PREFIX = "sbert/"
2524

2625
# Initialize embedder at module level based on model prefix
27-
embedder: LocalEmbedder | LiteLLMEmbedder
26+
embedder: SentenceTransformerEmbedder | LiteLLMEmbedder
2827
if config.embedding_model.startswith(SBERT_PREFIX):
29-
from .embedder import LocalEmbedder
28+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
3029

3130
_model_name = config.embedding_model[len(SBERT_PREFIX) :]
3231
# Models that define a "query" prompt for asymmetric retrieval.
@@ -35,7 +34,7 @@
3534
# Models whose custom remote code is known-compatible with transformers 5.x.
3635
_KNOWN_REMOTE_CODE_MODELS = {"nomic-ai/CodeRankEmbed"}
3736
_trust = config.trust_remote_code or _model_name in _KNOWN_REMOTE_CODE_MODELS
38-
embedder = LocalEmbedder(
37+
embedder = SentenceTransformerEmbedder(
3938
_model_name,
4039
device=config.device,
4140
trust_remote_code=_trust,

tests/test_config.py

Lines changed: 0 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,6 @@
66
from pathlib import Path
77
from unittest.mock import patch
88

9-
import pytest
10-
119
from cocoindex_code.config import Config, _detect_device
1210

1311

@@ -83,63 +81,6 @@ def test_default_model_is_minilm(self, tmp_path: Path) -> None:
8381
assert "all-MiniLM-L6-v2" in config.embedding_model
8482

8583

86-
class TestConfigBatchSize:
87-
"""Tests for COCOINDEX_CODE_BATCH_SIZE env var."""
88-
89-
def test_default_batch_size_is_16(self, tmp_path: Path) -> None:
90-
with patch.dict(
91-
os.environ,
92-
{"COCOINDEX_CODE_ROOT_PATH": str(tmp_path)},
93-
):
94-
os.environ.pop("COCOINDEX_CODE_BATCH_SIZE", None)
95-
config = Config.from_env()
96-
assert config.batch_size == 16
97-
98-
def test_batch_size_reads_env_var(self, tmp_path: Path) -> None:
99-
with patch.dict(
100-
os.environ,
101-
{
102-
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
103-
"COCOINDEX_CODE_BATCH_SIZE": "32",
104-
},
105-
):
106-
config = Config.from_env()
107-
assert config.batch_size == 32
108-
109-
def test_batch_size_raises_on_non_integer(self, tmp_path: Path) -> None:
110-
with patch.dict(
111-
os.environ,
112-
{
113-
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
114-
"COCOINDEX_CODE_BATCH_SIZE": "notanint",
115-
},
116-
):
117-
with pytest.raises(ValueError, match="COCOINDEX_CODE_BATCH_SIZE"):
118-
Config.from_env()
119-
120-
def test_batch_size_raises_on_zero(self, tmp_path: Path) -> None:
121-
with patch.dict(
122-
os.environ,
123-
{
124-
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
125-
"COCOINDEX_CODE_BATCH_SIZE": "0",
126-
},
127-
):
128-
with pytest.raises(ValueError, match="COCOINDEX_CODE_BATCH_SIZE"):
129-
Config.from_env()
130-
131-
def test_batch_size_raises_on_negative(self, tmp_path: Path) -> None:
132-
with patch.dict(
133-
os.environ,
134-
{
135-
"COCOINDEX_CODE_ROOT_PATH": str(tmp_path),
136-
"COCOINDEX_CODE_BATCH_SIZE": "-1",
137-
},
138-
):
139-
with pytest.raises(ValueError, match="COCOINDEX_CODE_BATCH_SIZE"):
140-
Config.from_env()
141-
142-
14384
class TestExtraExtensions:
14485
"""Tests for COCOINDEX_CODE_EXTRA_EXTENSIONS env var."""
14586

tests/test_embedder.py

Lines changed: 0 additions & 108 deletions
This file was deleted.

0 commit comments

Comments
 (0)