Skip to content

Commit 4a69ab5

Browse files
authored
Merge pull request #9 from M9nx/copilot/add-batch-size-option
Add configurable embedding batch size to indexing CLI
2 parents def0d6a + 51914be commit 4a69ab5

2 files changed

Lines changed: 58 additions & 5 deletions

File tree

semantic_code_intelligence/cli/commands/index_cmd.py

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111
import click
1212

13-
from semantic_code_intelligence.config.settings import AppConfig
13+
from semantic_code_intelligence.config.settings import AppConfig, load_config, save_config
1414
from semantic_code_intelligence.services.indexing_service import run_indexing
1515
from semantic_code_intelligence.utils.logging import (
1616
get_logger,
@@ -201,7 +201,14 @@ def _on_changes(events: list) -> None:
201201
type=str,
202202
help="Switch embedding model and re-index in one step.",
203203
)
204-
def index_cmd(project_path: Path | None, force: bool, watch: bool, add_file: str | None, inspect_file: str | None, switch_model: str | None) -> int:
204+
@click.option(
205+
"--batch-size",
206+
"batch_size",
207+
type=click.IntRange(1, None),
208+
default=None,
209+
help="Embedding batch size for chunk processing (overrides config).",
210+
)
211+
def index_cmd(project_path: Path | None, force: bool, watch: bool, add_file: str | None, inspect_file: str | None, switch_model: str | None, batch_size: int | None) -> int:
205212
"""Index a codebase for semantic search.
206213
207214
Scans the target directory, extracts code chunks, generates embeddings,
@@ -229,6 +236,8 @@ def index_cmd(project_path: Path | None, force: bool, watch: bool, add_file: str
229236
f"Project not initialized at {root}. Run 'codexa init' first."
230237
)
231238

239+
config: AppConfig | None = None
240+
232241
# --- Inspect mode: show metadata for a file ---
233242
if inspect_file:
234243
_inspect_file_index(root, inspect_file)
@@ -242,10 +251,9 @@ def index_cmd(project_path: Path | None, force: bool, watch: bool, add_file: str
242251
# --- Switch model inline: update config + force re-index ---
243252
if switch_model:
244253
from semantic_code_intelligence.embeddings.model_registry import resolve_model_name
245-
from semantic_code_intelligence.config.settings import load_config, save_config
246254

247255
resolved = resolve_model_name(switch_model)
248-
config = load_config(root)
256+
config = config if config is not None else load_config(root)
249257
old_model = config.embedding.model_name
250258
if old_model == resolved:
251259
print_info(f"Model already set to '{resolved}' — running normal index.")
@@ -255,10 +263,23 @@ def index_cmd(project_path: Path | None, force: bool, watch: bool, add_file: str
255263
print_success(f"Switched model: {old_model} → {resolved}")
256264
force = True # force re-index with new model
257265

266+
# --- Optional batch size override (only when indexing will run) ---
267+
if batch_size is not None:
268+
config = config if config is not None else load_config(root)
269+
prev_batch = config.embedding.batch_size
270+
if prev_batch != batch_size:
271+
config.embedding.batch_size = batch_size
272+
save_config(config, root)
273+
print_info(
274+
f"Embedding batch size updated: {prev_batch} → {batch_size} "
275+
"(applies to this and future indexing runs)."
276+
)
277+
else:
278+
print_info(f"Embedding batch size already set to {batch_size}.")
279+
258280
# --- Model consistency guard ---
259281
if not force:
260282
from semantic_code_intelligence.storage.index_manifest import IndexManifest
261-
from semantic_code_intelligence.config.settings import load_config
262283
index_dir = AppConfig.index_dir(root)
263284
manifest = IndexManifest.load(index_dir)
264285
if manifest:

semantic_code_intelligence/tests/test_cli.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88

99
import pytest
1010
from click.testing import CliRunner
11+
from semantic_code_intelligence.config.settings import load_config
1112

1213
from semantic_code_intelligence.cli.main import cli
1314
from semantic_code_intelligence.embeddings.generator import BYTES_PER_GB
@@ -133,6 +134,37 @@ def test_index_force_flag(self, runner: CliRunner, tmp_path: Path):
133134
result = runner.invoke(cli, ["index", str(tmp_path), "--force"])
134135
assert result.exit_code == 0
135136

137+
def test_index_batch_size_override(self, runner: CliRunner, tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
138+
project = tmp_path
139+
(project / "sample.py").write_text("def foo():\n return 1\n", encoding="utf-8")
140+
141+
captured: dict[str, int] = {}
142+
143+
class DummyResult:
144+
files_scanned = 1
145+
files_indexed = 1
146+
files_skipped = 0
147+
chunks_created = 1
148+
total_vectors = 1
149+
150+
def fake_run_indexing(project_root, force=False):
151+
cfg = load_config(project_root)
152+
captured["batch_size"] = cfg.embedding.batch_size
153+
return DummyResult()
154+
155+
monkeypatch.setattr(
156+
"semantic_code_intelligence.cli.commands.index_cmd.run_indexing",
157+
fake_run_indexing,
158+
)
159+
160+
runner.invoke(cli, ["init", str(project)])
161+
result = runner.invoke(cli, ["index", str(project), "--batch-size", "8"])
162+
assert result.exit_code == 0
163+
164+
config = json.loads((project / ".codexa" / "config.json").read_text(encoding="utf-8"))
165+
assert config["embedding"]["batch_size"] == 8
166+
assert captured["batch_size"] == 8
167+
136168
def test_index_network_oserror_is_nonfatal(self, runner: CliRunner, tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
137169
runner.invoke(cli, ["init", str(tmp_path)])
138170

0 commit comments

Comments
 (0)