chore: treat chatbot faiss index as generated state

akgohain · akgohain · commit c41aa39f9fbd · 2026-04-07T11:09:02.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ __pycache__
 sql_app.db
 uploads
 pytorch_connectomics
+server_api/chatbot/faiss_index/
diff --git a/README.md b/README.md
@@ -37,6 +37,23 @@ PYTC_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000,null
 PYTC_NEUROGLANCER_PUBLIC_BASE=http://localhost:4244
 ```
 
+## Chatbot Docs Index
+
+The chatbot's FAISS index is generated locally from the markdown files in
+`server_api/chatbot/file_summaries/` and should not be committed to git.
+
+An existing index is not refreshed automatically, so whenever you update those markdown docs, rebuild the generated index with:
+
+```
+uv run python server_api/chatbot/update_faiss.py
+```
+
+You can override the embeddings endpoint if needed:
+
+```
+OLLAMA_BASE_URL=http://localhost:11434 uv run python server_api/chatbot/update_faiss.py
+```
+
 If restarting after a crash or interrupted session, kill any lingering processes first:
 
 ```
diff --git a/server_api/chatbot/chatbot.py b/server_api/chatbot/chatbot.py
@@ -14,6 +14,7 @@
 from langchain_core.tools import tool
 from langchain.agents import create_agent
 from server_api.utils.utils import process_path
+from server_api.chatbot.update_faiss import ensure_faiss_index
 from server_api.chatbot.tools import (
     list_training_configs,
     read_config,
@@ -124,6 +125,11 @@ def build_chain():
     llm = ChatOllama(model=ollama_model, base_url=ollama_base_url, temperature=0)
     embeddings = OllamaEmbeddings(model=ollama_embed_model, base_url=ollama_base_url)
     faiss_path = process_path("server_api/chatbot/faiss_index")
+    if ensure_faiss_index(
+        model=ollama_embed_model,
+        base_url=ollama_base_url,
+    ):
+        print(f"[SEARCH] Generated chatbot FAISS index at {faiss_path}")
     vectorstore = FAISS.load_local(
         faiss_path,
         embeddings,
@@ -301,6 +307,11 @@ def build_helper_chain():
     llm = ChatOllama(model=ollama_model, base_url=ollama_base_url, temperature=0)
     embeddings = OllamaEmbeddings(model=ollama_embed_model, base_url=ollama_base_url)
     faiss_path = process_path("server_api/chatbot/faiss_index")
+    if ensure_faiss_index(
+        model=ollama_embed_model,
+        base_url=ollama_base_url,
+    ):
+        print(f"[SEARCH] Generated chatbot FAISS index at {faiss_path}")
     vectorstore = FAISS.load_local(
         faiss_path,
         embeddings,
diff --git a/server_api/chatbot/faiss_index/index.faiss b/server_api/chatbot/faiss_index/index.faiss
diff --git a/server_api/chatbot/faiss_index/index.pkl b/server_api/chatbot/faiss_index/index.pkl
diff --git a/server_api/chatbot/update_faiss.py b/server_api/chatbot/update_faiss.py
@@ -1,54 +1,49 @@
-# How to update faiss_index:
-#     1. Edit the markdown files in server_api/chatbot/file_summaries/ as needed.
-#        These are end-user-focused guides (one per application page/feature) that
-#        serve as the knowledge base for the RAG chatbot.
-#     2. Run this script:
-#         python server_api/chatbot/update_faiss.py
-#
-#     You can override the embeddings model and Ollama base URL via:
-#     - Environment variables: OLLAMA_EMBED_MODEL, OLLAMA_BASE_URL
-#     - CLI arguments: --model, --base-url
-
-import os
 import argparse
+import os
 from pathlib import Path
-from langchain_core.documents import Document
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import FAISS
-from langchain_ollama import OllamaEmbeddings
+from typing import Optional, Tuple
 
+# Ollama endpoint used when no explicit base URL is passed and the
+# OLLAMA_BASE_URL environment variable is not set.
+DEFAULT_OLLAMA_BASE_URL = "http://cscigpu08.bc.edu:4443"
+# Embeddings model used when no explicit model is passed and the
+# OLLAMA_EMBED_MODEL environment variable is not set.
+DEFAULT_OLLAMA_EMBED_MODEL = "qwen3-embedding:8b"
+# Files that must all be present in the index directory for the index to be
+# treated as already generated.
+INDEX_FILENAMES = ("index.faiss", "index.pkl")
+
+
+def get_chatbot_paths(base_dir: Optional[Path] = None) -> Tuple[Path, Path]:
+    """Return the chatbot's summaries and FAISS index directory paths.
+
+    Args:
+        base_dir: Directory containing both sub-directories. When omitted,
+            the directory that holds this module is used.
+
+    Returns:
+        A ``(summaries_directory, faiss_directory)`` tuple of resolved paths.
+        Neither path is created or checked for existence here.
+    """
+    root = (base_dir or Path(__file__).parent).resolve()
+    return root / "file_summaries", root / "faiss_index"
 
-def main():
-    # Parse CLI arguments
-    parser = argparse.ArgumentParser(
-        description="Update FAISS index for RAG chatbot documentation search"
-    )
-    parser.add_argument(
-        "--model",
-        default=None,
-        help="Ollama embeddings model (default: from OLLAMA_EMBED_MODEL env or 'qwen3-embedding:8b')",
-    )
-    parser.add_argument(
-        "--base-url",
-        default=None,
-        help="Ollama base URL (default: from OLLAMA_BASE_URL env or 'http://cscigpu08.bc.edu:4443')",
-    )
-    args = parser.parse_args()
 
-    # Use same defaults as build_chain() in chatbot.py
-    embed_model = args.model or os.getenv("OLLAMA_EMBED_MODEL", "qwen3-embedding:8b")
-    base_url = args.base_url or os.getenv(
-        "OLLAMA_BASE_URL", "http://cscigpu08.bc.edu:4443"
+def resolve_ollama_settings(
+    model: Optional[str] = None, base_url: Optional[str] = None
+) -> Tuple[str, str]:
+    """Resolve the embeddings model and Ollama base URL to use.
+
+    Precedence for each setting: the explicit argument (if truthy), then the
+    environment variable (``OLLAMA_EMBED_MODEL`` / ``OLLAMA_BASE_URL``), then
+    the module-level default.
+
+    Args:
+        model: Explicit embeddings model name, or ``None`` to fall back.
+        base_url: Explicit Ollama base URL, or ``None`` to fall back.
+
+    Returns:
+        A ``(embed_model, base_url)`` tuple.
+    """
+    # NOTE: os.getenv only substitutes the default when the variable is unset;
+    # a variable that is set to an empty string is returned as-is.
+    embed_model = model or os.getenv("OLLAMA_EMBED_MODEL", DEFAULT_OLLAMA_EMBED_MODEL)
+    resolved_base_url = base_url or os.getenv(
+        "OLLAMA_BASE_URL", DEFAULT_OLLAMA_BASE_URL
     )
+    return embed_model, resolved_base_url
 
-    print(f"Using embeddings model: {embed_model}")
-    print(f"Using Ollama base URL: {base_url}")
 
-    script_directory = Path(__file__).parent.resolve()
-    summaries_directory = script_directory / "file_summaries"
-    faiss_directory = script_directory / "faiss_index"
+def faiss_index_exists(faiss_directory: Path) -> bool:
+    """Return True only if every file in INDEX_FILENAMES exists in the directory.
+
+    Only the presence of regular files is checked; their contents are not
+    inspected.
+    """
+    return all((faiss_directory / name).is_file() for name in INDEX_FILENAMES)
+
+
+def build_faiss_index(
+    summaries_directory: Path,
+    faiss_directory: Path,
+    *,
+    model: Optional[str] = None,
+    base_url: Optional[str] = None,
+):
+    from langchain_core.documents import Document
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+    from langchain_community.vectorstores import FAISS
+    from langchain_ollama import OllamaEmbeddings
+
+    embed_model, resolved_base_url = resolve_ollama_settings(model, base_url)
+
+    print(f"Using embeddings model: {embed_model}")
+    print(f"Using Ollama base URL: {resolved_base_url}")
 
-    # Load full documents
     documents = []
     for md_file in summaries_directory.rglob("*.md"):
         summary = md_file.read_text(encoding="utf-8")
@@ -60,7 +55,6 @@ def main():
             )
         )
 
-    # Split into chunks for better embedding quality
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=1000,
         chunk_overlap=200,
@@ -73,12 +67,66 @@ def main():
             f"  - {c.metadata['source']} (start={c.metadata.get('start_index', '?')}, {len(c.page_content)} chars)"
         )
 
-    embeddings = OllamaEmbeddings(model=embed_model, base_url=base_url)
+    embeddings = OllamaEmbeddings(model=embed_model, base_url=resolved_base_url)
     vectorstore = FAISS.from_documents(chunks, embeddings)
     faiss_directory.mkdir(parents=True, exist_ok=True)
     vectorstore.save_local(str(faiss_directory))
     print(f"FAISS index saved with {vectorstore.index.ntotal} vectors")
 
 
+def ensure_faiss_index(
+    *,
+    summaries_directory: Optional[Path] = None,
+    faiss_directory: Optional[Path] = None,
+    model: Optional[str] = None,
+    base_url: Optional[str] = None,
+) -> bool:
+    """Build the FAISS index if its files are not already present.
+
+    Args:
+        summaries_directory: Directory of markdown sources; defaults to the
+            summaries path returned by ``get_chatbot_paths()``.
+        faiss_directory: Directory holding the index files; defaults to the
+            index path returned by ``get_chatbot_paths()``.
+        model: Embeddings model forwarded to ``build_faiss_index``.
+        base_url: Ollama base URL forwarded to ``build_faiss_index``.
+
+    Returns:
+        ``True`` if the index was built by this call, ``False`` if the index
+        files already existed and nothing was done.
+    """
+    default_summaries_directory, default_faiss_directory = get_chatbot_paths()
+    summaries_directory = summaries_directory or default_summaries_directory
+    faiss_directory = faiss_directory or default_faiss_directory
+
+    # Presence check only: an existing index is never rebuilt here, even if
+    # the markdown sources have changed since it was generated.
+    if faiss_index_exists(faiss_directory):
+        return False
+
+    build_faiss_index(
+        summaries_directory,
+        faiss_directory,
+        model=model,
+        base_url=base_url,
+    )
+    return True
+
+
+def main():
+    """Command-line entry point: rebuild the FAISS index unconditionally.
+
+    Accepts optional ``--model`` and ``--base-url`` arguments. Unlike
+    ``ensure_faiss_index``, this always calls ``build_faiss_index`` on the
+    default chatbot paths, whether or not index files already exist.
+    """
+    parser = argparse.ArgumentParser(
+        description="Rebuild the generated FAISS index for chatbot documentation search"
+    )
+    # Both options default to None so that build_faiss_index can fall back to
+    # the environment variables and module defaults.
+    parser.add_argument(
+        "--model",
+        default=None,
+        help=(
+            "Ollama embeddings model "
+            f"(default: OLLAMA_EMBED_MODEL or '{DEFAULT_OLLAMA_EMBED_MODEL}')"
+        ),
+    )
+    parser.add_argument(
+        "--base-url",
+        default=None,
+        help=(
+            "Ollama base URL "
+            f"(default: OLLAMA_BASE_URL or '{DEFAULT_OLLAMA_BASE_URL}')"
+        ),
+    )
+    args = parser.parse_args()
+
+    summaries_directory, faiss_directory = get_chatbot_paths()
+    build_faiss_index(
+        summaries_directory,
+        faiss_directory,
+        model=args.model,
+        base_url=args.base_url,
+    )
+
+
 if __name__ == "__main__":
     main()
diff --git a/tests/test_chatbot_faiss_generation.py b/tests/test_chatbot_faiss_generation.py
@@ -0,0 +1,86 @@
+from server_api.chatbot import update_faiss
+
+
+def test_faiss_index_exists_requires_both_files(tmp_path):
+    """faiss_index_exists is True only once both index files are present."""
+    faiss_dir = tmp_path / "faiss_index"
+    faiss_dir.mkdir()
+
+    # Empty directory: no index.
+    assert update_faiss.faiss_index_exists(faiss_dir) is False
+
+    # Only one of the two required files: still no index.
+    (faiss_dir / "index.faiss").write_text("stub")
+    assert update_faiss.faiss_index_exists(faiss_dir) is False
+
+    # Both files present: index is considered to exist.
+    (faiss_dir / "index.pkl").write_text("stub")
+    assert update_faiss.faiss_index_exists(faiss_dir) is True
+
+
+def test_ensure_faiss_index_builds_when_missing(tmp_path, monkeypatch):
+    """ensure_faiss_index calls the builder once and returns True when no index exists."""
+    summaries_dir = tmp_path / "summaries"
+    faiss_dir = tmp_path / "faiss_index"
+    summaries_dir.mkdir()
+    calls = []
+
+    # Stand-in for build_faiss_index: records its arguments and writes stub
+    # index files, so no embeddings backend is needed.
+    def fake_build(summaries_directory, target_directory, *, model=None, base_url=None):
+        calls.append((summaries_directory, target_directory, model, base_url))
+        target_directory.mkdir(parents=True, exist_ok=True)
+        (target_directory / "index.faiss").write_text("stub")
+        (target_directory / "index.pkl").write_text("stub")
+
+    monkeypatch.setattr(update_faiss, "build_faiss_index", fake_build)
+
+    generated = update_faiss.ensure_faiss_index(
+        summaries_directory=summaries_dir,
+        faiss_directory=faiss_dir,
+        model="embed-model",
+        base_url="http://example.test:11434",
+    )
+
+    assert generated is True
+    # The explicit paths and settings must be forwarded unchanged.
+    assert calls == [
+        (summaries_dir, faiss_dir, "embed-model", "http://example.test:11434")
+    ]
+    assert update_faiss.faiss_index_exists(faiss_dir) is True
+
+
+def test_ensure_faiss_index_skips_when_present(tmp_path, monkeypatch):
+    """ensure_faiss_index returns False without building when both index files exist."""
+    summaries_dir = tmp_path / "summaries"
+    faiss_dir = tmp_path / "faiss_index"
+    summaries_dir.mkdir()
+    faiss_dir.mkdir()
+    (faiss_dir / "index.faiss").write_text("stub")
+    (faiss_dir / "index.pkl").write_text("stub")
+
+    # Any call to the builder fails the test.
+    def fail_build(*args, **kwargs):
+        raise AssertionError("build_faiss_index should not be called")
+
+    monkeypatch.setattr(update_faiss, "build_faiss_index", fail_build)
+
+    generated = update_faiss.ensure_faiss_index(
+        summaries_directory=summaries_dir,
+        faiss_directory=faiss_dir,
+    )
+
+    assert generated is False
+
+
+def test_resolve_ollama_settings_uses_env_defaults(monkeypatch):
+    """With no arguments and both env vars unset, the module defaults are returned."""
+    # raising=False: the variables may or may not be set in the test environment.
+    monkeypatch.delenv("OLLAMA_EMBED_MODEL", raising=False)
+    monkeypatch.delenv("OLLAMA_BASE_URL", raising=False)
+
+    model, base_url = update_faiss.resolve_ollama_settings()
+
+    assert model == update_faiss.DEFAULT_OLLAMA_EMBED_MODEL
+    assert base_url == update_faiss.DEFAULT_OLLAMA_BASE_URL
+
+
+def test_resolve_ollama_settings_prefers_explicit_values(monkeypatch):
+    """Explicit arguments take precedence over the environment variables."""
+    monkeypatch.setenv("OLLAMA_EMBED_MODEL", "env-model")
+    monkeypatch.setenv("OLLAMA_BASE_URL", "http://env.test:9999")
+
+    model, base_url = update_faiss.resolve_ollama_settings(
+        model="cli-model",
+        base_url="http://cli.test:8888",
+    )
+
+    assert model == "cli-model"
+    assert base_url == "http://cli.test:8888"