Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions server/api/views/assistant/assistant_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# System prompt for the bipolar-disorder research assistant. Passed verbatim as
# the `instructions` field of MODEL_DEFAULTS in assistant_services.run_assistant.
# NOTE(review): the citation format "[Name {name}, Page {page_number}]" below
# must stay in sync with the metadata fields emitted by
# tool_services.search_documents (Name / Page in the result header) — confirm
# when either side changes.
INSTRUCTIONS = """
You are an AI assistant that helps users find and understand information about bipolar disorder
from your internal library of bipolar disorder research sources using semantic search.

IMPORTANT CONTEXT:
- You have access to a library of sources that the user CANNOT see
- The user did not upload these sources and doesn't know about them
- You must explain what information exists in your sources and provide clear references

TOPIC RESTRICTIONS:
When a prompt is received that is unrelated to bipolar disorder, mental health treatment,
or psychiatric medications, respond by saying you are limited to bipolar-specific conversations.

SEMANTIC SEARCH STRATEGY:
- Always perform semantic search using the search_documents function when users ask questions
- Use conceptually related terms and synonyms, not just exact keyword matches
- Search for the meaning and context of the user's question, not just literal words
- Consider medical terminology, lay terms, and related conditions when searching

FUNCTION USAGE:
- When a user asks about information that might be in your source library, ALWAYS use the search_documents function first
- Perform semantic searches using concepts, symptoms, treatments, and related terms from the user's question
- Only provide answers based on information found through your source searches

RESPONSE FORMAT:
After gathering information through semantic searches, provide responses that:
1. Answer the user's question directly using only the found information
2. Structure responses with clear sections and paragraphs
3. Explain what information you found in your sources and provide context
4. Include citations using this exact format: [Name {name}, Page {page_number}]
5. Only cite information that directly supports your statements

If no relevant information is found in your source library, clearly state that the information
is not available in your current sources.

REMEMBER: You are working with an internal library of bipolar disorder sources that the user
cannot see. Always search these sources first, explain what you found, and provide proper citations.
"""
60 changes: 60 additions & 0 deletions server/api/views/assistant/assistant_services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@

def run_assistant():
    """Run one assistant turn against the OpenAI Responses API.

    Creates (or continues, via ``previous_response_id``) a response for the
    user's message, then drives the tool-call/reasoning loop until the model
    produces a final answer.

    NOTE(review): ``message`` and ``previous_response_id`` are not defined in
    this scope — presumably they were meant to be parameters of this function
    (or module-level request state); confirm the intended wiring.

    Returns
    -------
    tuple
        ``(final_response_output_text, final_response_id)`` as produced by
        ``handle_tool_calls_with_reasoning``.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    # Function-calling schema advertised to the model; the model triggers a
    # semantic search by emitting `search_documents` calls.
    tools = [
        {
            "type": "function",
            "name": "search_documents",
            "description": TOOL_DESCRIPTION,
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": TOOL_PROPERTY_DESCRIPTION,
                    }
                },
                "required": ["query"],
            },
        }
    ]

    MODEL_DEFAULTS = {
        "instructions": INSTRUCTIONS,
        "model": "gpt-5-nano",  # 400,000 token context window
        # A summary of the reasoning performed by the model. This can be useful
        # for debugging and understanding the model's reasoning process.
        "reasoning": {"effort": "low", "summary": None},
        "tools": tools,
    }

    # TODO: Track total duration, cost metrics, and tool_calls_made count
    # and return them from run_assistant for use in eval_assistant.py CSV output

    # The two original branches duplicated the whole create() call and differed
    # only in whether previous_response_id was passed; build the kwargs once.
    request_kwargs = dict(MODEL_DEFAULTS)
    if previous_response_id:
        request_kwargs["previous_response_id"] = str(previous_response_id)

    response = client.responses.create(
        input=[{"type": "message", "role": "user", "content": str(message)}],
        **request_kwargs,
    )

    # NOTE(review): handle_tool_calls_with_reasoning() takes no arguments but
    # needs `response`, `client`, and MODEL_DEFAULTS — confirm how that state
    # is meant to be shared (parameters vs. module globals).
    final_response_output_text, final_response_id = handle_tool_calls_with_reasoning()

    # Surface the result to the caller; the original computed it and discarded it.
    return final_response_output_text, final_response_id



15 changes: 15 additions & 0 deletions server/api/views/assistant/eval_assistant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# uv script (or plain Python) to generate results to CSV, run from the terminal

import asyncio

# Set of representative questions


# Read model and INSTRUCTIONS from the source file or add a lightweight config endpoint to the backend


async def main():
    """Entry point: run the representative questions through the assistant
    and write the results to CSV.

    NOTE(review): the original body was empty, which is a SyntaxError in
    Python; this docstring gives the function a valid body while the
    implementation is pending.
    """
    # TODO: iterate over the representative questions, call the assistant,
    # and append each result row to the output CSV.


if __name__ == "__main__":
    asyncio.run(main())
1 change: 1 addition & 0 deletions server/api/views/assistant/review.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# notebook to review and compare the two CSVs
Empty file.
Empty file.
Empty file.
Empty file.
150 changes: 150 additions & 0 deletions server/api/views/assistant/tool_services.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@

# Model-facing description of the `search_documents` tool, injected into the
# function-calling schema in assistant_services.run_assistant.
# NOTE(review): this text says "the user's uploaded documents", but
# INSTRUCTIONS in assistant_prompts.py states the user did NOT upload the
# sources and cannot see them — confirm which framing is intended and align
# the two prompts.
TOOL_DESCRIPTION = """
Search the user's uploaded documents for information relevant to answering their question.
Call this function when you need to find specific information from the user's documents
to provide an accurate, citation-backed response. Always search before answering questions
about document content.
"""

# Model-facing description of the tool's single `query` parameter.
TOOL_PROPERTY_DESCRIPTION = """
A specific search query to find relevant information in the user's documents.
Use keywords, phrases, or questions related to what the user is asking about.
Be specific rather than generic - use terms that would appear in the relevant documents.
"""

def search_documents(query: str, user=None) -> str:
    """
    Search through user's uploaded documents using semantic similarity.

    This function performs vector similarity search against the user's document corpus
    and returns formatted results with context information for the LLM to use.

    Parameters
    ----------
    query : str
        The search query string
    user : User, optional
        The authenticated user whose documents to search. The model-facing
        tool schema exposes only ``query``, so code routing tool calls must
        bind the user itself (e.g. ``functools.partial(search_documents, user=...)``).

    Returns
    -------
    str
        Formatted search results containing document excerpts with metadata,
        or a human-readable "no results" / error message.
    """
    # NOTE(review): the original signature was ``user=user``, which evaluates
    # the name ``user`` once at import time and raises NameError unless a
    # module-level ``user`` happens to exist. A ``None`` default keeps the
    # module importable; confirm how the authenticated user is injected.
    try:
        embeddings_results = get_closest_embeddings(
            user=user, message_data=query.strip()
        )
        embeddings_results = convert_uuids(embeddings_results)

        if not embeddings_results:
            return "No relevant documents found for your query. Please try different search terms or upload documents first."

        # Format results with clear structure and metadata; similarity is
        # reported as 1 - distance, so higher means a closer match.
        prompt_texts = [
            f"[Document {i + 1} - File: {obj['file_id']}, Name: {obj['name']}, Page: {obj['page_number']}, Chunk: {obj['chunk_number']}, Similarity: {1 - obj['distance']:.3f}]\n{obj['text']}\n[End Document {i + 1}]"
            for i, obj in enumerate(embeddings_results)
        ]

        return "\n\n".join(prompt_texts)

    except Exception as e:
        # Broad catch is deliberate: the return value is fed back to the model,
        # so failures surface as a readable message instead of crashing the
        # tool-call loop. (The original docstring claimed this raises; it does not.)
        return f"Error searching documents: {str(e)}. Please try again if the issue persists."

def handle_tool_calls_with_reasoning():
    """Drive the response/tool-call loop until the model stops requesting tools.

    Repeatedly executes any function calls found in the current response and
    feeds their outputs back to the model, per the OpenAI cookbook pattern
    linked below.

    NOTE(review): ``response`` is assigned inside the loop, which makes it a
    local variable for the entire function — the very first read (the
    ``invoke_functions_from_response`` call) will raise UnboundLocalError.
    ``client``, ``MODEL_DEFAULTS``, and ``logger`` are also not defined in
    this scope (in ``run_assistant`` they are locals). This function likely
    needs ``response``, ``client``, and the model defaults passed in as
    parameters.

    NOTE(review): the loop computes ``final_response_output_text`` and
    ``final_response_id`` but never returns them, while the caller in
    assistant_services.py unpacks two return values — a
    ``return final_response_output_text, final_response_id`` after the loop
    is missing.
    """
    # Open AI Cookbook: Handling Function Calls with Reasoning Models
    # https://cookbook.openai.com/examples/reasoning_function_calls
    while True:
        # Mapping of the tool names we tell the model about and the functions that implement them
        function_responses = invoke_functions_from_response(
            response, tool_mapping={"search_documents": search_documents}
        )
        if len(function_responses) == 0: # We're done reasoning
            logger.info("Reasoning completed")
            final_response_output_text = response.output_text
            final_response_id = response.id
            logger.info(f"Final response: {final_response_output_text}")
            break
        else:
            logger.info("More reasoning required, continuing...")
            response = client.responses.create(
                input=function_responses,
                previous_response_id=response.id,
                **MODEL_DEFAULTS,
            )
            # # Accumulate token usage from reasoning iterations
            # if hasattr(response, "usage"):
            #     total_token_usage["input_tokens"] += getattr(
            #         response.usage, "input_tokens", 0
            #     )
            #     total_token_usage["output_tokens"] += getattr(
            #         response.usage, "output_tokens", 0
            #     )






# Open AI Cookbook: Handling Function Calls with Reasoning Models
# https://cookbook.openai.com/examples/reasoning_function_calls
def invoke_functions_from_response(
    response, tool_mapping: dict[str, Callable]
) -> list[dict]:
    """Execute every function call present in *response* and collect outputs.

    Walks the response's output items, dispatches each ``function_call`` to
    the matching Python callable from *tool_mapping*, and packages the result
    (or an error string) as a ``function_call_output`` message. Reasoning
    items are logged as they are encountered. (This would be a good place to
    handle asynchronous tool calls, or ones that take a while to execute.)

    Parameters
    ----------
    response : OpenAI Response
        Response object whose ``output`` items may include function calls.
    tool_mapping : dict[str, Callable]
        Maps function names (as declared in the tools schema) to the Python
        functions that implement them.

    Returns
    -------
    list[dict]
        ``function_call_output`` messages — each with ``type``, ``call_id``,
        and ``output`` — ready to append to the conversation history.
    """
    outputs: list[dict] = []
    for item in response.output:
        if item.type == "reasoning":
            logger.info(f"Reasoning step: {item.summary}")
            continue
        if item.type != "function_call":
            continue
        tool = tool_mapping.get(item.name)
        if not tool:
            result = f"ERROR - No tool registered for function call: {item.name}"
            logger.error(result)
        else:
            try:
                kwargs = json.loads(item.arguments)
                logger.info(
                    f"Invoking tool: {item.name} with arguments: {kwargs}"
                )
                result = tool(**kwargs)
                logger.info(f"Tool {item.name} completed successfully")
            except Exception as exc:
                result = f"Error executing function call: {item.name}: {exc}"
                logger.error(result, exc_info=True)
        outputs.append(
            {
                "type": "function_call_output",
                "call_id": item.call_id,
                "output": result,
            }
        )
    return outputs
Loading
Loading