added docstrings to ingestion, retrieval and generation functions

Amelie-Artmann · Amelie-Artmann · commit f3608c83a5a1 · 2026-03-12T09:52:01.000+01:00
diff --git a/app/generation/response.py b/app/generation/response.py
@@ -4,6 +4,17 @@
 client = get_mistral_client()
 
 def generate_response(query: str, context: list[tuple[str, float]]) -> str:
+    """
+    Generate a response to a user query using DevGuard documentation context.
+
+    Formats the provided context into a prompt and sends it to the Mistral API. If context
+    is unavailable, the assistant will indicate so. If the query is unrelated to DevGuard,
+    the assistant will politely decline and redirect to DevGuard topics.
+
+    Safe prompt prepends this system prompt: "Always assist with care, respect, and truth.
+    Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or
+    negative content. Ensure replies promote fairness and positivity."
+    """
     # format context
     context_text = "\n\n".join(
         f"- {content}" for content, _ in context
@@ -25,10 +36,6 @@ def generate_response(query: str, context: list[tuple[str, float]]) -> str:
 
     message= [{"role": "user", "content": prompt}]
 
-    """
-        Toggling the safe prompt will prepend your messages with the following system prompt:
-        Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.
-    """
     response = client.chat.complete(
         model=MODEL_GENERATION,
         messages=message,
diff --git a/app/ingestion/chunking.py b/app/ingestion/chunking.py
@@ -1,7 +1,7 @@
 from app.config import CHUNK_SIZE, OVERLAP_SIZE
 
-# split the given docs up in chunks without spliting up words
 def chunking(docs: str) -> list[str]:
+    """Split documentation into chunks of approximately CHUNK_SIZE without breaking words."""
     chunks : list[str] = []
     start : int = 0
     while start < len(docs):
@@ -19,8 +19,9 @@ def chunking(docs: str) -> list[str]:
             start = last_space - OVERLAP_SIZE
     return chunks
 
-# option: apply overlap to the chunks after initial chunking to ensure that there is some context between them
 def apply_overlap(chunks: list[str]) -> list[str]:
+    """Apply overlap to chunks by prepending the last OVERLAP_SIZE characters
+    from the previous chunk to each subsequent chunk."""
     if OVERLAP_SIZE <= 0:
         return chunks
 
@@ -38,9 +39,8 @@ def apply_overlap(chunks: list[str]) -> list[str]:
     return overlapped
 
 
-# split recursively for a hierarchy of separators
-# attempt to split on high-level separators first, then move to increasingly finer separators if chunks remain too large
 def recursive_chunking(docs: str, separators: list[str] = ["\n\n", "\n", ". ", " ", ""]):
+    """Split text recursively using a hierarchy of separators, from coarsest to finest."""
     # base case
     if len(docs) <= CHUNK_SIZE:
         return [docs]
diff --git a/app/ingestion/embedding.py b/app/ingestion/embedding.py
@@ -3,8 +3,8 @@
 from app.clients import get_mistral_client
 from app.config import BATCH_SIZE, MODEL_EMBEDDING
 
-# get the embeddings for a list of chunks, return a list of embeddings
 def get_embeddings(chunks: list[str]) -> list[list[float]]:
+    """Generate and return embeddings for a list of text chunks."""
     client = get_mistral_client()
     embeddings: list[list[float]] = []
     # call the api with batches to avoid hitting the rate limit
@@ -19,8 +19,8 @@ def get_embeddings(chunks: list[str]) -> list[list[float]]:
             embeddings.append(list(embedding))
     return embeddings
 
-# get embedding for a single chunk of text
 def text_embedding(chunk: str) -> list[float]:
+    """Generate an embedding for a single piece of text."""
     client = get_mistral_client()
     # call the mistral api to get the embedding for the given text
     response = client.embeddings.create(
diff --git a/app/ingestion/reader.py b/app/ingestion/reader.py
@@ -1,13 +1,12 @@
 import os
 from app.config import PATH_DIR
 
-# read the docs and return as a single string
 def read_docs() -> str:
-    # search for all .md files in the directory
+    """Traverse PATH_DIR and concatenate all markdown files."""
     docs : str = ""
     for root, _, files in os.walk(str(PATH_DIR)):
         for file in files:
             if file.endswith(".md"):
-                with open(os.path.join(root, file), "r") as f:
+                with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                     docs += f.read() + "\n"
     return docs
diff --git a/app/retrieval/vector_store.py b/app/retrieval/vector_store.py
@@ -2,6 +2,10 @@
 from app.ingestion.embedding import text_embedding
 
 def retrieve_top_k(query: str, k: int = 5):
+    """Retrieve the top k most similar documents to the given query.
+
+    This function computes the embedding for the query, queries the vector database
+    for documents ordered by cosine similarity, and returns the top k results."""
     embedding = text_embedding(query)
 
     conn = get_db_connection()