Skip to content

Commit f3608c8

Browse files
added docstrings to ingestion, retrieval and generation functions
1 parent 1768b77 commit f3608c8

5 files changed

Lines changed: 23 additions & 13 deletions

File tree

app/generation/response.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,17 @@
44
client = get_mistral_client()
55

66
def generate_response(query: str, context: list[tuple[str, float]]) -> str:
7+
"""
8+
Generate a response to a user query using DevGuard documentation context.
9+
10+
Formats the provided context into a prompt and sends it to the Mistral API. If context
11+
is unavailable, the assistant will indicate so. If the query is unrelated to DevGuard,
12+
the assistant will politely decline and redirect to DevGuard topics.
13+
14+
Safe prompt prepends: "Always assist with care, respect, and truth. Respond with
15+
utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative
16+
content. Ensure replies promote fairness and positivity."
17+
"""
718
# format context
819
context_text = "\n\n".join(
920
f"- {content}" for content, _ in context
@@ -25,10 +36,6 @@ def generate_response(query: str, context: list[tuple[str, float]]) -> str:
2536

2637
message= [{"role": "user", "content": prompt}]
2738

28-
"""
29-
Toggling the safe prompt will prepend your messages with the following system prompt:
30-
Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.
31-
"""
3239
response = client.chat.complete(
3340
model=MODEL_GENERATION,
3441
messages=message,

app/ingestion/chunking.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from app.config import CHUNK_SIZE, OVERLAP_SIZE
22

3-
# split the given docs up in chunks without spliting up words
43
def chunking(docs: str) -> list[str]:
4+
"""Split documentation into chunks of approximately CHUNK_SIZE without breaking words."""
55
chunks : list[str] = []
66
start : int = 0
77
while start < len(docs):
@@ -19,8 +19,9 @@ def chunking(docs: str) -> list[str]:
1919
start = last_space - OVERLAP_SIZE
2020
return chunks
2121

22-
# option: apply overlap to the chunks after initial chunking to ensure that there is some context between them
2322
def apply_overlap(chunks: list[str]) -> list[str]:
23+
"""Apply overlap to chunks by prepending the last OVERLAP_SIZE characters
24+
from the previous chunk to each subsequent chunk."""
2425
if OVERLAP_SIZE <= 0:
2526
return chunks
2627

@@ -38,9 +39,8 @@ def apply_overlap(chunks: list[str]) -> list[str]:
3839
return overlapped
3940

4041

41-
# split recursively for a hierarchy of separators
42-
# attempt to split on high-level separators first, then move to increasingly finer separators if chunks remain too large
4342
def recursive_chunking(docs: str, separators: list[str] = ["\n\n", "\n", ". ", " ", ""]):
43+
"""Split text recursively using a hierarchy of separators."""
4444
# base case
4545
if len(docs) <= CHUNK_SIZE:
4646
return [docs]

app/ingestion/embedding.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
from app.clients import get_mistral_client
44
from app.config import BATCH_SIZE, MODEL_EMBEDDING
55

6-
# get the embeddings for a list of chunks, return a list of embeddings
76
def get_embeddings(chunks: list[str]) -> list[list[float]]:
7+
"""Generate and return embeddings for a list of text chunks."""
88
client = get_mistral_client()
99
embeddings: list[list[float]] = []
1010
# call the api with batches to avoid hitting the rate limit
@@ -19,8 +19,8 @@ def get_embeddings(chunks: list[str]) -> list[list[float]]:
1919
embeddings.append(list(embedding))
2020
return embeddings
2121

22-
# get embedding for a single chunk of text
2322
def text_embedding(chunk: str) -> list[float]:
23+
"""Generate an embedding for a single piece of text."""
2424
client = get_mistral_client()
2525
# call the mistral api to get the embedding for the given text
2626
response = client.embeddings.create(

app/ingestion/reader.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
import os
22
from app.config import PATH_DIR
33

4-
# read the docs and return as a single string
54
def read_docs() -> str:
6-
# search for all .md files in the directory
5+
"""Traverse PATH_DIR and concatenate all markdown files."""
76
docs : str = ""
87
for root, _, files in os.walk(str(PATH_DIR)):
98
for file in files:
109
if file.endswith(".md"):
11-
with open(os.path.join(root, file), "r") as f:
10+
with open(os.path.join(root, file), "r", encoding="utf-8") as f:
1211
docs += f.read() + "\n"
1312
return docs

app/retrieval/vector_store.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
from app.ingestion.embedding import text_embedding
33

44
def retrieve_top_k(query: str, k: int = 5):
5+
"""Retrieve the top k most similar documents to the given query.
6+
7+
This function computes the embedding for the query, queries the vector database
8+
for documents ordered by cosine similarity, and returns the top k results."""
59
embedding = text_embedding(query)
610

711
conn = get_db_connection()

0 commit comments

Comments
 (0)