From 682e9c076b984864bd03973c69299cb673f53034 Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Tue, 11 Feb 2025 10:10:48 +0000 Subject: [PATCH 01/32] upgrade --- src/tools/rag/write_descriptions.py | 313 ++++++++++++++++++++-------- 1 file changed, 231 insertions(+), 82 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 4518593e..5cb4e1e1 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -1,123 +1,272 @@ +"""Functions to create an index of files for RAG.""" + +import logging import os +import sys from pathlib import Path + +import chromadb +from dotenv import find_dotenv, load_dotenv from langchain.prompts import ChatPromptTemplate from langchain_core.output_parsers import StrOutputParser -from dotenv import load_dotenv, find_dotenv -import chromadb -import sys -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) -from src.utilities.util_functions import join_paths, read_coderrules -from src.utilities.start_work_functions import CoderIgnore, file_folder_ignored +from langchain_core.runnables.base import RunnableSequence + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))) +from src.utilities.exceptions import MissingEnvironmentVariableError +from src.utilities.util_functions import join_paths from src.utilities.llms import init_llms_mini -load_dotenv(find_dotenv()) -work_dir = os.getenv("WORK_DIR") +## Configure the logging level +logging.basicConfig(level=logging.INFO) -def is_code_file(file_path): +def relevant_extension(file_path: Path, file_extension_constraint: set[str]) -> bool: + """Checker for whether file extension indicates a script.""" # List of common code file extensions - code_extensions = { - '.js', '.jsx', '.ts', '.tsx', '.vue', '.py', '.rb', '.php', '.java', '.c', '.cpp', '.cs', '.go', '.swift', - '.kt', '.rs', '.htm','.html', '.css', '.scss', '.sass', '.less', '.prompt', - } - return file_path.suffix.lower() in code_extensions + return file_path.suffix.lower() in file_extension_constraint # read file content. place name of file in the top -def get_content(file_path): - with open(file_path, 'r', encoding='utf-8') as file: +def get_content(file_path: Path) -> str: + """Collect file name and content to return them together as string.""" + with open(file_path, encoding="utf-8") as file: content = file.read() - content = file_path.name + '\n' + content - return content - -def collect_file_pathes(subfolders, work_dir): - """ - Collect and return a list of allowed code files from the given subfolders - under the work_dir according to is_code_file criteria and .coderignore patterns. - """ - allowed_files = [] - for folder in subfolders: - for root, _, files in os.walk(work_dir + folder): - for file in files: - file_path = Path(root) / file - if not is_code_file(file_path): - continue - relative_path_str = file_path.relative_to(work_dir).as_posix() - if file_folder_ignored(relative_path_str): - continue - allowed_files.append(file_path) - return allowed_files + return file_path.name + "\n" + content -def write_descriptions(subfolders_with_files=['/']): - all_files = collect_file_pathes(subfolders_with_files, work_dir) +def evaluate_file(root: str, file: str, file_extension_constraint: set[str] | None, ignore: set[str]) -> Path | None: + """Return file path if the file is to be considered.""" + file_path = Path(root).joinpath(file) + if len(ignore.intersection(file_path.parts)) > 0: + return None + if file_extension_constraint and relevant_extension( + file_path, file_extension_constraint=file_extension_constraint + ): + return file_path + return None - coderrules = read_coderrules() - prompt = ChatPromptTemplate.from_template( -f"""First, get known with info about project (may be useful, may be not): - -''' -{coderrules} -''' +def files_in_directory( + directories_with_files_to_describe: list[str | Path], + file_extension_constraint: set[str] | None, + ignore: set[str], +) -> list[Path]: + """Fetch paths of files in directory.""" + files_to_describe = [] + for directory in directories_with_files_to_describe: + directory_files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))] + tmp = [ + evaluate_file( + root=str(directory), + file=file, + file_extension_constraint=file_extension_constraint, + ignore=ignore, + ) + for file in directory_files + ] + files_to_describe.extend(tmp) + for root, _, files in os.walk(directory): + tmp = [ + evaluate_file( + root=root, + file=file, + file_extension_constraint=file_extension_constraint, + ignore=ignore, + ) + for file in files + ] + files_to_describe.extend(tmp) + return files_to_describe -Describe the code in 4 sentences or less, focusing only on important information from integration point of view. -Write what file is responsible for. -Go traight to the thing in description, without starting sentence. +def save_file_description(file_path: Path, work_dir: str, description: str, file_description_dir: str) -> None: + """Save file description.""" + file_name = file_path.relative_to(work_dir).as_posix().replace("/", "=") + output_path = join_paths(file_description_dir, f"{file_name}.txt") + with open(output_path, "w", encoding="utf-8") as out_file: + out_file.write(description) -''' -{{code}} -''' -""" - ) - llms = init_llms_mini(tools=[], run_name='File Describer') - llm = llms[0] - chain = prompt | llm | StrOutputParser() - description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') - Path(description_folder).mkdir(parents=True, exist_ok=True) +def output_descriptions( + files_to_describe: list[Path], chain: RunnableSequence, file_description_dir: str, work_dir: str +) -> None: + """Generate & output file descriptions to designated directory in WORK_DIR.""" # iterate over all files, take 8 files at once batch_size = 8 - for i in range(0, len(all_files), batch_size): - files_iteration = all_files[i:i + batch_size] + for i in range(0, len(files_to_describe), batch_size): + files_iteration = [f for f in files_to_describe[i : i + batch_size] if f is not None] descriptions = chain.batch([get_content(file_path) for file_path in files_iteration]) - print(descriptions) + logging.debug(descriptions) + [ + save_file_description( + file_path=file_path, + work_dir=work_dir, + description=description, + file_description_dir=file_description_dir, + ) + for file_path, description in zip(files_iteration, descriptions, strict=True) + ] - for file_path, description in zip(files_iteration, descriptions): - file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=') - output_path = join_paths(description_folder, f"{file_name}.txt") - with open(output_path, 'w', encoding='utf-8') as out_file: - out_file.write(description) +def produce_descriptions( + directories_with_files_to_describe: list[str | Path], + file_description_dir: str, + work_dir: str, + ignore: set[str], + file_extension_constraint: set[str] | None = None, +) -> None: + """ + Produce short descriptions of files. Store the descriptions in .clean_coder folder in WORK_DIR. + Inputs: + directories_with_files_to_describe: directories from which files are to be described. + file_description_dir: directory where generated file descriptions are to be saved to. + work_dir: project directory worked on with Clean Coder. + ignore: files and folders to ignore. + file_extension_constraint: The list of file extension types accepted, if it's provided. -def upload_descriptions_to_vdb(): - chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base')) - collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" + Example: + work_dir = os.getenv("WORK_DIR") # provide your own directory of choice if WORK_DIR is not set. + if not work_dir: + msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" + raise MissingEnvironmentVariableError(msg) + file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") + file_extension_constraint = { + ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go", ".swift", + ".kt", ".rs", ".htm",".html", ".css", ".scss", ".sass", ".less", ".prompt", + } + ignore = {".clean_coder", ".coderrules"} + produce_descriptions(directories_with_files_to_describe=[work_dir], + file_description_dir=file_description_dir, + work_dir=work_dir, + file_extension_constraint=file_extension_constraint, + ignore=ignore, + ) + """ + files_to_describe = files_in_directory( + directories_with_files_to_describe=directories_with_files_to_describe, + file_extension_constraint=file_extension_constraint, + ignore=ignore, + ) - collection = chroma_client.get_or_create_collection( - name=collection_name + prompt = ChatPromptTemplate.from_template( + """Describe the following code in 4 sentences or less, focusing only on important information from integration point of view. + Write what file is responsible for.\n\n'''\n{code}''' + """, ) - # read files and upload to base - description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') - for root, _, files in os.walk(description_folder): + llms = init_llms_mini(tools=[], run_name="File Describer") + llm = llms[0] + chain = prompt | llm | StrOutputParser() + Path(file_description_dir).mkdir(parents=True, exist_ok=True) + output_descriptions( + files_to_describe=files_to_describe, work_dir=work_dir, chain=chain, file_description_dir=file_description_dir + ) + + +def upload_to_collection(collection: chromadb.PersistentClient, file_description_dir: str) -> None: + """Insert file information to chroma database.""" + for root, _, files in os.walk(file_description_dir): for file in files: file_path = Path(root) / file - with open(file_path, 'r', encoding='utf-8') as file: - content = file.read() + with open(file_path, encoding="utf-8") as f: + content = f.read() collection.upsert( documents=[ - content + content, ], - ids=[file_path.name.replace('=', '/').removesuffix(".txt")], + ids=[file_path.name.replace("=", "/").removesuffix(".txt")], ) -if __name__ == '__main__': - #provide optionally which subfolders needs to be checked, if you don't want to describe all project folder - write_descriptions(subfolders_with_files=['/']) +def upload_descriptions_to_vdb( + chroma_collection_name: str, + work_dir: str, + file_description_dir: str, + vdb_location: str = ".clean_coder/chroma_base", +) -> None: + """ + Upload file descriptions to chroma database. + + Inputs: + chroma_collection_name: name of the collection within Chroma vector database to save file descriptions in. + file_description_dir: directory where generated file descriptions are available. + work_dir: project directory worked on with Clean Coder. + vdb_location: (optional) location for storing the vector database. + + Example: + work_dir = os.getenv("WORK_DIR") # provide your own directory of choice if WORK_DIR is not set. + if not work_dir: + msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" + raise MissingEnvironmentVariableError(msg) + file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") + file_extension_constraint = { + ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go", ".swift", + ".kt", ".rs", ".htm",".html", ".css", ".scss", ".sass", ".less", ".prompt", + } + ignore = {".clean_coder", ".coderrules"} + produce_descriptions(directories_with_files_to_describe=[work_dir], + file_description_dir=file_description_dir, + work_dir=work_dir, + file_extension_constraint=file_extension_constraint, + ignore=ignore, + ) + chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" + upload_descriptions_to_vdb(chroma_collection_name=chroma_collection_name, work_dir=work_dir, file_description_dir=file_description_dir) + """ + chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location)) + collection = chroma_client.get_or_create_collection( + name=chroma_collection_name, + ) + + # read files and upload to base + upload_to_collection(collection=collection, file_description_dir=file_description_dir) + - upload_descriptions_to_vdb() +if __name__ == "__main__": + # provide optionally which subfolders needs to be checked, if you don't want to describe all project folder + # load environment + load_dotenv(find_dotenv()) + work_dir = os.getenv("WORK_DIR") + if not work_dir: + msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" + raise MissingEnvironmentVariableError(msg) + file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") + file_extension_constraint = { + ".js", + ".jsx", + ".ts", + ".tsx", + ".vue", + ".py", + ".rb", + ".php", + ".java", + ".c", + ".cpp", + ".cs", + ".go", + ".swift", + ".kt", + ".rs", + ".htm", + ".html", + ".css", + ".scss", + ".sass", + ".less", + ".prompt", + } + ignore = {".clean_coder", ".coderrules"} + produce_descriptions( + directories_with_files_to_describe=[work_dir], + file_description_dir=file_description_dir, + work_dir=work_dir, + file_extension_constraint=file_extension_constraint, + ignore=ignore, + ) + chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" + upload_descriptions_to_vdb( + chroma_collection_name=chroma_collection_name, work_dir=work_dir, file_description_dir=file_description_dir + ) From 983454560f2c9e80d7c3542c33caf8b042a83d9f Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Tue, 18 Feb 2025 16:05:08 +0000 Subject: [PATCH 02/32] custom exception message --- src/utilities/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 src/utilities/exceptions.py diff --git a/src/utilities/exceptions.py b/src/utilities/exceptions.py new file mode 100644 index 00000000..e1c8f66d --- /dev/null +++ b/src/utilities/exceptions.py @@ -0,0 +1,4 @@ +"""Custom exception messages.""" + +class MissingEnvironmentVariableError(Exception): + """Enviromental variable missing.""" From aae7de3cb217f4f76ae43dd201ced5c719130109 Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Tue, 18 Feb 2025 16:05:25 +0000 Subject: [PATCH 03/32] response to comments --- src/tools/rag/write_descriptions.py | 53 +++++++++++++---------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 5cb4e1e1..22a5e3ad 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -13,9 +13,9 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))) from src.utilities.exceptions import MissingEnvironmentVariableError -from src.utilities.util_functions import join_paths from src.utilities.llms import init_llms_mini - +from src.utilities.start_work_functions import file_folder_ignored +from src.utilities.util_functions import join_paths ## Configure the logging level logging.basicConfig(level=logging.INFO) @@ -35,13 +35,16 @@ def get_content(file_path: Path) -> str: return file_path.name + "\n" + content -def evaluate_file(root: str, file: str, file_extension_constraint: set[str] | None, ignore: set[str]) -> Path | None: +def add_to_indexing_if_relevant(root: str, file: str, file_extension_constraint: set[str] | None) -> Path | None: """Return file path if the file is to be considered.""" file_path = Path(root).joinpath(file) - if len(ignore.intersection(file_path.parts)) > 0: + if file_folder_ignored(str(file_path)): + # ignore files and folders mentioned in .coderignore return None - if file_extension_constraint and relevant_extension( - file_path, file_extension_constraint=file_extension_constraint + if not file_extension_constraint: + return file_path + if relevant_extension( + file_path, file_extension_constraint=file_extension_constraint, ): return file_path return None @@ -50,29 +53,26 @@ def evaluate_file(root: str, file: str, file_extension_constraint: set[str] | No def files_in_directory( directories_with_files_to_describe: list[str | Path], file_extension_constraint: set[str] | None, - ignore: set[str], ) -> list[Path]: """Fetch paths of files in directory.""" files_to_describe = [] for directory in directories_with_files_to_describe: directory_files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))] tmp = [ - evaluate_file( + add_to_indexing_if_relevant( root=str(directory), file=file, file_extension_constraint=file_extension_constraint, - ignore=ignore, ) for file in directory_files ] files_to_describe.extend(tmp) for root, _, files in os.walk(directory): tmp = [ - evaluate_file( + add_to_indexing_if_relevant( root=root, file=file, file_extension_constraint=file_extension_constraint, - ignore=ignore, ) for file in files ] @@ -80,8 +80,11 @@ def files_in_directory( return files_to_describe -def save_file_description(file_path: Path, work_dir: str, description: str, file_description_dir: str) -> None: +def save_file_description(file_path: Path, description: str, file_description_dir: str) -> None: """Save file description.""" + if not work_dir: + msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" + raise MissingEnvironmentVariableError(msg) file_name = file_path.relative_to(work_dir).as_posix().replace("/", "=") output_path = join_paths(file_description_dir, f"{file_name}.txt") with open(output_path, "w", encoding="utf-8") as out_file: @@ -89,7 +92,7 @@ def save_file_description(file_path: Path, work_dir: str, description: str, file def output_descriptions( - files_to_describe: list[Path], chain: RunnableSequence, file_description_dir: str, work_dir: str + files_to_describe: list[Path], chain: RunnableSequence, file_description_dir: str, ) -> None: """Generate & output file descriptions to designated directory in WORK_DIR.""" # iterate over all files, take 8 files at once @@ -101,7 +104,6 @@ def output_descriptions( [ save_file_description( file_path=file_path, - work_dir=work_dir, description=description, file_description_dir=file_description_dir, ) @@ -112,8 +114,6 @@ def output_descriptions( def produce_descriptions( directories_with_files_to_describe: list[str | Path], file_description_dir: str, - work_dir: str, - ignore: set[str], file_extension_constraint: set[str] | None = None, ) -> None: """ @@ -122,7 +122,6 @@ def produce_descriptions( Inputs: directories_with_files_to_describe: directories from which files are to be described. file_description_dir: directory where generated file descriptions are to be saved to. - work_dir: project directory worked on with Clean Coder. ignore: files and folders to ignore. file_extension_constraint: The list of file extension types accepted, if it's provided. @@ -139,15 +138,12 @@ def produce_descriptions( ignore = {".clean_coder", ".coderrules"} produce_descriptions(directories_with_files_to_describe=[work_dir], file_description_dir=file_description_dir, - work_dir=work_dir, file_extension_constraint=file_extension_constraint, - ignore=ignore, ) """ files_to_describe = files_in_directory( directories_with_files_to_describe=directories_with_files_to_describe, file_extension_constraint=file_extension_constraint, - ignore=ignore, ) prompt = ChatPromptTemplate.from_template( @@ -161,7 +157,7 @@ def produce_descriptions( chain = prompt | llm | StrOutputParser() Path(file_description_dir).mkdir(parents=True, exist_ok=True) output_descriptions( - files_to_describe=files_to_describe, work_dir=work_dir, chain=chain, file_description_dir=file_description_dir + files_to_describe=files_to_describe, chain=chain, file_description_dir=file_description_dir ) @@ -182,7 +178,6 @@ def upload_to_collection(collection: chromadb.PersistentClient, file_description def upload_descriptions_to_vdb( chroma_collection_name: str, - work_dir: str, file_description_dir: str, vdb_location: str = ".clean_coder/chroma_base", ) -> None: @@ -192,7 +187,6 @@ def upload_descriptions_to_vdb( Inputs: chroma_collection_name: name of the collection within Chroma vector database to save file descriptions in. file_description_dir: directory where generated file descriptions are available. - work_dir: project directory worked on with Clean Coder. vdb_location: (optional) location for storing the vector database. Example: @@ -208,13 +202,15 @@ def upload_descriptions_to_vdb( ignore = {".clean_coder", ".coderrules"} produce_descriptions(directories_with_files_to_describe=[work_dir], file_description_dir=file_description_dir, - work_dir=work_dir, file_extension_constraint=file_extension_constraint, - ignore=ignore, ) chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" - upload_descriptions_to_vdb(chroma_collection_name=chroma_collection_name, work_dir=work_dir, file_description_dir=file_description_dir) + upload_descriptions_to_vdb(chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir) """ + work_dir = os.getenv("WORK_DIR") + if not work_dir: + msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" + raise MissingEnvironmentVariableError(msg) chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location)) collection = chroma_client.get_or_create_collection( name=chroma_collection_name, @@ -258,15 +254,12 @@ def upload_descriptions_to_vdb( ".less", ".prompt", } - ignore = {".clean_coder", ".coderrules"} produce_descriptions( directories_with_files_to_describe=[work_dir], file_description_dir=file_description_dir, - work_dir=work_dir, file_extension_constraint=file_extension_constraint, - ignore=ignore, ) chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" upload_descriptions_to_vdb( - chroma_collection_name=chroma_collection_name, work_dir=work_dir, file_description_dir=file_description_dir + chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir, ) From 838c88f52f4ea8b04c0e88ae7bb1831546a42a2f Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Sat, 22 Feb 2025 12:34:03 +0000 Subject: [PATCH 04/32] remove examples from docstrings. Remove 'if not workdir' from if __name__ == main --- src/tools/rag/write_descriptions.py | 37 ----------------------------- 1 file changed, 37 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 22a5e3ad..b1ebc4b1 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -124,22 +124,6 @@ def produce_descriptions( file_description_dir: directory where generated file descriptions are to be saved to. ignore: files and folders to ignore. file_extension_constraint: The list of file extension types accepted, if it's provided. - - Example: - work_dir = os.getenv("WORK_DIR") # provide your own directory of choice if WORK_DIR is not set. - if not work_dir: - msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" - raise MissingEnvironmentVariableError(msg) - file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") - file_extension_constraint = { - ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go", ".swift", - ".kt", ".rs", ".htm",".html", ".css", ".scss", ".sass", ".less", ".prompt", - } - ignore = {".clean_coder", ".coderrules"} - produce_descriptions(directories_with_files_to_describe=[work_dir], - file_description_dir=file_description_dir, - file_extension_constraint=file_extension_constraint, - ) """ files_to_describe = files_in_directory( directories_with_files_to_describe=directories_with_files_to_describe, @@ -188,29 +172,8 @@ def upload_descriptions_to_vdb( chroma_collection_name: name of the collection within Chroma vector database to save file descriptions in. file_description_dir: directory where generated file descriptions are available. vdb_location: (optional) location for storing the vector database. - - Example: - work_dir = os.getenv("WORK_DIR") # provide your own directory of choice if WORK_DIR is not set. - if not work_dir: - msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" - raise MissingEnvironmentVariableError(msg) - file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") - file_extension_constraint = { - ".js", ".jsx", ".ts", ".tsx", ".vue", ".py", ".rb", ".php", ".java", ".c", ".cpp", ".cs", ".go", ".swift", - ".kt", ".rs", ".htm",".html", ".css", ".scss", ".sass", ".less", ".prompt", - } - ignore = {".clean_coder", ".coderrules"} - produce_descriptions(directories_with_files_to_describe=[work_dir], - file_description_dir=file_description_dir, - file_extension_constraint=file_extension_constraint, - ) - chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" - upload_descriptions_to_vdb(chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir) """ work_dir = os.getenv("WORK_DIR") - if not work_dir: - msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" - raise MissingEnvironmentVariableError(msg) chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location)) collection = chroma_client.get_or_create_collection( name=chroma_collection_name, From 41b73dcda559e4fa444abf64b55aa9303ac2ff7e Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Sat, 22 Feb 2025 12:36:54 +0000 Subject: [PATCH 05/32] fix for 'if not work_dir --- src/tools/rag/write_descriptions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index b1ebc4b1..aedf6c40 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -174,6 +174,9 @@ def upload_descriptions_to_vdb( vdb_location: (optional) location for storing the vector database. """ work_dir = os.getenv("WORK_DIR") + if not work_dir: + msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" + raise MissingEnvironmentVariableError(msg) chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location)) collection = chroma_client.get_or_create_collection( name=chroma_collection_name, @@ -188,9 +191,6 @@ def upload_descriptions_to_vdb( # load environment load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") - if not work_dir: - msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" - raise MissingEnvironmentVariableError(msg) file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") file_extension_constraint = { ".js", From d2f8a67d460ac5ff25e9e1c94e66a67f76bb65f6 Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Sat, 22 Feb 2025 12:48:43 +0000 Subject: [PATCH 06/32] rename file_extension_constraint to code_extensions --- src/tools/rag/write_descriptions.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index aedf6c40..a45d28ba 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -21,10 +21,10 @@ logging.basicConfig(level=logging.INFO) -def relevant_extension(file_path: Path, file_extension_constraint: set[str]) -> bool: +def relevant_extension(file_path: Path, code_extensions: set[str]) -> bool: """Checker for whether file extension indicates a script.""" # List of common code file extensions - return file_path.suffix.lower() in file_extension_constraint + return file_path.suffix.lower() in code_extensions # read file content. place name of file in the top @@ -35,16 +35,16 @@ def get_content(file_path: Path) -> str: return file_path.name + "\n" + content -def add_to_indexing_if_relevant(root: str, file: str, file_extension_constraint: set[str] | None) -> Path | None: +def add_to_indexing_if_relevant(root: str, file: str, code_extensions: set[str] | None) -> Path | None: """Return file path if the file is to be considered.""" file_path = Path(root).joinpath(file) if file_folder_ignored(str(file_path)): # ignore files and folders mentioned in .coderignore return None - if not file_extension_constraint: + if not code_extensions: return file_path if relevant_extension( - file_path, file_extension_constraint=file_extension_constraint, + file_path, code_extensions=code_extensions, ): return file_path return None @@ -52,7 +52,7 @@ def add_to_indexing_if_relevant(root: str, file: str, file_extension_constraint: def files_in_directory( directories_with_files_to_describe: list[str | Path], - file_extension_constraint: set[str] | None, + code_extensions: set[str] | None, ) -> list[Path]: """Fetch paths of files in directory.""" files_to_describe = [] @@ -62,7 +62,7 @@ def files_in_directory( add_to_indexing_if_relevant( root=str(directory), file=file, - file_extension_constraint=file_extension_constraint, + code_extensions=code_extensions, ) for file in directory_files ] @@ -72,7 +72,7 @@ def files_in_directory( add_to_indexing_if_relevant( root=root, file=file, - file_extension_constraint=file_extension_constraint, + code_extensions=code_extensions, ) for file in files ] @@ -114,7 +114,7 @@ def output_descriptions( def produce_descriptions( directories_with_files_to_describe: list[str | Path], file_description_dir: str, - file_extension_constraint: set[str] | None = None, + code_extensions: set[str] | None = None, ) -> None: """ Produce short descriptions of files. Store the descriptions in .clean_coder folder in WORK_DIR. @@ -123,11 +123,11 @@ def produce_descriptions( directories_with_files_to_describe: directories from which files are to be described. file_description_dir: directory where generated file descriptions are to be saved to. ignore: files and folders to ignore. - file_extension_constraint: The list of file extension types accepted, if it's provided. + code_extensions: The list of file extension types accepted, if it's provided. """ files_to_describe = files_in_directory( directories_with_files_to_describe=directories_with_files_to_describe, - file_extension_constraint=file_extension_constraint, + code_extensions=code_extensions, ) prompt = ChatPromptTemplate.from_template( @@ -192,7 +192,7 @@ def upload_descriptions_to_vdb( load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") - file_extension_constraint = { + code_extensions = { ".js", ".jsx", ".ts", @@ -220,7 +220,7 @@ def upload_descriptions_to_vdb( produce_descriptions( directories_with_files_to_describe=[work_dir], file_description_dir=file_description_dir, - file_extension_constraint=file_extension_constraint, + code_extensions=code_extensions, ) chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" upload_descriptions_to_vdb( From 62b86c40c052e64208ca16d556be4ac25321874c Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Sat, 22 Feb 2025 12:58:04 +0000 Subject: [PATCH 07/32] make default file extensions as the baseline option --- src/tools/rag/write_descriptions.py | 31 ++++------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index a45d28ba..5ec0a305 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -4,6 +4,7 @@ import os import sys from pathlib import Path +from typing import Literal import chromadb from dotenv import find_dotenv, load_dotenv @@ -114,7 +115,7 @@ def output_descriptions( def produce_descriptions( directories_with_files_to_describe: list[str | Path], file_description_dir: str, - code_extensions: set[str] | None = None, + code_extensions: set[str] | Literal["default"] | None = "default", ) -> None: """ Produce short descriptions of files. Store the descriptions in .clean_coder folder in WORK_DIR. @@ -125,6 +126,8 @@ def produce_descriptions( ignore: files and folders to ignore. code_extensions: The list of file extension types accepted, if it's provided. """ + if code_extensions == "default": + code_extensions = {".py", ".java", ".js", ".ts", ".html", ".css", ".scss", ".sql", ".json", ".xml"} files_to_describe = files_in_directory( directories_with_files_to_describe=directories_with_files_to_describe, code_extensions=code_extensions, @@ -192,35 +195,9 @@ def upload_descriptions_to_vdb( load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") - code_extensions = { - ".js", - ".jsx", - ".ts", - ".tsx", - ".vue", - ".py", - ".rb", - ".php", - ".java", - ".c", - ".cpp", - ".cs", - ".go", - ".swift", - ".kt", - ".rs", - ".htm", - ".html", - ".css", - ".scss", - ".sass", - ".less", - ".prompt", - } produce_descriptions( directories_with_files_to_describe=[work_dir], file_description_dir=file_description_dir, - code_extensions=code_extensions, ) chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" upload_descriptions_to_vdb( From 881f300822689f6ddcb50062fdcd2257e8e23de2 Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Sat, 22 Feb 2025 14:54:18 +0000 Subject: [PATCH 08/32] improve function names --- src/tools/rag/write_descriptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 5ec0a305..c3cd017b 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -51,7 +51,7 @@ def add_to_indexing_if_relevant(root: str, file: str, code_extensions: set[str] return None -def files_in_directory( +def find_files_to_describe( directories_with_files_to_describe: list[str | Path], code_extensions: set[str] | None, ) -> list[Path]: @@ -92,7 +92,7 @@ def save_file_description(file_path: Path, description: str, file_description_di out_file.write(description) -def output_descriptions( +def describe_files( files_to_describe: list[Path], chain: RunnableSequence, file_description_dir: str, ) -> None: """Generate & output file descriptions to designated directory in WORK_DIR.""" @@ -128,7 +128,7 @@ def produce_descriptions( """ if code_extensions == "default": code_extensions = {".py", ".java", ".js", ".ts", ".html", ".css", ".scss", ".sql", ".json", ".xml"} - files_to_describe = files_in_directory( + files_to_describe = find_files_to_describe( directories_with_files_to_describe=directories_with_files_to_describe, code_extensions=code_extensions, ) @@ -143,7 +143,7 @@ def produce_descriptions( llm = llms[0] chain = prompt | llm | StrOutputParser() Path(file_description_dir).mkdir(parents=True, exist_ok=True) - output_descriptions( + describe_files( files_to_describe=files_to_describe, chain=chain, file_description_dir=file_description_dir ) From d02ea9504291945aeb032e944cdbc9402027542a Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Sat, 22 Feb 2025 15:01:12 +0000 Subject: [PATCH 09/32] debug --- src/tools/rag/write_descriptions.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index c3cd017b..d855572f 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -21,6 +21,10 @@ ## Configure the logging level logging.basicConfig(level=logging.INFO) +# load environment +load_dotenv(find_dotenv()) # load environment variables from .env file +work_dir = os.getenv("WORK_DIR") + def relevant_extension(file_path: Path, code_extensions: set[str]) -> bool: """Checker for whether file extension indicates a script.""" @@ -192,8 +196,6 @@ def upload_descriptions_to_vdb( if __name__ == "__main__": # provide optionally which subfolders needs to be checked, if you don't want to describe all project folder # load environment - load_dotenv(find_dotenv()) - work_dir = os.getenv("WORK_DIR") file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") produce_descriptions( directories_with_files_to_describe=[work_dir], From 563055957753955e1128976a08d6f885ac3b925e Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Mon, 24 Feb 2025 10:01:26 +0000 Subject: [PATCH 10/32] remove use of work_dir checks and missing environment variable error. --- src/tools/rag/write_descriptions.py | 8 +------- src/utilities/exceptions.py | 4 ---- 2 files changed, 1 insertion(+), 11 deletions(-) delete mode 100644 src/utilities/exceptions.py diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index d855572f..5641e4a0 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -13,7 +13,6 @@ from langchain_core.runnables.base import RunnableSequence sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))) -from src.utilities.exceptions import MissingEnvironmentVariableError from src.utilities.llms import init_llms_mini from src.utilities.start_work_functions import file_folder_ignored from src.utilities.util_functions import join_paths @@ -87,9 +86,7 @@ def find_files_to_describe( def save_file_description(file_path: Path, description: str, file_description_dir: str) -> None: """Save file description.""" - if not work_dir: - msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" - raise MissingEnvironmentVariableError(msg) + work_dir = os.getenv("WORK_DIR") file_name = file_path.relative_to(work_dir).as_posix().replace("/", "=") output_path = join_paths(file_description_dir, f"{file_name}.txt") with open(output_path, "w", encoding="utf-8") as out_file: @@ -181,9 +178,6 @@ def upload_descriptions_to_vdb( vdb_location: (optional) location for storing the vector database. """ work_dir = os.getenv("WORK_DIR") - if not work_dir: - msg = "WORK_DIR variable not provided. Please add WORK_DIR to .env file" - raise MissingEnvironmentVariableError(msg) chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location)) collection = chroma_client.get_or_create_collection( name=chroma_collection_name, diff --git a/src/utilities/exceptions.py b/src/utilities/exceptions.py deleted file mode 100644 index e1c8f66d..00000000 --- a/src/utilities/exceptions.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Custom exception messages.""" - -class MissingEnvironmentVariableError(Exception): - """Enviromental variable missing.""" From d0d60939f2a9e9e319c8710290a059f9984cc8f0 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 25 Feb 2025 11:32:30 +0100 Subject: [PATCH 11/32] numpy in req --- requirements.txt | 3 +- src/utilities/syntax_checker_functions.py | 426 ++++------------------ 2 files changed, 82 insertions(+), 347 deletions(-) diff --git a/requirements.txt b/requirements.txt index e0e73e24..7dff3c4d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,4 +30,5 @@ pyright==1.1.390 ruff==0.8.2 httpx==0.27.2 questionary==2.1.0 -pathspec==0.12.1 \ No newline at end of file +pathspec==0.12.1 +numpy==1.26.4 \ No newline at end of file diff --git a/src/utilities/syntax_checker_functions.py b/src/utilities/syntax_checker_functions.py index d6b778fc..8ca50316 100644 --- a/src/utilities/syntax_checker_functions.py +++ b/src/utilities/syntax_checker_functions.py @@ -216,355 +216,89 @@ def parse_yaml(yaml_string): if __name__ == "__main__": code = """ -"use client"; - -import { useState, useEffect, useRef } from "react"; -import Image from "next/image"; -import { useRouter } from "next/navigation"; -import ProfileCard from "./components/ProfileCard"; -import PopupNotification from "./components/PopupNotification"; - -interface ProfileItem { - uuid: string; - full_name: string; - short_bio?: string; - bio?: string; -} - -export default function Home() { - const [activeTab, setActiveTab] = useState<'Explore' | 'Received' | 'Sent' | 'Matches'>('Explore'); - const [exploreItems, setExploreItems] = useState([]); - const [receivedItems, setReceivedItems] = useState([]); - const [sentItems, setSentItems] = useState([]); - const [matchedItems, setMatchedItems] = useState([]); - const [error, setError] = useState(''); - const [notification, setNotification] = useState<{ message: string, type: 'positive' | 'negative' } | null>(null); - const [loading, setLoading] = useState(false); - const [iconLoading, setIconLoading] = useState(true); - const [skip, setSkip] = useState(0); - const [limit] = useState(10); - const [totalExploreItems, setTotalExploreItems] = useState(0); - const sentinelRef = useRef(null); - const router = useRouter(); - - function goToProfile(uuid: string) { - const userRole = localStorage.getItem('role'); - if (userRole === "intern") { - router.push(`/campaign/${uuid}`); - } else { - router.push(`/intern/${uuid}`); - } - } - - async function handleConnect(uuid: string) { - setLoading(true); - try { - const token = localStorage.getItem('token'); - if (!token) throw new Error('Authentication token not found'); - const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/invitations/create/${uuid}`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${token}`, - }, - }); - if (!response.ok) throw new Error('Failed to create invitation'); - setNotification({ message: 'Invitation sent successfully', type: 'positive' }); - - // Optimistically update the explore list - setExploreItems((prevItems) => prevItems.filter(item => item.uuid !== uuid)); - } catch (err: any) { - setNotification({ message: err.message, type: 'negative' }); - } finally { - setLoading(false); - setTimeout(() => setNotification(null), 3000); - } - } - - async function handleAccept(invitationId: string) { - setLoading(true); - try { - const token = localStorage.getItem('token'); - if (!token) throw new Error('Authentication token not found'); - const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/invitations/accept/${invitationId}`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${token}`, - }, - }); - if (!response.ok) throw new Error('Failed to accept invitation'); - setNotification({ message: 'Invitation accepted successfully', type: 'positive' }); - setReceivedItems((prevItems) => prevItems.filter(item => item.invitation_id !== invitationId)); - } catch (err: any) { - setNotification({ message: err.message, type: 'negative' }); - } finally { - setLoading(false); - setTimeout(() => setNotification(null), 3000); - } - } - - async function handleReject(invitationId: string) { - setLoading(true); - try { - const token = localStorage.getItem('token'); - if (!token) throw new Error('Authentication token not found'); - const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/invitations/reject/${invitationId}`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${token}`, - }, - }); - if (!response.ok) throw new Error('Failed to reject invitation'); - setNotification({ message: 'Invitation rejected successfully', type: 'positive' }); - setReceivedItems((prevItems) => prevItems.filter(item => item.invitation_id !== invitationId)); - } catch (err: any) { - setNotification({ message: err.message, type: 'negative' }); - } finally { - setLoading(false); - setTimeout(() => setNotification(null), 3000); - } - } - - async function handleCancel(invitationId: string) { - setLoading(true); - try { - const token = localStorage.getItem('token'); - if (!token) throw new Error('Authentication token not found'); - const response = await fetch(`${process.env.NEXT_PUBLIC_API_URL}/invitations/cancel/${invitationId}`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${token}`, - }, - }); - if (!response.ok) throw new Error('Failed to cancel invitation'); - setNotification({ message: 'Invitation canceled successfully', type: 'positive' }); - setSentItems((prevItems) => prevItems.filter(item => item.invitation_id !== invitationId)); - } catch (err: any) { - setNotification({ message: err.message, type: 'negative' }); - } finally { - setLoading(false); - setTimeout(() => setNotification(null), 3000); - } - } - async function fetchExplore() { - try { - const userRole = localStorage.getItem('role'); - const token = localStorage.getItem('token'); - - if (!token) { - throw new Error('Authentication token not found'); - } - - if (!userRole) { - throw new Error('User role not found'); - } - - const url = `${process.env.NEXT_PUBLIC_API_URL}${ - userRole === "intern" - ? '/fetch-campaigns-for-main-page' - : '/fetch-interns-for-main-page' - }?skip=${skip}&limit=${limit}`; - - console.log('Fetching from URL:', url); - - const response = await fetch(url, { - method: 'GET', - headers: { - 'Content-Type': 'application/json', - 'Authorization': `Bearer ${token}`, - }, - }); - - if (!response.ok) { - const errorData = await response.json(); - throw new Error(errorData.detail || 'Failed to fetch explore items'); + + + + """ - print(parse_tsx(code)) \ No newline at end of file + print(parse_vue_basic(code)) \ No newline at end of file From 5660f79ad96b8f3cffd59dae0013f4a84bfa5083 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 25 Feb 2025 11:53:47 +0100 Subject: [PATCH 12/32] code splitter added --- src/tools/rag/code_splitter.py | 54 ++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/src/tools/rag/code_splitter.py b/src/tools/rag/code_splitter.py index bfa71650..09bdcfc1 100644 --- a/src/tools/rag/code_splitter.py +++ b/src/tools/rag/code_splitter.py @@ -4,9 +4,7 @@ ) -python_splitter = RecursiveCharacterTextSplitter.from_language( - language=Language.PYTHON, chunk_size=1000, chunk_overlap=0 -) + code = """ @@ -310,9 +308,55 @@ def load_system_message(): project_rules=read_coderrules() )) """ +extension_to_language = { + 'cpp': 'cpp', + 'go': 'go', + 'java': 'java', + 'kt': 'kotlin', + 'js': 'js', + 'jsx': 'js', + 'vue': 'js', + 'ts': 'ts', + 'tsx': 'ts', + 'mjs': 'js', + 'cjs': 'js', + 'php': 'php', + 'proto': 'proto', + 'py': 'python', + 'rst': 'rst', + 'rb': 'ruby', + 'rs': 'rust', + 'scala': 'scala', + 'swift': 'swift', + 'md': 'markdown', + 'tex': 'latex', + 'html': 'html', + 'sol': 'sol', + 'cs': 'csharp', + 'cob': 'cobol', + 'c': 'c', + 'lua': 'lua', + 'pl': 'perl', + 'hs': 'haskell', + 'ex': 'elixir', + 'ps1': 'powershell', + 'json': 'json', + 'xml': 'xml', + 'bash': 'powershell', + 'zsh': 'powershell', + 'sh': 'powershell', + 'dockerfile': 'proto', +} +def split_code(code, extension, chunk_size=1000): + language = extension_to_language.get(extension) + if not language: + return + splitter = RecursiveCharacterTextSplitter.from_language( + language=Language(language), chunk_size=chunk_size, chunk_overlap=0 + ) + return splitter.split_text(code) -splitted = python_splitter.split_text(code) -print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON)) +splitted = split_code(code, "py") for doc in splitted: print(doc) print("###") From b65614a7200b12e015486b0651ecfce93993ca95 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 25 Feb 2025 11:58:50 +0100 Subject: [PATCH 13/32] code splitter added --- src/tools/rag/code_splitter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/tools/rag/code_splitter.py b/src/tools/rag/code_splitter.py index 09bdcfc1..cd8ade29 100644 --- a/src/tools/rag/code_splitter.py +++ b/src/tools/rag/code_splitter.py @@ -3,10 +3,6 @@ RecursiveCharacterTextSplitter, ) - - - - code = """ from langchain_openai.chat_models import ChatOpenAI from langchain_community.chat_models import ChatOllama @@ -347,7 +343,10 @@ def load_system_message(): 'sh': 'powershell', 'dockerfile': 'proto', } -def split_code(code, extension, chunk_size=1000): + + +def split_code(code: str, extension: str, chunk_size: int = 1000): + """Splits code for smaller elements as functions. That allows to describe functions for semantic retrieval tool.""" language = extension_to_language.get(extension) if not language: return @@ -356,6 +355,7 @@ def split_code(code, extension, chunk_size=1000): ) return splitter.split_text(code) + splitted = split_code(code, "py") for doc in splitted: print(doc) From 13f5ed1c08a5497fd8139cf8ebd9cace1ab405f1 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Wed, 26 Feb 2025 11:43:59 +0100 Subject: [PATCH 14/32] PR tempalte --- .github/PULL_REQUEST_TEMPLATE/template.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE/template.md diff --git a/.github/PULL_REQUEST_TEMPLATE/template.md b/.github/PULL_REQUEST_TEMPLATE/template.md new file mode 100644 index 00000000..cfb25108 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/template.md @@ -0,0 +1,11 @@ +## Pull Request Template + +### Description +Please provide a detailed description of the changes made in this pull request. + +### Related Issue +If this pull request addresses an existing issue, please reference it here (e.g., "Fixes #123"). + +### Checklist +- [ ] I have tested these changes locally. +- [ ] I'm making contribution to the `dev` branch. Direct contributions to `master` are not allowed. Don't worry, they will be merged to `master` on the nearest release. From c17adcba2030f37e432b0a929b447ee6b6b48358 Mon Sep 17 00:00:00 2001 From: Radek Kowalski Date: Wed, 26 Feb 2025 11:25:54 +0000 Subject: [PATCH 15/32] restore previous prompt --- src/tools/rag/write_descriptions.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 5641e4a0..c67c51bc 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -15,7 +15,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))) from src.utilities.llms import init_llms_mini from src.utilities.start_work_functions import file_folder_ignored -from src.utilities.util_functions import join_paths +from src.utilities.util_functions import join_paths, read_coderrules ## Configure the logging level logging.basicConfig(level=logging.INFO) @@ -133,11 +133,19 @@ def produce_descriptions( directories_with_files_to_describe=directories_with_files_to_describe, code_extensions=code_extensions, ) - + coderrules = read_coderrules() prompt = ChatPromptTemplate.from_template( - """Describe the following code in 4 sentences or less, focusing only on important information from integration point of view. - Write what file is responsible for.\n\n'''\n{code}''' - """, +f"""First, get known with info about project (may be useful, may be not): +''' +{coderrules} +''' +Describe the code in 4 sentences or less, focusing only on important information from integration point of view. +Write what file is responsible for. +Go traight to the thing in description, without starting sentence. +''' +{{code}} +''' +""", ) llms = init_llms_mini(tools=[], run_name="File Describer") From b8352ac677af8004f2c030cc201e46d24250fab2 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Thu, 27 Feb 2025 14:51:43 +0100 Subject: [PATCH 16/32] write descriptions - old code restored --- .github/PULL_REQUEST_TEMPLATE/template.md | 3 + src/tools/rag/code_splitter.py | 9 +- src/tools/rag/write_descriptions.py | 297 ++++++++++------------ 3 files changed, 141 insertions(+), 168 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/template.md b/.github/PULL_REQUEST_TEMPLATE/template.md index cfb25108..fb2040e4 100644 --- a/.github/PULL_REQUEST_TEMPLATE/template.md +++ b/.github/PULL_REQUEST_TEMPLATE/template.md @@ -3,6 +3,9 @@ ### Description Please provide a detailed description of the changes made in this pull request. +### How to use +If you created new functionality, please describe how it could be enabled and used. + ### Related Issue If this pull request addresses an existing issue, please reference it here (e.g., "Fixes #123"). diff --git a/src/tools/rag/code_splitter.py b/src/tools/rag/code_splitter.py index cd8ade29..c7247c01 100644 --- a/src/tools/rag/code_splitter.py +++ b/src/tools/rag/code_splitter.py @@ -356,7 +356,8 @@ def split_code(code: str, extension: str, chunk_size: int = 1000): return splitter.split_text(code) -splitted = split_code(code, "py") -for doc in splitted: - print(doc) - print("###") +if __name__ == "__main__": + splitted = split_code(code, "py") + for doc in splitted: + print(doc) + print("###") diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index c67c51bc..11b1a631 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -1,209 +1,178 @@ -"""Functions to create an index of files for RAG.""" - -import logging import os -import sys from pathlib import Path -from typing import Literal - -import chromadb -from dotenv import find_dotenv, load_dotenv from langchain.prompts import ChatPromptTemplate from langchain_core.output_parsers import StrOutputParser -from langchain_core.runnables.base import RunnableSequence - -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))) -from src.utilities.llms import init_llms_mini -from src.utilities.start_work_functions import file_folder_ignored +from dotenv import load_dotenv, find_dotenv +import chromadb +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) from src.utilities.util_functions import join_paths, read_coderrules +from src.utilities.start_work_functions import CoderIgnore, file_folder_ignored +from src.utilities.llms import init_llms_mini +from src.tools.rag.code_splitter import split_code -## Configure the logging level -logging.basicConfig(level=logging.INFO) -# load environment -load_dotenv(find_dotenv()) # load environment variables from .env file +load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") -def relevant_extension(file_path: Path, code_extensions: set[str]) -> bool: - """Checker for whether file extension indicates a script.""" +def is_code_file(file_path): # List of common code file extensions + code_extensions = { + '.js', '.jsx', '.ts', '.tsx', '.vue', '.py', '.rb', '.php', '.java', '.c', '.cpp', '.cs', '.go', '.swift', + '.kt', '.rs', '.htm','.html', '.css', '.scss', '.sass', '.less', '.prompt', + } return file_path.suffix.lower() in code_extensions # read file content. place name of file in the top -def get_content(file_path: Path) -> str: - """Collect file name and content to return them together as string.""" - with open(file_path, encoding="utf-8") as file: +def get_content(file_path): + with open(file_path, 'r', encoding='utf-8') as file: content = file.read() - return file_path.name + "\n" + content - - -def add_to_indexing_if_relevant(root: str, file: str, code_extensions: set[str] | None) -> Path | None: - """Return file path if the file is to be considered.""" - file_path = Path(root).joinpath(file) - if file_folder_ignored(str(file_path)): - # ignore files and folders mentioned in .coderignore - return None - if not code_extensions: - return file_path - if relevant_extension( - file_path, code_extensions=code_extensions, - ): - return file_path - return None - - -def find_files_to_describe( - directories_with_files_to_describe: list[str | Path], - code_extensions: set[str] | None, -) -> list[Path]: - """Fetch paths of files in directory.""" - files_to_describe = [] - for directory in directories_with_files_to_describe: - directory_files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))] - tmp = [ - add_to_indexing_if_relevant( - root=str(directory), - file=file, - code_extensions=code_extensions, - ) - for file in directory_files - ] - files_to_describe.extend(tmp) - for root, _, files in os.walk(directory): - tmp = [ - add_to_indexing_if_relevant( - root=root, - file=file, - code_extensions=code_extensions, - ) - for file in files - ] - files_to_describe.extend(tmp) - return files_to_describe - - -def save_file_description(file_path: Path, description: str, file_description_dir: str) -> None: - """Save file description.""" - work_dir = os.getenv("WORK_DIR") - file_name = file_path.relative_to(work_dir).as_posix().replace("/", "=") - output_path = join_paths(file_description_dir, f"{file_name}.txt") - with open(output_path, "w", encoding="utf-8") as out_file: - out_file.write(description) - - -def describe_files( - files_to_describe: list[Path], chain: RunnableSequence, file_description_dir: str, -) -> None: - """Generate & output file descriptions to designated directory in WORK_DIR.""" - # iterate over all files, take 8 files at once - batch_size = 8 - for i in range(0, len(files_to_describe), batch_size): - files_iteration = [f for f in files_to_describe[i : i + batch_size] if f is not None] - descriptions = chain.batch([get_content(file_path) for file_path in files_iteration]) - logging.debug(descriptions) - [ - save_file_description( - file_path=file_path, - description=description, - file_description_dir=file_description_dir, - ) - for file_path, description in zip(files_iteration, descriptions, strict=True) - ] - + content = file_path.name + '\n' + content + return content -def produce_descriptions( - directories_with_files_to_describe: list[str | Path], - file_description_dir: str, - code_extensions: set[str] | Literal["default"] | None = "default", -) -> None: +def collect_file_pathes(subfolders, work_dir): """ - Produce short descriptions of files. Store the descriptions in .clean_coder folder in WORK_DIR. - - Inputs: - directories_with_files_to_describe: directories from which files are to be described. - file_description_dir: directory where generated file descriptions are to be saved to. - ignore: files and folders to ignore. - code_extensions: The list of file extension types accepted, if it's provided. + Collect and return a list of allowed code files from the given subfolders + under the work_dir according to is_code_file criteria and .coderignore patterns. """ - if code_extensions == "default": - code_extensions = {".py", ".java", ".js", ".ts", ".html", ".css", ".scss", ".sql", ".json", ".xml"} - files_to_describe = find_files_to_describe( - directories_with_files_to_describe=directories_with_files_to_describe, - code_extensions=code_extensions, - ) + allowed_files = [] + for folder in subfolders: + for root, _, files in os.walk(work_dir + folder): + for file in files: + file_path = Path(root) / file + if not is_code_file(file_path): + continue + relative_path_str = file_path.relative_to(work_dir).as_posix() + if file_folder_ignored(relative_path_str): + continue + allowed_files.append(file_path) + return allowed_files + + +def write_file_descriptions(subfolders_with_files=['/']): + all_files = collect_file_pathes(subfolders_with_files, work_dir) + coderrules = read_coderrules() + prompt = ChatPromptTemplate.from_template( f"""First, get known with info about project (may be useful, may be not): + ''' {coderrules} ''' + Describe the code in 4 sentences or less, focusing only on important information from integration point of view. Write what file is responsible for. -Go traight to the thing in description, without starting sentence. + +Go straight to the thing in description, without starting sentence. + ''' {{code}} ''' -""", +""" ) - - llms = init_llms_mini(tools=[], run_name="File Describer") + llms = init_llms_mini(tools=[], run_name='File Describer') llm = llms[0] chain = prompt | llm | StrOutputParser() - Path(file_description_dir).mkdir(parents=True, exist_ok=True) - describe_files( - files_to_describe=files_to_describe, chain=chain, file_description_dir=file_description_dir + + description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') + Path(description_folder).mkdir(parents=True, exist_ok=True) + # iterate over all files, take 8 files at once and descrive files in batch + batch_size = 8 + for i in range(0, len(all_files), batch_size): + files_iteration = all_files[i:i + batch_size] + descriptions = chain.batch([get_content(file_path) for file_path in files_iteration]) + print(descriptions) + + for file_path, description in zip(files_iteration, descriptions): + file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=') + output_path = join_paths(description_folder, f"{file_name}.txt") + + with open(output_path, 'w', encoding='utf-8') as out_file: + out_file.write(description) + + + +def write_file_chunks_descriptions(subfolders_with_files=['/']): + all_files = collect_file_pathes(subfolders_with_files, work_dir) + + coderrules = read_coderrules() + + prompt = ChatPromptTemplate.from_template( +f"""First, get known with info about project (may be useful, may be not): + +''' +{coderrules} +''' + +For the reference, you have code of whole file here: + +''' +{{file_code}} +''' + +Describe provided function/file_chunk in 4 sentences or less, focusing only on important information from integration point of view. +Write what function/file chunk is responsible for. + +Go straight to the thing in description, without starting sentence. + +Here is file chunk to describe: +''' +{{chunk_code}} +''' +""" ) + llms = init_llms_mini(tools=[], run_name='File Describer') + llm = llms[0] + chain = prompt | llm | StrOutputParser() + description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') + Path(description_folder).mkdir(parents=True, exist_ok=True) + # iterate chunks inside of the file + for file_path in all_files: + file_content = get_content(file_path) + # get file extenstion + extension = file_path.split('.')[-1] + file_chunks = split_code(file_content, extension) + descriptions = chain.batch(file_chunks) + print(descriptions) -def upload_to_collection(collection: chromadb.PersistentClient, file_description_dir: str) -> None: - """Insert file information to chroma database.""" - for root, _, files in os.walk(file_description_dir): - for file in files: - file_path = Path(root) / file - with open(file_path, encoding="utf-8") as f: - content = f.read() - collection.upsert( - documents=[ - content, - ], - ids=[file_path.name.replace("=", "/").removesuffix(".txt")], - ) + for file_path, description in zip(files_iteration, descriptions): + file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=') + output_path = join_paths(description_folder, f"{file_name}.txt") + with open(output_path, 'w', encoding='utf-8') as out_file: + out_file.write(description) -def upload_descriptions_to_vdb( - chroma_collection_name: str, - file_description_dir: str, - vdb_location: str = ".clean_coder/chroma_base", -) -> None: - """ - Upload file descriptions to chroma database. - Inputs: - chroma_collection_name: name of the collection within Chroma vector database to save file descriptions in. - file_description_dir: directory where generated file descriptions are available. - vdb_location: (optional) location for storing the vector database. - """ - work_dir = os.getenv("WORK_DIR") - chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, vdb_location)) + +def upload_descriptions_to_vdb(): + chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base')) + collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" + collection = chroma_client.get_or_create_collection( - name=chroma_collection_name, + name=collection_name ) # read files and upload to base - upload_to_collection(collection=collection, file_description_dir=file_description_dir) + description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') + for root, _, files in os.walk(description_folder): + for file in files: + file_path = Path(root) / file + with open(file_path, 'r', encoding='utf-8') as file: + content = file.read() + collection.upsert( + documents=[ + content + ], + ids=[file_path.name.replace('=', '/').removesuffix(".txt")], + ) -if __name__ == "__main__": - # provide optionally which subfolders needs to be checked, if you don't want to describe all project folder - # load environment - file_description_dir = join_paths(work_dir, ".clean_coder/workdir_file_descriptions") - produce_descriptions( - directories_with_files_to_describe=[work_dir], - file_description_dir=file_description_dir, - ) - chroma_collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" - upload_descriptions_to_vdb( - chroma_collection_name=chroma_collection_name, file_description_dir=file_description_dir, - ) +if __name__ == '__main__': + #provide optionally which subfolders needs to be checked, if you don't want to describe all project folder + write_file_descriptions(subfolders_with_files=['/']) + + upload_descriptions_to_vdb() \ No newline at end of file From 1e0c448f5786477727555bd3d8414a1ecef85d1e Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Fri, 28 Feb 2025 09:16:20 +0100 Subject: [PATCH 17/32] write chunk descriptions started to work --- src/prompts/describe_file_chunks.prompt | 21 +++++++++++++ src/tools/rag/write_descriptions.py | 40 +++++++------------------ 2 files changed, 31 insertions(+), 30 deletions(-) create mode 100644 src/prompts/describe_file_chunks.prompt diff --git a/src/prompts/describe_file_chunks.prompt b/src/prompts/describe_file_chunks.prompt new file mode 100644 index 00000000..a10f7368 --- /dev/null +++ b/src/prompts/describe_file_chunks.prompt @@ -0,0 +1,21 @@ +First, get known with info about project (may be useful, may be not): + +''' +{coderrules} +''' + +For the reference, you have code of whole file here: + +''' +{file_code} +''' + +Describe provided function/file_chunk in 4 sentences or less, focusing only on important information from integration point of view. +Write what function/file chunk is responsible for. + +Go straight to the thing in description, without starting sentence. + +Here is file chunk to describe: +''' +{chunk_code} +''' \ No newline at end of file diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 11b1a631..d4cb3351 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -29,7 +29,7 @@ def is_code_file(file_path): def get_content(file_path): with open(file_path, 'r', encoding='utf-8') as file: content = file.read() - content = file_path.name + '\n' + content + content = file_path.name + '\n\n' + content return content def collect_file_pathes(subfolders, work_dir): @@ -97,33 +97,13 @@ def write_file_descriptions(subfolders_with_files=['/']): def write_file_chunks_descriptions(subfolders_with_files=['/']): all_files = collect_file_pathes(subfolders_with_files, work_dir) - coderrules = read_coderrules() - prompt = ChatPromptTemplate.from_template( -f"""First, get known with info about project (may be useful, may be not): - -''' -{coderrules} -''' - -For the reference, you have code of whole file here: - -''' -{{file_code}} -''' + grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + with open(f"{grandparent_dir}/prompts/describe_file_chunks.prompt", "r") as f: + chunks_describe_template = f.read() -Describe provided function/file_chunk in 4 sentences or less, focusing only on important information from integration point of view. -Write what function/file chunk is responsible for. - -Go straight to the thing in description, without starting sentence. - -Here is file chunk to describe: -''' -{{chunk_code}} -''' -""" - ) + prompt = ChatPromptTemplate.from_template(chunks_describe_template) llms = init_llms_mini(tools=[], run_name='File Describer') llm = llms[0] chain = prompt | llm | StrOutputParser() @@ -134,20 +114,19 @@ def write_file_chunks_descriptions(subfolders_with_files=['/']): for file_path in all_files: file_content = get_content(file_path) # get file extenstion - extension = file_path.split('.')[-1] + extension = file_path.suffix.lstrip('.') file_chunks = split_code(file_content, extension) - descriptions = chain.batch(file_chunks) + descriptions = chain.batch([{'coderrules': coderrules, 'file_code': file_content, 'chunk_code': chunk} for chunk in file_chunks]) print(descriptions) - for file_path, description in zip(files_iteration, descriptions): - file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=') + for nr, description in enumerate(descriptions): + file_name = f"{file_path.relative_to(work_dir).as_posix().replace('/', '=')}_chunk{nr}" output_path = join_paths(description_folder, f"{file_name}.txt") with open(output_path, 'w', encoding='utf-8') as out_file: out_file.write(description) - def upload_descriptions_to_vdb(): chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base')) collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" @@ -174,5 +153,6 @@ def upload_descriptions_to_vdb(): if __name__ == '__main__': #provide optionally which subfolders needs to be checked, if you don't want to describe all project folder write_file_descriptions(subfolders_with_files=['/']) + write_file_chunks_descriptions() upload_descriptions_to_vdb() \ No newline at end of file From 17dbdb3e11d55fa7205a99824cdd6b96c96a581e Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Fri, 28 Feb 2025 09:24:38 +0100 Subject: [PATCH 18/32] write chunk descriptions started to work --- .github/PULL_REQUEST_TEMPLATE/template.md | 1 + src/prompts/describe_file_chunks.prompt | 2 -- src/tools/rag/write_descriptions.py | 3 +++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE/template.md b/.github/PULL_REQUEST_TEMPLATE/template.md index fb2040e4..0fb59ae7 100644 --- a/.github/PULL_REQUEST_TEMPLATE/template.md +++ b/.github/PULL_REQUEST_TEMPLATE/template.md @@ -11,4 +11,5 @@ If this pull request addresses an existing issue, please reference it here (e.g. ### Checklist - [ ] I have tested these changes locally. +- [ ] I used docstrings on the begin of every function I created to describe it. Both humans and AI will have no problem to understand my code. - [ ] I'm making contribution to the `dev` branch. Direct contributions to `master` are not allowed. Don't worry, they will be merged to `master` on the nearest release. diff --git a/src/prompts/describe_file_chunks.prompt b/src/prompts/describe_file_chunks.prompt index a10f7368..0ebefc5f 100644 --- a/src/prompts/describe_file_chunks.prompt +++ b/src/prompts/describe_file_chunks.prompt @@ -1,11 +1,9 @@ First, get known with info about project (may be useful, may be not): - ''' {coderrules} ''' For the reference, you have code of whole file here: - ''' {file_code} ''' diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index d4cb3351..0f4a36d8 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -96,6 +96,8 @@ def write_file_descriptions(subfolders_with_files=['/']): def write_file_chunks_descriptions(subfolders_with_files=['/']): + """Writes descriptions of whole file chunks in codebase. Gets list of whole files to describe, divides files + into chunks and describes each chunk separately.""" all_files = collect_file_pathes(subfolders_with_files, work_dir) coderrules = read_coderrules() @@ -128,6 +130,7 @@ def write_file_chunks_descriptions(subfolders_with_files=['/']): def upload_descriptions_to_vdb(): + """Uploads descriptions, created by write_file_chunks_descriptions, into vector database.""" chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base')) collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" From 05d5b86a36b5cc0e3cdca7132cb7328c209bc89e Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Fri, 28 Feb 2025 10:51:20 +0100 Subject: [PATCH 19/32] repairing chunk describing --- src/tools/rag/code_splitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tools/rag/code_splitter.py b/src/tools/rag/code_splitter.py index c7247c01..a3dd6d90 100644 --- a/src/tools/rag/code_splitter.py +++ b/src/tools/rag/code_splitter.py @@ -349,7 +349,8 @@ def split_code(code: str, extension: str, chunk_size: int = 1000): """Splits code for smaller elements as functions. That allows to describe functions for semantic retrieval tool.""" language = extension_to_language.get(extension) if not language: - return + print(f'problem with extension {extension}') + return [] splitter = RecursiveCharacterTextSplitter.from_language( language=Language(language), chunk_size=chunk_size, chunk_overlap=0 ) From d522b677076e61baf198d0feca89b9d30d5751f4 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Mon, 3 Mar 2025 14:45:42 +0100 Subject: [PATCH 20/32] researcher improved --- manager.py | 4 ++-- src/agents/debugger_agent.py | 4 ++-- src/agents/executor_agent.py | 4 ++-- src/agents/frontend_feedback.py | 4 ++-- src/agents/planner_agent.py | 4 ++-- src/agents/researcher_agent.py | 6 +++--- src/prompts/researcher_system.prompt | 31 ++++++++++++++++------------ src/tools/rag/code_splitter.py | 1 - src/tools/rag/retrieval.py | 12 +++++++++-- src/tools/rag/write_descriptions.py | 3 +++ src/utilities/llms.py | 2 +- src/utilities/manager_utils.py | 4 ++-- 12 files changed, 47 insertions(+), 32 deletions(-) diff --git a/manager.py b/manager.py index afdbe311..9c60d830 100644 --- a/manager.py +++ b/manager.py @@ -20,7 +20,7 @@ from src.utilities.langgraph_common_functions import call_model, call_tool, multiple_tools_msg, no_tools_msg, empty_message_msg from src.utilities.start_project_functions import set_up_dot_clean_coder_dir from src.utilities.util_functions import join_paths -from src.utilities.llms import init_llms +from src.utilities.llms import init_llms_medium_intelligence from src.utilities.print_formatters import print_formatted import json import os @@ -36,7 +36,7 @@ def __init__(self): self.work_dir = os.getenv("WORK_DIR") set_up_dot_clean_coder_dir(self.work_dir) self.tools = self.prepare_tools() - self.llms = init_llms(tools=self.tools, run_name="Manager") + self.llms = init_llms_medium_intelligence(tools=self.tools, run_name="Manager") self.manager = self.setup_workflow() self.saved_messages_path = join_paths(self.work_dir, ".clean_coder/manager_messages.json") diff --git a/src/agents/debugger_agent.py b/src/agents/debugger_agent.py index 5adf5741..9dff2782 100644 --- a/src/agents/debugger_agent.py +++ b/src/agents/debugger_agent.py @@ -17,7 +17,7 @@ read_coderrules, convert_images, ) -from src.utilities.llms import init_llms +from src.utilities.llms import init_llms_medium_intelligence from src.utilities.langgraph_common_functions import ( call_model, call_tool, ask_human, after_ask_human_condition, multiple_tools_msg, no_tools_msg, agent_looped_human_help, ) @@ -50,7 +50,7 @@ class Debugger(): def __init__(self, files, work_dir, human_feedback, image_paths, playwright_code=None): self.work_dir = work_dir self.tools = prepare_tools(work_dir) - self.llms = init_llms(self.tools, "Debugger") + self.llms = init_llms_medium_intelligence(self.tools, "Debugger") self.system_message = SystemMessage( content=system_prompt_template.format(project_rules=read_coderrules()) ) diff --git a/src/agents/executor_agent.py b/src/agents/executor_agent.py index 5b40c6e6..d873ba45 100644 --- a/src/agents/executor_agent.py +++ b/src/agents/executor_agent.py @@ -7,7 +7,7 @@ from langgraph.graph import StateGraph, END from dotenv import load_dotenv, find_dotenv from langchain.tools import tool -from src.utilities.llms import init_llms +from src.utilities.llms import init_llms_medium_intelligence from src.utilities.print_formatters import print_formatted, print_error from src.utilities.util_functions import ( check_file_contents, exchange_file_contents, bad_tool_call_looped @@ -43,7 +43,7 @@ class Executor(): def __init__(self, files, work_dir): self.work_dir = work_dir self.tools = prepare_tools(work_dir) - self.llms = init_llms(self.tools, "Executor") + self.llms = init_llms_medium_intelligence(self.tools, "Executor") self.system_message = SystemMessage( content=system_prompt_template ) diff --git a/src/agents/frontend_feedback.py b/src/agents/frontend_feedback.py index 430397ba..f4da34cf 100644 --- a/src/agents/frontend_feedback.py +++ b/src/agents/frontend_feedback.py @@ -1,6 +1,6 @@ import os from langchain_core.messages import HumanMessage -from src.utilities.llms import init_llms +from src.utilities.llms import init_llms_medium_intelligence from src.utilities.start_work_functions import read_frontend_feedback_story import base64 import textwrap @@ -9,7 +9,7 @@ from pydantic import BaseModel, Field -llms = init_llms(run_name="Frontend Feedback") +llms = init_llms_medium_intelligence(run_name="Frontend Feedback") llm = llms[0].with_fallbacks(llms[1:]) diff --git a/src/agents/planner_agent.py b/src/agents/planner_agent.py index 15e916f4..007e23bb 100644 --- a/src/agents/planner_agent.py +++ b/src/agents/planner_agent.py @@ -7,7 +7,7 @@ from src.utilities.langgraph_common_functions import after_ask_human_condition from src.utilities.user_input import user_input from src.utilities.graphics import LoadingAnimation -from src.utilities.llms import init_llms_high_intelligence, init_llms_mini, init_llms +from src.utilities.llms import init_llms_high_intelligence, init_llms_mini, init_llms_medium_intelligence import os @@ -15,7 +15,7 @@ llms_planners = init_llms_high_intelligence(run_name="Planner") llm_strong = llms_planners[0].with_fallbacks(llms_planners[1:]) -llms_middle_strength = init_llms(run_name="Plan finalizer") +llms_middle_strength = init_llms_medium_intelligence(run_name="Plan finalizer") llm_middle_strength = llms_middle_strength[0].with_fallbacks(llms_middle_strength[1:]) llms_controller = init_llms_mini(run_name="Plan Files Controller") llm_controller = llms_controller[0].with_fallbacks(llms_controller[1:]) diff --git a/src/agents/researcher_agent.py b/src/agents/researcher_agent.py index c96ed5df..bdea126a 100644 --- a/src/agents/researcher_agent.py +++ b/src/agents/researcher_agent.py @@ -13,7 +13,7 @@ call_model, call_tool, ask_human, after_ask_human_condition, no_tools_msg ) from src.utilities.print_formatters import print_formatted -from src.utilities.llms import init_llms_mini +from src.utilities.llms import init_llms_medium_intelligence import os @@ -27,7 +27,7 @@ @tool def final_response_researcher( files_to_work_on: Annotated[List[str], "List of existing files to potentially introduce changes"], - reference_files: Annotated[List[str], "List of code files useful as a reference without images"], + reference_files: Annotated[List[str], "List of code files useful as a reference. There are files where similar task been implemented already."], template_images: Annotated[List[str], "List of template images"]): """That tool outputs list of files programmer will need to change and paths to graphical patterns if some. Use that tool only when you 100% sure you found all the files programmer will need to modify. @@ -65,7 +65,7 @@ def __init__(self, work_dir): self.tools = [see_file, list_dir, final_response_researcher] if vdb_available(): self.tools.append(retrieve_files_by_semantic_query) - self.llms = init_llms_mini(self.tools, "Researcher") + self.llms = init_llms_medium_intelligence(self.tools, "Researcher") # workflow definition researcher_workflow = StateGraph(AgentState) diff --git a/src/prompts/researcher_system.prompt b/src/prompts/researcher_system.prompt index bd5d036b..e53aaaa0 100644 --- a/src/prompts/researcher_system.prompt +++ b/src/prompts/researcher_system.prompt @@ -1,21 +1,26 @@ -As a curious filesystem researcher, examine files thoroughly, prioritizing comprehensive checks. -You checking a lot of different folders looking around for interesting files (hey, you are very curious!) before giving the final answer. -The more folders/files you will check, the more they will pay you. -When you discover significant dependencies from one file to another, ensure to inspect both. -Important: you are can not modify any files! You are reasearching only, but modifications will introduce another guys. Do not execute the task, just prepare ground for it's execution. -Your final selection should include files needed to be modified or needed as reference for a programmer -(for example to see how code in similar file implemented). -Avoid recommending unseen or non-existent files in final response. - -You need to point out all files programmer needed to see to execute the task and only that task. Task is: +As a curious filesystem researcher, thoroughly inspect the files for a task by following these steps: + +1. Break down the task to identify which parts of the application are responsible for executing it. Identify the root of the problem. + +2. Search through various folders to find all necessary files needed to modify for completing the task. Explore numerous folders and files to maximize your understanding. + +3. When you find significant dependencies between files, examine both thoroughly. + +4. Remember, you are only researching. Do not modify any files; modifications will be handled by others. Just prepare the groundwork for task execution. + +5. Also identify files that need to be used as a reference for a programmer. Reference files should include examples where similar tasks have been solved or similar coding tools been used and can serve as code guidance. + +6. Only include files that exist and are necessary for the task. You must not provide information about files you haven’t seen or that don’t exist. + +Lastly, list all files the programmer needs to see to execute the task and only include those relevant to this specific task: + ''' {task} ''' -Here is some additional info about project: +Here's some additional information about the project: ''' {project_rules} ''' -First, provide reasoning about results of your previous action. Think what do you need to find now in order to accomplish the task. -Next, call tool(s). You can use up to 3 tool cals simultaniousely to speed up research. \ No newline at end of file +First, think about what you need to find to accomplish the task based on past actions. Then, use up to 3 tools simultaneously to gather this information. \ No newline at end of file diff --git a/src/tools/rag/code_splitter.py b/src/tools/rag/code_splitter.py index a3dd6d90..9201e389 100644 --- a/src/tools/rag/code_splitter.py +++ b/src/tools/rag/code_splitter.py @@ -349,7 +349,6 @@ def split_code(code: str, extension: str, chunk_size: int = 1000): """Splits code for smaller elements as functions. That allows to describe functions for semantic retrieval tool.""" language = extension_to_language.get(extension) if not language: - print(f'problem with extension {extension}') return [] splitter = RecursiveCharacterTextSplitter.from_language( language=Language(language), chunk_size=chunk_size, chunk_overlap=0 diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 603ca265..0c96d3ba 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -28,8 +28,16 @@ def vdb_available(): return True if get_collection() else False -def retrieve(question): - # collection should be initialized once, in the class init +def retrieve(question: str) -> str: + """ + Retrieve files descriptions by semantic query. + + Parameters: + question (str): The query to retrieve information for. + + Returns: + str: A formatted response with file descriptions of found files. + """ collection = get_collection() retrieval = collection.query(query_texts=[question], n_results=8) reranked_docs = cohere_client.rerank( diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 0f4a36d8..2ddac8b4 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -118,6 +118,9 @@ def write_file_chunks_descriptions(subfolders_with_files=['/']): # get file extenstion extension = file_path.suffix.lstrip('.') file_chunks = split_code(file_content, extension) + # do not describe chunk of 1-chunk files + if len(file_chunks) <= 1: + continue descriptions = chain.batch([{'coderrules': coderrules, 'file_code': file_content, 'chunk_code': chunk} for chunk in file_chunks]) print(descriptions) diff --git a/src/utilities/llms.py b/src/utilities/llms.py index 3885dcc9..81fbf2ac 100644 --- a/src/utilities/llms.py +++ b/src/utilities/llms.py @@ -31,7 +31,7 @@ def llm_open_local_hosted(model): timeout=90, ) -def init_llms(tools=None, run_name="Clean Coder", temp=0): +def init_llms_medium_intelligence(tools=None, run_name="Clean Coder", temp=0): llms = [] if getenv("ANTHROPIC_API_KEY"): llms.append(ChatAnthropic(model='claude-3-5-sonnet-20241022', temperature=temp, timeout=60, max_tokens=2048)) diff --git a/src/utilities/manager_utils.py b/src/utilities/manager_utils.py index a2b2e560..1a2d05cd 100644 --- a/src/utilities/manager_utils.py +++ b/src/utilities/manager_utils.py @@ -6,7 +6,7 @@ from langchain_community.chat_models import ChatOllama from langchain_anthropic import ChatAnthropic from langchain_core.messages import HumanMessage, SystemMessage, ToolMessage, AIMessage -from src.utilities.llms import init_llms +from src.utilities.llms import init_llms_medium_intelligence from src.utilities.util_functions import join_paths, read_coderrules, list_directory_tree from src.utilities.start_project_functions import create_project_plan_file from langchain_core.output_parsers import StrOutputParser @@ -52,7 +52,7 @@ with open(f"{parent_dir}/prompts/manager_progress.prompt", "r") as f: tasks_progress_template = f.read() -llms = init_llms(run_name="Progress description") +llms = init_llms_medium_intelligence(run_name="Progress description") llm = llms[0].with_fallbacks(llms[1:]) From 28ff399ead8796ac32ad4a4a775e5ede5b4b1b39 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 4 Mar 2025 10:45:47 +0100 Subject: [PATCH 21/32] updates --- src/tools/rag/retrieval.py | 9 ++++++++- src/tools/rag/write_descriptions.py | 4 +++- src/utilities/llms.py | 7 ++++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 0c96d3ba..9d7b7bee 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -40,6 +40,13 @@ def retrieve(question: str) -> str: """ collection = get_collection() retrieval = collection.query(query_texts=[question], n_results=8) + response = "" + for i, description in enumerate(retrieval["documents"]): + filename = retrieval["ids"][0][i] + response += f"{filename}:\n\n{description}\n\n" + response += "\n\nRemember to see files before adding to final response!" + return response + reranked_docs = cohere_client.rerank( query=question, documents=retrieval["documents"][0], @@ -48,7 +55,7 @@ def retrieve(question: str) -> str: #return_documents=True, ) reranked_indexes = [result.index for result in reranked_docs.results] - response = "" + for index in reranked_indexes: filename = retrieval["ids"][0][index] description = retrieval["documents"][0][index] diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/write_descriptions.py index 2ddac8b4..b8b23c33 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/write_descriptions.py @@ -10,6 +10,7 @@ from src.utilities.start_work_functions import CoderIgnore, file_folder_ignored from src.utilities.llms import init_llms_mini from src.tools.rag.code_splitter import split_code +from src.utilities.print_formatters import print_formatted load_dotenv(find_dotenv()) @@ -74,7 +75,7 @@ def write_file_descriptions(subfolders_with_files=['/']): """ ) llms = init_llms_mini(tools=[], run_name='File Describer') - llm = llms[0] + llm = llms[0].with_fallbacks(llms[1:]) chain = prompt | llm | StrOutputParser() description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') @@ -134,6 +135,7 @@ def write_file_chunks_descriptions(subfolders_with_files=['/']): def upload_descriptions_to_vdb(): """Uploads descriptions, created by write_file_chunks_descriptions, into vector database.""" + print_formatted("Uploading file descriptions to vector storage...", color='magenta') chroma_client = chromadb.PersistentClient(path=join_paths(work_dir, '.clean_coder/chroma_base')) collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" diff --git a/src/utilities/llms.py b/src/utilities/llms.py index 81fbf2ac..ed81e8e5 100644 --- a/src/utilities/llms.py +++ b/src/utilities/llms.py @@ -34,9 +34,9 @@ def llm_open_local_hosted(model): def init_llms_medium_intelligence(tools=None, run_name="Clean Coder", temp=0): llms = [] if getenv("ANTHROPIC_API_KEY"): - llms.append(ChatAnthropic(model='claude-3-5-sonnet-20241022', temperature=temp, timeout=60, max_tokens=2048)) + llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) if getenv("OPENROUTER_API_KEY"): - llms.append(llm_open_router("anthropic/claude-3.5-sonnet")) + llms.append(llm_open_router("anthropic/claude-3.7-sonnet")) if getenv("OPENAI_API_KEY"): llms.append(ChatOpenAI(model="gpt-4o", temperature=temp, timeout=60)) # if os.getenv("GOOGLE_API_KEY"): @@ -79,12 +79,13 @@ def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2): llms.append(ChatOpenAI(model="o3-mini", temperature=1, timeout=60, reasoning_effort="high")) if os.getenv("OPENAI_API_KEY"): llms.append(ChatOpenAI(model="o1", temperature=1, timeout=60)) + if os.getenv("OPENROUTER_API_KEY"): llms.append(llm_open_router("openai/gpt-4o")) if os.getenv("OPENAI_API_KEY"): llms.append(ChatOpenAI(model="gpt-4o", temperature=temp, timeout=60)) if os.getenv("ANTHROPIC_API_KEY"): - llms.append(ChatAnthropic(model='claude-3-5-sonnet-20241022', temperature=temp, timeout=60, max_tokens=2048)) + llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) # if os.getenv("GOOGLE_API_KEY"): # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60)) if os.getenv("OLLAMA_MODEL"): From 7ea944e30b4554f414be105631bf50b9512eaf24 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 4 Mar 2025 11:01:53 +0100 Subject: [PATCH 22/32] updating semantic retrieval --- src/tools/rag/retrieval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 9d7b7bee..1e5d24a6 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -41,9 +41,9 @@ def retrieve(question: str) -> str: collection = get_collection() retrieval = collection.query(query_texts=[question], n_results=8) response = "" - for i, description in enumerate(retrieval["documents"]): + for i, description in enumerate(retrieval["documents"][0]): filename = retrieval["ids"][0][i] - response += f"{filename}:\n\n{description}\n\n" + response += f"{filename}:\n\n{description}\n\n###\n\n" response += "\n\nRemember to see files before adding to final response!" return response @@ -59,7 +59,7 @@ def retrieve(question: str) -> str: for index in reranked_indexes: filename = retrieval["ids"][0][index] description = retrieval["documents"][0][index] - response += f"{filename}:\n\n{description}\n\n" + response += f"{filename}:\n{description}\n\n###" response += "\n\nRemember to see files before adding to final response!" return response From c4dca440e711b4ee148c8c4c2071c2083204be85 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 4 Mar 2025 12:02:01 +0100 Subject: [PATCH 23/32] working on binary ranker --- src/tools/rag/retrieval.py | 52 ++++++ src/utilities/syntax_checker_functions.py | 195 +++++++++++++--------- 2 files changed, 172 insertions(+), 75 deletions(-) diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 1e5d24a6..0c2ced23 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -3,6 +3,9 @@ import chromadb from pathlib import Path from dotenv import load_dotenv, find_dotenv +from src.utilities.llms import init_llms_mini +from langchain.prompts import ChatPromptTemplate +from langchain_core.output_parsers import StrOutputParser load_dotenv(find_dotenv()) @@ -63,10 +66,59 @@ def retrieve(question: str) -> str: response += "\n\nRemember to see files before adding to final response!" return response +# New class added for binary ranking with lazy loading. +class BinaryRanker: + def __init__(self): + # Lazy-loaded chain; not initialized until rank() is called. + self.chain = None + + def initialize_chain(self): + if self.chain is None: + # Define prompt template for binary ranking. + template = ( + "You are a binary ranker. Evaluate the relevance of a document to a given question.\n" + "Question: {question}\n" + "Document: {document}\n\n" + "If the document is relevant to the question, output only '1'. " + "If it may be useful for programmer as contains similar code, but no relevant directly, also output '1'. " + "If it is not relevant at all, output only '0'." + ) + prompt = ChatPromptTemplate.from_template(template) + # Initialize LLMs with minimal intelligence and set run name to 'BinaryRanker' + llms = init_llms_mini(tools=[], run_name='BinaryRanker') + llm = llms[0].with_fallbacks(llms[1:]) + # Build the chain by combining the prompt template, the LLM instance, and StrOutputParser. + self.chain = prompt | llm | StrOutputParser() + + def rank(self, question: str, retrieval: dict) -> list: + # Ensure the chain is initialized (lazy loading) + self.initialize_chain() + # Extract list of documents and their ids from the retrieval result. + documents_list = retrieval["documents"][0] + id_list = retrieval["ids"][0] + # Build input for batch processing: list of dicts containing question and document. + batch_inputs = [] + for doc in documents_list: + batch_inputs.append({"question": question, "document": doc}) + # Use the chain batch function to get binary outputs. + results = self.chain.batch(batch_inputs) + # Pair each document id with its binary ranking result. + ranking = [] + for idx, result in enumerate(results): + ranking.append((id_list[idx], result.strip())) + return ranking if __name__ == "__main__": + # Example usage of BinaryRanker for testing. question = "Common styles, used in the main page" + collection = get_collection() + retrieval = collection.query(query_texts=[question], n_results=8) + binary_ranker = BinaryRanker() + ranking = binary_ranker.rank(question, retrieval) + print("Binary Ranking Results:", ranking) + + # Test the retrieve function results = retrieve(question) print("\n\n") print("results: ", results) diff --git a/src/utilities/syntax_checker_functions.py b/src/utilities/syntax_checker_functions.py index 8ca50316..bc6122a4 100644 --- a/src/utilities/syntax_checker_functions.py +++ b/src/utilities/syntax_checker_functions.py @@ -216,89 +216,134 @@ def parse_yaml(yaml_string): if __name__ == "__main__": code = """ - - - + }; + + fetchProfile(); + }, [uuid]); + + if (error) { + return ( +
+ {error} +
+ ); + } + + if (!profile) { + return ( +
+ Loading profile data... +
+ ); + } + + if (!profile.survey_data) { + return
No survey data available.
; + } + + return ( +
+
+ + +
- + {profile.survey_data.map((category) => ( +
+

+ {category.name} +

+ {category.statements.map((statement: any) => ( +
+ + {statement.value} + +

+ {statement.text} +

+
+ ))} +
+ ))} +
+ ); +} """ - print(parse_vue_basic(code)) \ No newline at end of file + print(parse_tsx(code)) \ No newline at end of file From de89d423037b25f6115aa7c18618e7986586c8c9 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Tue, 4 Mar 2025 12:54:23 +0100 Subject: [PATCH 24/32] binary ranker for semantic retrieval done --- src/prompts/binary_ranker.prompt | 8 +++ src/tools/rag/retrieval.py | 102 +++++++++++++++++++------------ 2 files changed, 71 insertions(+), 39 deletions(-) create mode 100644 src/prompts/binary_ranker.prompt diff --git a/src/prompts/binary_ranker.prompt b/src/prompts/binary_ranker.prompt new file mode 100644 index 00000000..b5110a6b --- /dev/null +++ b/src/prompts/binary_ranker.prompt @@ -0,0 +1,8 @@ +You are a binary ranker. Evaluate if document can contain answer for a given question. +Question: """{question}""" +Filename: """{filename}""" +Document: """{document}""" + +If the document is relevant to the question, output only '1'. +If it may be useful for programmer as contains similar code, but no relevant directly, also output only '1'. +If it is not relevant at all, output only '0'. diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 0c2ced23..3498acdb 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -43,47 +43,61 @@ def retrieve(question: str) -> str: """ collection = get_collection() retrieval = collection.query(query_texts=[question], n_results=8) + + # Use BinaryRanker to filter relevant documents + binary_ranker = BinaryRanker() + ranking_results = binary_ranker.rank(question, retrieval) + + # Filter documents that are marked as relevant (score = '1') response = "" - for i, description in enumerate(retrieval["documents"][0]): - filename = retrieval["ids"][0][i] - response += f"{filename}:\n\n{description}\n\n###\n\n" + for filename, score in ranking_results: + if score == '1': + # Find the corresponding document in the retrieval results + idx = retrieval["ids"][0].index(filename) + description = retrieval["documents"][0][idx] + response += f"{filename}:\n\n{description}\n\n###\n\n" + + # If no relevant documents found, return a message + if not response: + return "No relevant documents found for your query." + response += "\n\nRemember to see files before adding to final response!" return response - reranked_docs = cohere_client.rerank( - query=question, - documents=retrieval["documents"][0], - top_n=4, - model="rerank-english-v3.0", - #return_documents=True, - ) - reranked_indexes = [result.index for result in reranked_docs.results] - - for index in reranked_indexes: - filename = retrieval["ids"][0][index] - description = retrieval["documents"][0][index] - response += f"{filename}:\n{description}\n\n###" - response += "\n\nRemember to see files before adding to final response!" - return response # New class added for binary ranking with lazy loading. class BinaryRanker: + """ + A binary document ranker that uses LLM to determine document relevance. + + This class implements lazy loading of the LLM chain, meaning the chain + is only initialized when the rank method is called. It evaluates whether + each document is relevant to a given question, returning a binary score + (0 or 1) for each document. + """ def __init__(self): + """ + Initialize the BinaryRanker with lazy loading. + + The LLM chain is not created until the rank method is called. + """ # Lazy-loaded chain; not initialized until rank() is called. self.chain = None def initialize_chain(self): + """ + Initialize the LLM chain if it hasn't been initialized yet. + + This method loads the prompt template from an external file, initializes the LLM, + and builds the chain used for binary document ranking. + """ if self.chain is None: - # Define prompt template for binary ranking. - template = ( - "You are a binary ranker. Evaluate the relevance of a document to a given question.\n" - "Question: {question}\n" - "Document: {document}\n\n" - "If the document is relevant to the question, output only '1'. " - "If it may be useful for programmer as contains similar code, but no relevant directly, also output '1'. " - "If it is not relevant at all, output only '0'." - ) - prompt = ChatPromptTemplate.from_template(template) + # Load the binary ranker prompt from an external file. + grandparent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + file_path = f"{grandparent_dir}/prompts/binary_ranker.prompt" + with open(file_path, 'r') as file_handle: + template_text = file_handle.read() + prompt = ChatPromptTemplate.from_template(template_text) # Initialize LLMs with minimal intelligence and set run name to 'BinaryRanker' llms = init_llms_mini(tools=[], run_name='BinaryRanker') llm = llms[0].with_fallbacks(llms[1:]) @@ -91,32 +105,42 @@ def initialize_chain(self): self.chain = prompt | llm | StrOutputParser() def rank(self, question: str, retrieval: dict) -> list: + """ + Rank documents based on their relevance to the question. + + Parameters: + question (str): The query to evaluate document relevance against. + retrieval (dict): The retrieval results from a vector database query. + + Returns: + list: A list of tuples containing document IDs and their binary relevance scores ('0' or '1'). + """ # Ensure the chain is initialized (lazy loading) self.initialize_chain() # Extract list of documents and their ids from the retrieval result. documents_list = retrieval["documents"][0] - id_list = retrieval["ids"][0] - # Build input for batch processing: list of dicts containing question and document. + filenames_list = retrieval["ids"][0] + # Build input for batch processing: list of dicts containing question, filename, and document. batch_inputs = [] - for doc in documents_list: - batch_inputs.append({"question": question, "document": doc}) + for idx, doc in enumerate(documents_list): + batch_inputs.append({ + "question": question, + "filename": filenames_list[idx], + "document": doc + }) # Use the chain batch function to get binary outputs. results = self.chain.batch(batch_inputs) # Pair each document id with its binary ranking result. ranking = [] for idx, result in enumerate(results): - ranking.append((id_list[idx], result.strip())) + ranking.append((filenames_list[idx], result.strip())) return ranking if __name__ == "__main__": # Example usage of BinaryRanker for testing. - question = "Common styles, used in the main page" - collection = get_collection() - retrieval = collection.query(query_texts=[question], n_results=8) - binary_ranker = BinaryRanker() - ranking = binary_ranker.rank(question, retrieval) - print("Binary Ranking Results:", ranking) + question = "Some tool that can change files" + # Test the retrieve function results = retrieve(question) From 67ec89045a3a9228ea5759372bb6d7682fd1835a Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Wed, 5 Mar 2025 10:42:02 +0100 Subject: [PATCH 25/32] questionaru to index added --- manager.py | 5 ++- single_task_coder.py | 8 +++-- ...riptions.py => index_file_descriptions.py} | 31 ++++++++++++++++--- src/tools/rag/retrieval.py | 22 ++++++------- src/utilities/manager_utils.py | 2 +- 5 files changed, 47 insertions(+), 21 deletions(-) rename src/tools/rag/{write_descriptions.py => index_file_descriptions.py} (86%) diff --git a/manager.py b/manager.py index 9c60d830..279d9835 100644 --- a/manager.py +++ b/manager.py @@ -34,7 +34,11 @@ class Manager: def __init__(self): load_dotenv(find_dotenv()) self.work_dir = os.getenv("WORK_DIR") + # initial project setup set_up_dot_clean_coder_dir(self.work_dir) + setup_todoist_project_if_needed() + + self.tools = self.prepare_tools() self.llms = init_llms_medium_intelligence(tools=self.tools, run_name="Manager") self.manager = self.setup_workflow() @@ -113,7 +117,6 @@ def setup_workflow(self): def run(self): print_formatted("😀 Hello! I'm Manager agent. Let's plan your project together!", color="green") - setup_todoist_project_if_needed() messages = get_manager_messages(self.saved_messages_path) inputs = {"messages": messages} diff --git a/single_task_coder.py b/single_task_coder.py index 6cfc59bd..1bdd57b7 100644 --- a/single_task_coder.py +++ b/single_task_coder.py @@ -17,6 +17,7 @@ from src.utilities.start_project_functions import set_up_dot_clean_coder_dir from src.utilities.util_functions import create_frontend_feedback_story from concurrent.futures import ThreadPoolExecutor +from src.tools.rag.index_file_descriptions import prompt_index_project_files use_frontend_feedback = bool(os.getenv("FRONTEND_URL")) @@ -54,8 +55,9 @@ def run_clean_coder_pipeline(task: str, work_dir: str, doc_harvest: bool = False if __name__ == "__main__": work_dir = os.getenv("WORK_DIR") - set_up_dot_clean_coder_dir(work_dir) - task = user_input("Provide task to be executed. ") if not work_dir: raise Exception("WORK_DIR variable not provided. Please add WORK_DIR to .env file") - run_clean_coder_pipeline(task, work_dir) \ No newline at end of file + set_up_dot_clean_coder_dir(work_dir) + prompt_index_project_files() + task = user_input("Provide task to be executed. ") + run_clean_coder_pipeline(task, work_dir) diff --git a/src/tools/rag/write_descriptions.py b/src/tools/rag/index_file_descriptions.py similarity index 86% rename from src/tools/rag/write_descriptions.py rename to src/tools/rag/index_file_descriptions.py index b8b23c33..3d31611f 100644 --- a/src/tools/rag/write_descriptions.py +++ b/src/tools/rag/index_file_descriptions.py @@ -5,13 +5,15 @@ from dotenv import load_dotenv, find_dotenv import chromadb import sys +import questionary sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) from src.utilities.util_functions import join_paths, read_coderrules -from src.utilities.start_work_functions import CoderIgnore, file_folder_ignored +from src.utilities.start_work_functions import file_folder_ignored from src.utilities.llms import init_llms_mini from src.tools.rag.code_splitter import split_code from src.utilities.print_formatters import print_formatted - +from src.tools.rag.retrieval import vdb_available +from src.utilities.manager_utils import QUESTIONARY_STYLE load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") @@ -158,9 +160,30 @@ def upload_descriptions_to_vdb(): ) -if __name__ == '__main__': +def prompt_index_project_files(): + """ + Checks if the vector database (VDB) is available. + If not, prompts the user via questionary to index project files for better search. + On a "Yes" answer, triggers write_and_index_descriptions(). + """ + if not vdb_available(): + answer = questionary.select( + "Do you want to index your project files for better search?", + choices=["Index", "Skip"], + style=QUESTIONARY_STYLE, + instruction="\nHint: Skip if you're running Clean Coder for the first time and testing, index if you're working on a real project" + ).ask() + if answer == "Index": + write_and_index_descriptions() + + +def write_and_index_descriptions(): #provide optionally which subfolders needs to be checked, if you don't want to describe all project folder write_file_descriptions(subfolders_with_files=['/']) write_file_chunks_descriptions() - upload_descriptions_to_vdb() \ No newline at end of file + upload_descriptions_to_vdb() + + +if __name__ == "__main__": + write_and_index_descriptions() \ No newline at end of file diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 3498acdb..056fb8ae 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -10,21 +10,19 @@ load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") -cohere_key = os.getenv("COHERE_API_KEY") -if cohere_key: - cohere_client = cohere.Client(cohere_key) +# cohere_key = os.getenv("COHERE_API_KEY") +# if cohere_key: +# cohere_client = cohere.Client(cohere_key) collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" def get_collection(): - if cohere_key: - chroma_client = chromadb.PersistentClient(path=os.getenv('WORK_DIR') + '/.clean_coder/chroma_base') - try: - return chroma_client.get_collection(name=collection_name) - except: - # print("Vector database does not exist. (Optional) create it by running src/tools/rag/write_descriptions.py to improve file research capabilities") - return False - return False + chroma_client = chromadb.PersistentClient(path=os.getenv('WORK_DIR') + '/.clean_coder/chroma_base') + try: + return chroma_client.get_collection(name=collection_name) + except: + # print("Vector database does not exist. (Optional) create it by running src/tools/rag/write_descriptions.py to improve file research capabilities") + return False def vdb_available(): @@ -68,7 +66,7 @@ def retrieve(question: str) -> str: # New class added for binary ranking with lazy loading. class BinaryRanker: """ - A binary document ranker that uses LLM to determine document relevance. + A binary document ranker that uses LLM to determine if a document is relevant. This class implements lazy loading of the LLM chain, meaning the chain is only initialized when the rank method is called. It evaluates whether diff --git a/src/utilities/manager_utils.py b/src/utilities/manager_utils.py index 1a2d05cd..364d6c4b 100644 --- a/src/utilities/manager_utils.py +++ b/src/utilities/manager_utils.py @@ -41,7 +41,7 @@ ('highlighted', 'fg:green bold'), # Highlighted choice ('selected', 'fg:green bold'), # Selected choice ('separator', 'fg:magenta'), # Separator between choices - ('instruction', 'fg:white'), # Additional instructions + ('instruction', 'fg:#FFD700'), # Additional instructions now in golden yellow (hex color) ]) From 0d5190ce735d8627abe47fef9f46343773e41485 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Wed, 5 Mar 2025 10:44:58 +0100 Subject: [PATCH 26/32] questionaru to index added --- src/tools/rag/index_file_descriptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tools/rag/index_file_descriptions.py b/src/tools/rag/index_file_descriptions.py index 3d31611f..14e3b81f 100644 --- a/src/tools/rag/index_file_descriptions.py +++ b/src/tools/rag/index_file_descriptions.py @@ -171,7 +171,7 @@ def prompt_index_project_files(): "Do you want to index your project files for better search?", choices=["Index", "Skip"], style=QUESTIONARY_STYLE, - instruction="\nHint: Skip if you're running Clean Coder for the first time and testing, index if you're working on a real project" + instruction="\nHint: Skip for testing Clean Coder; index for real projects." ).ask() if answer == "Index": write_and_index_descriptions() From 58c267e25c18a34da2cfce735b7cc910fab238e8 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Thu, 6 Mar 2025 01:17:13 +0100 Subject: [PATCH 27/32] p bar --- src/tools/rag/index_file_descriptions.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/tools/rag/index_file_descriptions.py b/src/tools/rag/index_file_descriptions.py index 14e3b81f..339d3f28 100644 --- a/src/tools/rag/index_file_descriptions.py +++ b/src/tools/rag/index_file_descriptions.py @@ -6,6 +6,7 @@ import chromadb import sys import questionary +from rich.progress import Progress sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) from src.utilities.util_functions import join_paths, read_coderrules from src.utilities.start_work_functions import file_folder_ignored @@ -54,9 +55,9 @@ def collect_file_pathes(subfolders, work_dir): return allowed_files -def write_file_descriptions(subfolders_with_files=['/']): +def write_file_descriptions(subfolders_with_files=['/']): all_files = collect_file_pathes(subfolders_with_files, work_dir) - + progress = Progress() coderrules = read_coderrules() prompt = ChatPromptTemplate.from_template( @@ -82,12 +83,13 @@ def write_file_descriptions(subfolders_with_files=['/']): description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') Path(description_folder).mkdir(parents=True, exist_ok=True) - # iterate over all files, take 8 files at once and descrive files in batch batch_size = 8 + task_progress = progress.add_task("[gold1]Describing files (0/{})".format(len(all_files)), total=len(all_files)) + progress.start() + for i in range(0, len(all_files), batch_size): files_iteration = all_files[i:i + batch_size] descriptions = chain.batch([get_content(file_path) for file_path in files_iteration]) - print(descriptions) for file_path, description in zip(files_iteration, descriptions): file_name = file_path.relative_to(work_dir).as_posix().replace('/', '=') @@ -95,6 +97,9 @@ def write_file_descriptions(subfolders_with_files=['/']): with open(output_path, 'w', encoding='utf-8') as out_file: out_file.write(description) + files_processed = progress.tasks[task_progress].completed + 1 + progress.update(task_progress, advance=1, description=f"[gold1]Describing files ({files_processed}/{len(all_files)})") + progress.stop() From fd699831aeddb1b91d41e43a55e1d2797ea2a985 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Thu, 6 Mar 2025 09:02:20 +0100 Subject: [PATCH 28/32] progres bars for indexing done --- src/tools/rag/index_file_descriptions.py | 29 ++++++++++++++++-------- src/utilities/llms.py | 19 ++++++---------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/tools/rag/index_file_descriptions.py b/src/tools/rag/index_file_descriptions.py index 339d3f28..65c70179 100644 --- a/src/tools/rag/index_file_descriptions.py +++ b/src/tools/rag/index_file_descriptions.py @@ -6,7 +6,6 @@ import chromadb import sys import questionary -from rich.progress import Progress sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))) from src.utilities.util_functions import join_paths, read_coderrules from src.utilities.start_work_functions import file_folder_ignored @@ -15,10 +14,21 @@ from src.utilities.print_formatters import print_formatted from src.tools.rag.retrieval import vdb_available from src.utilities.manager_utils import QUESTIONARY_STYLE +from tqdm import tqdm load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") +GOLDEN = "\033[38;5;220m" +MAGENTA = "\033[95m" +RESET = "\033[0m" + +# Customize tqdm's bar format with golden and magenta colors +bar_format = ( + f"{GOLDEN}{{desc}}: {MAGENTA}{{percentage:3.0f}}%{GOLDEN}|" + f"{{bar}}| {MAGENTA}{{n_fmt}}/{{total_fmt}} files " + f"{GOLDEN}[{{elapsed}}<{{remaining}}, {{rate_fmt}}{{postfix}}]{RESET}" +) def is_code_file(file_path): # List of common code file extensions @@ -57,7 +67,6 @@ def collect_file_pathes(subfolders, work_dir): def write_file_descriptions(subfolders_with_files=['/']): all_files = collect_file_pathes(subfolders_with_files, work_dir) - progress = Progress() coderrules = read_coderrules() prompt = ChatPromptTemplate.from_template( @@ -84,8 +93,7 @@ def write_file_descriptions(subfolders_with_files=['/']): description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') Path(description_folder).mkdir(parents=True, exist_ok=True) batch_size = 8 - task_progress = progress.add_task("[gold1]Describing files (0/{})".format(len(all_files)), total=len(all_files)) - progress.start() + pbar = tqdm(total=len(all_files), desc=f"[1/2]Describing files", bar_format=bar_format) for i in range(0, len(all_files), batch_size): files_iteration = all_files[i:i + batch_size] @@ -97,9 +105,11 @@ def write_file_descriptions(subfolders_with_files=['/']): with open(output_path, 'w', encoding='utf-8') as out_file: out_file.write(description) - files_processed = progress.tasks[task_progress].completed + 1 - progress.update(task_progress, advance=1, description=f"[gold1]Describing files ({files_processed}/{len(all_files)})") - progress.stop() + + # Update by actual number of files processed in this batch + pbar.update(len(files_iteration)) + + pbar.close() # Don't forget to close the progress bar when done @@ -120,8 +130,10 @@ def write_file_chunks_descriptions(subfolders_with_files=['/']): description_folder = join_paths(work_dir, '.clean_coder/files_and_folders_descriptions') Path(description_folder).mkdir(parents=True, exist_ok=True) + # iterate chunks inside of the file - for file_path in all_files: + for file_path in tqdm(all_files, desc=f"[2/2]Describing file chunks", + bar_format=bar_format): file_content = get_content(file_path) # get file extenstion extension = file_path.suffix.lstrip('.') @@ -130,7 +142,6 @@ def write_file_chunks_descriptions(subfolders_with_files=['/']): if len(file_chunks) <= 1: continue descriptions = chain.batch([{'coderrules': coderrules, 'file_code': file_content, 'chunk_code': chunk} for chunk in file_chunks]) - print(descriptions) for nr, description in enumerate(descriptions): file_name = f"{file_path.relative_to(work_dir).as_posix().replace('/', '=')}_chunk{nr}" diff --git a/src/utilities/llms.py b/src/utilities/llms.py index ed81e8e5..3c4e15e9 100644 --- a/src/utilities/llms.py +++ b/src/utilities/llms.py @@ -34,13 +34,12 @@ def llm_open_local_hosted(model): def init_llms_medium_intelligence(tools=None, run_name="Clean Coder", temp=0): llms = [] if getenv("ANTHROPIC_API_KEY"): - llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) + llms.append(ChatAnthropic(model='claude-3-5-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) if getenv("OPENROUTER_API_KEY"): - llms.append(llm_open_router("anthropic/claude-3.7-sonnet")) + llms.append(llm_open_router("anthropic/claude-3.5-sonnet")) if getenv("OPENAI_API_KEY"): llms.append(ChatOpenAI(model="gpt-4o", temperature=temp, timeout=60)) - # if os.getenv("GOOGLE_API_KEY"): - # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60)) + if getenv("OLLAMA_MODEL"): llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL"))) if getenv("LOCAL_MODEL_API_BASE"): @@ -75,19 +74,15 @@ def init_llms_mini(tools=None, run_name="Clean Coder", temp=0): def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2): llms = [] + if os.getenv("ANTHROPIC_API_KEY"): + llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) + if getenv("OPENROUTER_API_KEY"): + llms.append(llm_open_router("anthropic/claude-3.7-sonnet")) if os.getenv("OPENAI_API_KEY"): llms.append(ChatOpenAI(model="o3-mini", temperature=1, timeout=60, reasoning_effort="high")) if os.getenv("OPENAI_API_KEY"): llms.append(ChatOpenAI(model="o1", temperature=1, timeout=60)) - if os.getenv("OPENROUTER_API_KEY"): - llms.append(llm_open_router("openai/gpt-4o")) - if os.getenv("OPENAI_API_KEY"): - llms.append(ChatOpenAI(model="gpt-4o", temperature=temp, timeout=60)) - if os.getenv("ANTHROPIC_API_KEY"): - llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) - # if os.getenv("GOOGLE_API_KEY"): - # llms.append(ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=temp, timeout=60)) if os.getenv("OLLAMA_MODEL"): llms.append(ChatOllama(model=os.getenv("OLLAMA_MODEL"))) if getenv("LOCAL_MODEL_API_BASE"): From e912cdae9354f71c82e0ed56bdaf371a804882f1 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Fri, 7 Mar 2025 08:43:17 +0100 Subject: [PATCH 29/32] indexing added to manager, bug with cutting parenthesis solved --- .env.template | 7 ++----- docker-compose.yml | 2 -- manager.py | 3 ++- non_src/tests/manual_tests/planer_scenario_1.py | 4 ++-- requirements.txt | 2 -- src/agents/planner_agent.py | 2 +- src/prompts/planner_system.prompt | 8 +++----- src/tools/rag/retrieval.py | 6 ------ src/utilities/util_functions.py | 4 ++-- 9 files changed, 12 insertions(+), 26 deletions(-) diff --git a/.env.template b/.env.template index 4898bdf8..38fb491d 100644 --- a/.env.template +++ b/.env.template @@ -11,14 +11,11 @@ OLLAMA_MODEL= LOCAL_MODEL_API_BASE= LOCAL_MODEL_NAME= -# Optional, but highly recommended -## For RAG tool of Researcher -COHERE_API_KEY= - -# Optional ## For Manager agent TODOIST_API_KEY= TODOIST_PROJECT_ID= + +# Optional ## For automatic error check LOG_FILE= ## Frontend Feedback diff --git a/docker-compose.yml b/docker-compose.yml index 9fb66bc0..50f50145 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,7 +10,6 @@ services: - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - TODOIST_API_KEY=${TODOIST_API_KEY} - TODOIST_PROJECT_ID=${TODOIST_PROJECT_ID} - - COHERE_API_KEY=${COHERE_API_KEY} - LOG_FILE=${LOG_FILE:-} volumes: - .:/Clean_Coder @@ -29,7 +28,6 @@ services: - WORK_DIR=/work_dir - OPENAI_API_KEY=${OPENAI_API_KEY} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - - COHERE_API_KEY=${COHERE_API_KEY} - LOG_FILE=${LOG_FILE:-} volumes: - .:/Clean_Coder diff --git a/manager.py b/manager.py index 279d9835..1cfea36c 100644 --- a/manager.py +++ b/manager.py @@ -16,6 +16,7 @@ from langgraph.graph import StateGraph from src.tools.tools_project_manager import add_task, modify_task, finish_project_planning, reorder_tasks from src.tools.tools_coder_pipeline import prepare_list_dir_tool, prepare_see_file_tool, ask_human_tool +from src.tools.rag.index_file_descriptions import prompt_index_project_files from src.utilities.manager_utils import actualize_tasks_list_and_progress_description, setup_todoist_project_if_needed, get_manager_messages from src.utilities.langgraph_common_functions import call_model, call_tool, multiple_tools_msg, no_tools_msg, empty_message_msg from src.utilities.start_project_functions import set_up_dot_clean_coder_dir @@ -37,7 +38,7 @@ def __init__(self): # initial project setup set_up_dot_clean_coder_dir(self.work_dir) setup_todoist_project_if_needed() - + prompt_index_project_files() self.tools = self.prepare_tools() self.llms = init_llms_medium_intelligence(tools=self.tools, run_name="Manager") diff --git a/non_src/tests/manual_tests/planer_scenario_1.py b/non_src/tests/manual_tests/planer_scenario_1.py index df9322f0..bfe7c81e 100644 --- a/non_src/tests/manual_tests/planer_scenario_1.py +++ b/non_src/tests/manual_tests/planer_scenario_1.py @@ -11,8 +11,8 @@ load_dotenv(find_dotenv()) -folder_with_project_files = repo_directory.joinpath("non_src/tests/manual_tests/projects_files", "debugger_scenario_1_files") -tmp_folder = pathlib.Path(__file__).parent.resolve().joinpath("sandbox_work_dir") +folder_with_project_files = repo_directory.joinpath("non_src/tests/manual_tests/projects_files", "planner_scenario_1_files") +tmp_folder = pathlib.Path(__file__).parent.resolve().joinpath("sandbox_work_dir") setup_work_dir(manual_tests_folder=tmp_folder, test_files_dir=folder_with_project_files) task = "Make form wider, with green background. Improve styling." diff --git a/requirements.txt b/requirements.txt index 32cdcfe1..61f23600 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,8 +10,6 @@ langchain-ollama==0.2.0 playwright==1.47.0 libsass==0.23.0 openai==1.61.1 -cohere==5.10.0 -langchain-cohere==0.3.0 chromadb==0.4.21 todoist-api-python==2.1.7 termcolor==2.4.0 diff --git a/src/agents/planner_agent.py b/src/agents/planner_agent.py index 007e23bb..408111ab 100644 --- a/src/agents/planner_agent.py +++ b/src/agents/planner_agent.py @@ -67,7 +67,7 @@ def call_advanced_planner(state): logic_pseudocode = llm_strong.invoke(logic_planner_messages) print_formatted("\nIntermediate planning done. Finalizing plan...", color="light_magenta") if os.getenv("SHOW_LOGIC_PLAN"): - print(logic_pseudocode.content) + print_formatted(logic_pseudocode.content, color="light_yellow") state["plan_finalizer_messages"].append(HumanMessage(content=f"Logic pseudocode plan to follow:\n\n{logic_pseudocode.content}")) plan_finalizer_messages = state["plan_finalizer_messages"] diff --git a/src/prompts/planner_system.prompt b/src/prompts/planner_system.prompt index 1d6d4165..33496241 100644 --- a/src/prompts/planner_system.prompt +++ b/src/prompts/planner_system.prompt @@ -20,16 +20,13 @@ For additional context, here's the directory tree: Instructions: -1. Plan the logic: - Outline the logic algorithm before proposing code changes. - -2. Draft a detailed modification plan: +1. Draft a detailed modification plan: - Prioritize readability - Follow the DRY (Don't Repeat Yourself) principle - Use meaningful variable names - Write concise code -3. Format code snippets in your plan properly: +2. Format code snippets in your plan properly: In your code snippets, follow udiff format with filename we working on in the header. For each code modification, use the following structure: ```filename.extension @@ -43,3 +40,4 @@ Instructions: Remember: - If you're unsure how to implement a given task, don't improvise. Simply state that you don't know. Assuming is not allowed - just tell "please provide me with more files" when needed. - When adjusting your plan based on user feedback, always provide a complete version of the plan, referenced to original file contents. Don't reference previous plan. +- Previous plan proposition have not been implemented. Always reference your code changes to code files you have in the context, not to the previous plan proposition. diff --git a/src/tools/rag/retrieval.py b/src/tools/rag/retrieval.py index 056fb8ae..5eef5de8 100644 --- a/src/tools/rag/retrieval.py +++ b/src/tools/rag/retrieval.py @@ -1,5 +1,4 @@ import os -import cohere import chromadb from pathlib import Path from dotenv import load_dotenv, find_dotenv @@ -10,9 +9,6 @@ load_dotenv(find_dotenv()) work_dir = os.getenv("WORK_DIR") -# cohere_key = os.getenv("COHERE_API_KEY") -# if cohere_key: -# cohere_client = cohere.Client(cohere_key) collection_name = f"clean_coder_{Path(work_dir).name}_file_descriptions" @@ -138,8 +134,6 @@ def rank(self, question: str, retrieval: dict) -> list: if __name__ == "__main__": # Example usage of BinaryRanker for testing. question = "Some tool that can change files" - - # Test the retrieve function results = retrieve(question) print("\n\n") diff --git a/src/utilities/util_functions.py b/src/utilities/util_functions.py index 1db64783..57c648e1 100644 --- a/src/utilities/util_functions.py +++ b/src/utilities/util_functions.py @@ -63,9 +63,9 @@ def watch_file(filename, work_dir, line_numbers=True): except FileNotFoundError: return "File not exists." if line_numbers: - formatted_lines = [f"{i + 1}|{line[:-1]} |{i+1}\n" for i, line in enumerate(lines)] + formatted_lines = [f"{i + 1}|{line.rstrip()} |{i+1}\n" for i, line in enumerate(lines)] else: - formatted_lines = [f"{line[:-1]}\n" for line in lines] + formatted_lines = [f"{line.rstrip()}\n" for line in lines] file_content = "".join(formatted_lines) file_content = filename + ":\n\n" + file_content From ec843aa76acfeaeeeae68f83600dad5c78635576 Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Sun, 9 Mar 2025 15:18:33 +0100 Subject: [PATCH 30/32] improvements --- .env.template | 5 ++--- src/utilities/llms.py | 2 +- src/utilities/manager_utils.py | 1 + src/utilities/print_formatters.py | 3 +++ 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.env.template b/.env.template index 38fb491d..9eaf1a43 100644 --- a/.env.template +++ b/.env.template @@ -6,7 +6,6 @@ WORK_DIR= ANTHROPIC_API_KEY= OPENAI_API_KEY= OPENROUTER_API_KEY= -GOOGLE_API_KEY= OLLAMA_MODEL= LOCAL_MODEL_API_BASE= LOCAL_MODEL_NAME= @@ -25,8 +24,8 @@ EDIT_TRANSCRIPTION= ## Show planner intermediate reasoning SHOW_LOGIC_PLAN= -# optional - LLM observability -LANGCHAIN_TRACING_V2=true +# Optional - LLM observability +LANGCHAIN_TRACING_V2= LANGCHAIN_ENDPOINT="https://api.smith.langchain.com" LANGCHAIN_API_KEY= LANGCHAIN_PROJECT= \ No newline at end of file diff --git a/src/utilities/llms.py b/src/utilities/llms.py index 3c4e15e9..9d6dfeda 100644 --- a/src/utilities/llms.py +++ b/src/utilities/llms.py @@ -75,7 +75,7 @@ def init_llms_mini(tools=None, run_name="Clean Coder", temp=0): def init_llms_high_intelligence(tools=None, run_name="Clean Coder", temp=0.2): llms = [] if os.getenv("ANTHROPIC_API_KEY"): - llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=2048)) + llms.append(ChatAnthropic(model='claude-3-7-sonnet-latest', temperature=temp, timeout=60, max_tokens=4096)) if getenv("OPENROUTER_API_KEY"): llms.append(llm_open_router("anthropic/claude-3.7-sonnet")) if os.getenv("OPENAI_API_KEY"): diff --git a/src/utilities/manager_utils.py b/src/utilities/manager_utils.py index 364d6c4b..8a6372a3 100644 --- a/src/utilities/manager_utils.py +++ b/src/utilities/manager_utils.py @@ -76,6 +76,7 @@ def fetch_epics(): def fetch_tasks(): + print("pies") return todoist_api.get_tasks(project_id=os.getenv('TODOIST_PROJECT_ID')) diff --git a/src/utilities/print_formatters.py b/src/utilities/print_formatters.py index 618e0998..e9c8825e 100644 --- a/src/utilities/print_formatters.py +++ b/src/utilities/print_formatters.py @@ -10,6 +10,9 @@ def print_formatted_content_planner(content): + """ + Prints output of planner module. Highlights code snippets in diff. + """ parts = content.split('```') outside_texts = parts[::2] code_snippets = parts[1::2] From f0b3e6e4244747b09b28ba5ba4e654551717b47e Mon Sep 17 00:00:00 2001 From: Grigorij Dudnik Date: Mon, 10 Mar 2025 17:38:58 +0100 Subject: [PATCH 31/32] 2-step indexing --- src/tools/rag/index_file_descriptions.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/tools/rag/index_file_descriptions.py b/src/tools/rag/index_file_descriptions.py index 65c70179..a68c1915 100644 --- a/src/tools/rag/index_file_descriptions.py +++ b/src/tools/rag/index_file_descriptions.py @@ -180,14 +180,23 @@ def prompt_index_project_files(): """ Checks if the vector database (VDB) is available. If not, prompts the user via questionary to index project files for better search. - On a "Yes" answer, triggers write_and_index_descriptions(). + Then asks if yous sure he want to do indexing. Then triggers write_and_index_descriptions(). """ - if not vdb_available(): + if vdb_available(): + return + answer = questionary.select( + "Do you want to index your project files for improving file search?", + choices=["Proceed", "Skip"], + style=QUESTIONARY_STYLE, + instruction="\nHint: Skip for testing Clean Coder; index for real projects." + ).ask() + if answer == "Proceed": + nr_of_files = len(collect_file_pathes(['/'], work_dir)) answer = questionary.select( - "Do you want to index your project files for better search?", + f"Going to index {nr_of_files} files. Indexing could be time-consuming and costly. Are you ready to go?", choices=["Index", "Skip"], style=QUESTIONARY_STYLE, - instruction="\nHint: Skip for testing Clean Coder; index for real projects." + instruction="\nHint: Ensure you provided all files and directories you don't want to index in {WORK_DIR}/.clean_coder/.coderignore to avoid describing trashy files." ).ask() if answer == "Index": write_and_index_descriptions() From 8a522716f9070b03e93561be9f99862b053f36f2 Mon Sep 17 00:00:00 2001 From: Wiktor Balcerzak <33687465+LilKeyboard@users.noreply.github.com> Date: Fri, 14 Mar 2025 10:59:24 +0100 Subject: [PATCH 32/32] Update user_input.py I added multiline input feature --- src/utilities/user_input.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/utilities/user_input.py b/src/utilities/user_input.py index 573ce3ed..6d371b6c 100644 --- a/src/utilities/user_input.py +++ b/src/utilities/user_input.py @@ -3,13 +3,18 @@ from src.utilities.voice_utils import VoiceRecorder import keyboard import readline +import sys recorder = VoiceRecorder() def user_input(prompt=""): - print_formatted(prompt + "Or use (m)icrophone to tell:", color="cyan", bold=True) + print_formatted(prompt + "Or use (m)icrophone to tell, or press Enter for multiline input:", color="cyan", bold=True) + + if not sys.stdin.isatty(): + return sys.stdin.read().strip() + user_sentence = input() if user_sentence == 'm': if not os.getenv("OPENAI_API_KEY"): @@ -26,7 +31,15 @@ def user_input(prompt=""): else: print_formatted("Install 'sudo apt-get install libportaudio2' (Linux) or 'brew install portaudio' (Mac) to use microphone feature.", color="red") user_sentence = input() - + elif user_sentence == '' or '\n' in user_sentence: + if user_sentence: + return user_sentence + print_formatted("Enter your multiline text (end with Ctrl+D on Unix or Ctrl+Z on Windows):", color="green") + try: + user_sentence = sys.stdin.read().strip() + except KeyboardInterrupt: + return "" + return user_sentence