Merge pull request #37 from BrunoV21/get-duplicate-protection

BrunoV21 · web-flow · commit ef020a2b79b6 · 2025-09-18T10:47:28.000+01:00
Get duplicate protection
diff --git a/.gitignore b/.gitignore
@@ -186,4 +186,6 @@ examples/hf_demo_space/chainlit.md
 
 examples/hf_demo_space/public/
 database.db-journal
+database.db-shm
 .chainlit/
+pgdata/
diff --git a/codetide/__init__.py b/codetide/__init__.py
@@ -100,7 +100,7 @@ def relative_filepaths(self)->List[str]:
     
     @property
     def cached_ids(self)->List[str]:
-        return self.codebase.unique_ids+self.relative_filepaths
+        return self.codebase.non_import_unique_ids+self.relative_filepaths
     
     @property
     def repo(self)->Optional[pygit2.Repository]:
diff --git a/codetide/agents/tide/agent.py b/codetide/agents/tide/agent.py
@@ -58,6 +58,7 @@ class AgentTide(BaseModel):
     _last_code_identifers :Optional[Set[str]]=set()
     _last_code_context :Optional[str] = None
     _has_patch :bool=False
+    _direct_mode :bool=False
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
@@ -138,92 +139,103 @@ async def agent_loop(self, codeIdentifiers :Optional[List[str]]=None):
             ...
         else:
             autocomplete = AutoComplete(self.tide.cached_ids)
-            matches = autocomplete.extract_words_from_text("\n\n".join(self.history))
-
-            # --- Begin Unified Identifier Retrieval ---
-            identifiers_accum = set(matches["all_found_words"]) if codeIdentifiers is None else set(codeIdentifiers + matches["all_found_words"])
-            modify_accum = set()
-            reasoning_accum = []
-            repo_tree = None
-            smart_search_attempts = 0
-            max_smart_search_attempts = 3
-            done = False
-            previous_reason = None
-
-            while not done:
-                expand_paths = ["./"]
-                # 1. SmartCodeSearch to filter repo tree
-                if repo_tree is None or smart_search_attempts > 0:
-                    repo_history = self.history
-                    if previous_reason:
-                        repo_history += [previous_reason]
+            if self._direct_mode:
+                self.contextIdentifiers = None
+                exact_matches = autocomplete.extract_words_from_text(self.history[-1], max_matches_per_word=1)["all_found_words"]
+                self.modifyIdentifiers = self.tide._as_file_paths(exact_matches)
+                codeIdentifiers = self.modifyIdentifiers
+                self._direct_mode = False
+
+            else:
+                matches = autocomplete.extract_words_from_text("\n\n".join(self.history), max_matches_per_word=1)
+
+                # --- Begin Unified Identifier Retrieval ---
+                identifiers_accum = set(matches["all_found_words"]) if codeIdentifiers is None else set(codeIdentifiers + matches["all_found_words"])
+                modify_accum = set()
+                reasoning_accum = []
+                repo_tree = None
+                smart_search_attempts = 0
+                max_smart_search_attempts = 3
+                done = False
+                previous_reason = None
+
+                while not done:
+                    expand_paths = ["./"]
+                    # 1. SmartCodeSearch to filter repo tree
+                    if repo_tree is None or smart_search_attempts > 0:
+                        repo_history = self.history
+                        if previous_reason:
+                            repo_history += [previous_reason]
+                        
+                        repo_tree = await self.get_repo_tree_from_user_prompt(self.history, include_modules=bool(smart_search_attempts), expand_paths=expand_paths)
+                    
+                    # 2. Single LLM call with unified prompt
+                    # Pass accumulated identifiers for context if this isn't the first iteration
+                    accumulated_context = "\n".join(
+                        sorted((identifiers_accum or set()) | (modify_accum or set()))
+                    ) if (identifiers_accum or modify_accum) else ""
+                    
+                    unified_response = await self.llm.acomplete(
+                        self.history,
+                        system_prompt=[GET_CODE_IDENTIFIERS_UNIFIED_PROMPT.format(
+                            DATE=TODAY, 
+                            SUPPORTED_LANGUAGES=SUPPORTED_LANGUAGES,
+                            IDENTIFIERS=accumulated_context
+                        )],
+                        prefix_prompt=repo_tree,
+                        stream=False
+                    )
+                    print(f"{unified_response=}")
+
+                    # Parse the unified response
+                    contextIdentifiers = parse_blocks(unified_response, block_word="Context Identifiers", multiple=False)
+                    modifyIdentifiers = parse_blocks(unified_response, block_word="Modify Identifiers", multiple=False)
+                    expandPaths = parse_blocks(unified_response, block_word="Expand Paths", multiple=False)                
+                    
+                    # Extract reasoning (everything before the first "*** Begin")
+                    reasoning_parts = unified_response.split("*** Begin")
+                    if reasoning_parts:
+                        reasoning_accum.append(reasoning_parts[0].strip())
+                        previous_reason = reasoning_accum[-1]
                     
-                    repo_tree = await self.get_repo_tree_from_user_prompt(self.history, include_modules=bool(smart_search_attempts), expand_paths=expand_paths)
-                
-                # 2. Single LLM call with unified prompt
-                # Pass accumulated identifiers for context if this isn't the first iteration
-                accumulated_context = "\n".join(
-                    sorted((identifiers_accum or set()) | (modify_accum or set()))
-                ) if (identifiers_accum or modify_accum) else ""
-                
-                unified_response = await self.llm.acomplete(
-                    self.history,
-                    system_prompt=[GET_CODE_IDENTIFIERS_UNIFIED_PROMPT.format(
-                        DATE=TODAY, 
-                        SUPPORTED_LANGUAGES=SUPPORTED_LANGUAGES,
-                        IDENTIFIERS=accumulated_context
-                    )],
-                    prefix_prompt=repo_tree,
-                    stream=False
-                )
-
-                # Parse the unified response
-                contextIdentifiers = parse_blocks(unified_response, block_word="Context Identifiers", multiple=False)
-                modifyIdentifiers = parse_blocks(unified_response, block_word="Modify Identifiers", multiple=False)
-                expandPaths = parse_blocks(unified_response, block_word="Expand Paths", multiple=False)                
-                
-                # Extract reasoning (everything before the first "*** Begin")
-                reasoning_parts = unified_response.split("*** Begin")
-                if reasoning_parts:
-                    reasoning_accum.append(reasoning_parts[0].strip())
-                    previous_reason = reasoning_accum[-1]
-                
-                # Accumulate identifiers
-                if contextIdentifiers:
-                    if smart_search_attempts == 0:
-                        ### clean wrongly mismtatched idenitifers
-                        identifiers_accum = set()
-                    for ident in contextIdentifiers.splitlines():
-                        if ident := self.get_valid_identifier(autocomplete, ident.strip()): 
-                            identifiers_accum.add(ident)
-                
-                if modifyIdentifiers:
-                    for ident in modifyIdentifiers.splitlines():
-                        if ident := self.get_valid_identifier(autocomplete, ident.strip()):
-                            modify_accum.add(ident.strip())
-                
-                if expandPaths:
-                    expand_paths = [
-                        path for ident in expandPaths if (path := self.get_valid_identifier(autocomplete, ident.strip()))
-                    ]
-                
-                # Check if we have enough identifiers (unified prompt includes this decision)
-                if "ENOUGH_IDENTIFIERS: TRUE" in unified_response.upper():
-                    done = True
-                else:
-                    smart_search_attempts += 1
-                    if smart_search_attempts >= max_smart_search_attempts:
+                    # Accumulate identifiers
+                    if contextIdentifiers:
+                        if smart_search_attempts == 0:
+                            ### clean wrongly mismatched identifiers
+                            identifiers_accum = set()
+                        for ident in contextIdentifiers.splitlines():
+                            if ident := self.get_valid_identifier(autocomplete, ident.strip()): 
+                                identifiers_accum.add(ident)
+                    
+                    if modifyIdentifiers:
+                        for ident in modifyIdentifiers.splitlines():
+                            if ident := self.get_valid_identifier(autocomplete, ident.strip()):
+                                modify_accum.add(ident.strip())
+                    
+                    if expandPaths:
+                        expand_paths = [
+                            path for ident in expandPaths if (path := self.get_valid_identifier(autocomplete, ident.strip()))
+                        ]
+                    
+                    # Check if we have enough identifiers (unified prompt includes this decision)
+                    if "ENOUGH_IDENTIFIERS: TRUE" in unified_response.upper():
                         done = True
-
-            # Finalize identifiers
-            self.reasoning = "\n\n".join(reasoning_accum)
-            self.contextIdentifiers = list(identifiers_accum) if identifiers_accum else None
-            self.modifyIdentifiers = list(modify_accum) if modify_accum else None
-
-            codeIdentifiers = self.contextIdentifiers or []
-            if self.modifyIdentifiers:
-                self.modifyIdentifiers = self.tide._as_file_paths(self.modifyIdentifiers)
-                codeIdentifiers.extend(self.modifyIdentifiers)
+                    else:
+                        smart_search_attempts += 1
+                        if smart_search_attempts >= max_smart_search_attempts:
+                            done = True
+
+                # Finalize identifiers
+                self.reasoning = "\n\n".join(reasoning_accum)
+                self.contextIdentifiers = list(identifiers_accum) if identifiers_accum else None
+                self.modifyIdentifiers = list(modify_accum) if modify_accum else None
+
+                codeIdentifiers = self.contextIdentifiers or []
+                if self.modifyIdentifiers:
+                    self.modifyIdentifiers = self.tide._as_file_paths(self.modifyIdentifiers)
+                    codeIdentifiers.extend(self.modifyIdentifiers)
+                # TODO: preserve the identifiers passed by the user
+                codeIdentifiers += matches["all_found_words"] 
 
             # --- End Unified Identifier Retrieval ---
             if codeIdentifiers:
@@ -232,7 +244,7 @@ async def agent_loop(self, codeIdentifiers :Optional[List[str]]=None):
 
             if not codeContext:
                 codeContext = REPO_TREE_CONTEXT_PROMPT.format(REPO_TREE=self.tide.codebase.get_tree_view())
-                readmeFile = self.tide.get("README.md", as_string_list=True)
+                readmeFile = self.tide.get(["README.md"] + matches["all_found_words"] , as_string_list=True)
                 if readmeFile:
                     codeContext = "\n".join([codeContext, README_CONTEXT_PROMPT.format(README=readmeFile)])
 
@@ -431,5 +443,7 @@ async def _handle_commands(self, command :str) -> str:
         context = ""
         if command == "commit":
             context = await self.prepare_commit()
+        elif command == "direct_mode":
+            self._direct_mode = True
 
         return context
diff --git a/codetide/agents/tide/prompts.py b/codetide/agents/tide/prompts.py
@@ -457,8 +457,12 @@
    - Code identifiers should use dot notation (e.g., `module.submodule.Class.method`) without file extensions
 
 2. **Identifier Categories:**
-   - **Context Identifiers:** Elements needed to understand or provide context for the request, but not directly modified
-   - **Modify Identifiers:** Elements that will likely require direct modification to fulfill the request
+   - **Context Identifiers:** Only include identifiers that correspond to functions, classes, methods, variables, or attributes defined in the codebase. Do **not** include package names, import statements, or dependencies based solely on import/package presence—even if they are present in the accumulated context.
+   - **Modify Identifiers:** Only include identifiers that correspond to functions, classes, methods, variables, or attributes that will likely require direct modification. Do **not** include package names, import statements, or dependencies based solely on import/package presence—even if they are present in the accumulated context.
+
+3. **ABSOLUTE PROHIBITION ON DEPENDENCY INCLUSION:**
+   - Never include identifiers in the Context Identifiers or Modify Identifiers sections that represent only package imports, external dependencies, or modules that are not actual code elements (functions, classes, methods, variables, or attributes) defined in the codebase.
+   - Even if a package or import name is present in the accumulated context, do not include it unless it refers to a concrete function, class, method, variable, or attribute in the codebase.
 
 **UNIFIED ANALYSIS PROTOCOL**
 
diff --git a/codetide/agents/tide/ui/agent_tide_ui.py b/codetide/agents/tide/ui/agent_tide_ui.py
@@ -43,7 +43,8 @@ def __init__(self, project_path: Path = Path("./"), history :Optional[list]=None
             "review": CMD_CODE_REVIEW_PROMPT,
             "test": CMD_WRITE_TESTS_PROMPT,
             "commit": CMD_COMMIT_PROMPT,
-            "brainstorm": CMD_BRAINSTORM_PROMPT
+            "brainstorm": CMD_BRAINSTORM_PROMPT,
+            "direct_mode": ""
         }
         self.session_id = session_id if session_id else ulid()
     
@@ -52,7 +53,8 @@ def __init__(self, project_path: Path = Path("./"), history :Optional[list]=None
         {"id": "test", "icon": "flask-conical", "description": "Test file(s) or object(s)"},
         {"id": "commit", "icon": "git-commit", "description": "Commit changed files"},
         {"id": "plan", "icon": "notepad-text-dashed", "description": "Create a step-by-step task plan"},
-        {"id": "brainstorm", "icon": "brain-circuit", "description": "Brainstorm and discuss solutions (no code generation)"}
+        {"id": "brainstorm", "icon": "brain-circuit", "description": "Brainstorm and discuss solutions (no code generation)"},
+        {"id": "direct_mode", "icon": "search-code", "description": "Skip repository analysis and jump straight into code generation with the specified context (identifiers or paths)"}
     ]
 
     async def load(self):
@@ -133,4 +135,5 @@ def settings(self):
     
     async def get_command_prompt(self, command :str)->Optional[str]:
         context = await self.agent_tide._handle_commands(command)
-        return f"{self.commands_prompts.get(command)} {context}" 
+        return f"{self.commands_prompts.get(command)} {context}".strip()
+
diff --git a/codetide/autocomplete.py b/codetide/autocomplete.py
@@ -170,15 +170,24 @@ def validate_paths(self, file_paths):
                     raise ValueError(f"Invalid file path: '{path}'")
         return valid_paths
     
-    def extract_words_from_text(self, text: str, similarity_threshold: float = 0.6, case_sensitive: bool = False) -> dict:
+    def extract_words_from_text(
+       self,
+        text: str,
+        similarity_threshold: float = 0.6,
+        case_sensitive: bool = False,
+        max_matches_per_word: int = None
+    ) -> dict:
         """
         Extract words from the word list that are present in the given text, including similar words (potential typos).
-        
+        Optionally limit the number of matches returned per word found in the text.
+
         Args:
             text (str): The input text to analyze
             similarity_threshold (float): Minimum similarity score for fuzzy matching (0.0 to 1.0)
             case_sensitive (bool): Whether matching should be case sensitive
-        
+            max_matches_per_word (int, optional): Maximum number of matches to return per word in the text.
+                If None, all matches are returned. If 1, only the top match per word is returned.
+
         Returns:
             dict: Dictionary containing:
                 - 'exact_matches': List of words found exactly in the text
@@ -191,64 +200,71 @@ def extract_words_from_text(self, text: str, similarity_threshold: float = 0.6,
                 'fuzzy_matches': [],
                 'all_found_words': []
             }
-        
+
         # Split text into words (remove punctuation and split by whitespace)
         text_words = re.findall(r'\b\w+\b', text)
-        
+
         exact_matches = []
         fuzzy_matches = []
         all_found_words = set()
-        
+
         # Convert to appropriate case for comparison
         if case_sensitive:
             text_words_search = text_words
             word_list_search = self.words
         else:
             text_words_search = [word.lower() for word in text_words]
             word_list_search = [word.lower() for word in self.words]
-        
+
         # Find exact matches
         for i, text_word in enumerate(text_words_search):
+            per_word_matches = 0
             for j, list_word in enumerate(word_list_search):
                 if text_word == list_word:
                     original_word = self.words[j]
                     if original_word not in all_found_words:
                         exact_matches.append(original_word)
                         all_found_words.add(original_word)
-        
+                        per_word_matches += 1
+                        if max_matches_per_word is not None and per_word_matches >= max_matches_per_word:
+                            break
+
         # Find fuzzy matches for words that didn't match exactly
         matched_text_words = set()
         for match in exact_matches:
             search_match = match if case_sensitive else match.lower()
             for i, text_word in enumerate(text_words_search):
                 if text_word == search_match:
-                    matched_text_words.add(i)
-        
+                   matched_text_words.add(i)
+
         # Check remaining text words for fuzzy matches
         for i, text_word in enumerate(text_words_search):
             if i in matched_text_words:
                 continue
-                
-            # Find the most similar word from our word list
+
+            # Find the most similar word(s) from our word list
             best_matches = []
             for j, list_word in enumerate(word_list_search):
                 similarity = difflib.SequenceMatcher(None, text_word, list_word).ratio()
                 if similarity >= similarity_threshold:
                     best_matches.append((self.words[j], text_words[i], similarity))
-            
-            # Sort by similarity and add to results
+
+            # Sort by similarity and add up to max_matches_per_word to results
             if best_matches:
                 best_matches.sort(key=lambda x: x[2], reverse=True)
-                for match in best_matches:
+                matches_to_add = best_matches
+                if max_matches_per_word is not None:
+                    matches_to_add = best_matches[:max_matches_per_word]
+                for match in matches_to_add:
                     word_from_list, word_in_text, score = match
                     if word_from_list not in all_found_words:
                         fuzzy_matches.append((word_from_list, word_in_text, score))
                         all_found_words.add(word_from_list)
-        
+
         # Sort results
         exact_matches.sort()
         fuzzy_matches.sort(key=lambda x: x[2], reverse=True)  # Sort by similarity score
-        
+
         return {
             'exact_matches': exact_matches,
             'fuzzy_matches': fuzzy_matches,
diff --git a/codetide/core/models.py b/codetide/core/models.py