Skip to content

Commit ef020a2

Browse files
authored
Merge pull request #37 from BrunoV21/get-duplicate-protection
Get duplicate protection
2 parents 0029d33 + dc4c0aa commit ef020a2

File tree

7 files changed

+155
-108
lines changed

7 files changed

+155
-108
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,4 +186,6 @@ examples/hf_demo_space/chainlit.md
186186

187187
examples/hf_demo_space/public/
188188
database.db-journal
189+
database.db-shm
189190
.chainlit/
191+
pgdata/

codetide/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def relative_filepaths(self)->List[str]:
100100

101101
@property
102102
def cached_ids(self)->List[str]:
103-
return self.codebase.unique_ids+self.relative_filepaths
103+
return self.codebase.non_import_unique_ids+self.relative_filepaths
104104

105105
@property
106106
def repo(self)->Optional[pygit2.Repository]:

codetide/agents/tide/agent.py

Lines changed: 99 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ class AgentTide(BaseModel):
5858
_last_code_identifers :Optional[Set[str]]=set()
5959
_last_code_context :Optional[str] = None
6060
_has_patch :bool=False
61+
_direct_mode :bool=False
6162

6263
model_config = ConfigDict(arbitrary_types_allowed=True)
6364

@@ -138,92 +139,103 @@ async def agent_loop(self, codeIdentifiers :Optional[List[str]]=None):
138139
...
139140
else:
140141
autocomplete = AutoComplete(self.tide.cached_ids)
141-
matches = autocomplete.extract_words_from_text("\n\n".join(self.history))
142-
143-
# --- Begin Unified Identifier Retrieval ---
144-
identifiers_accum = set(matches["all_found_words"]) if codeIdentifiers is None else set(codeIdentifiers + matches["all_found_words"])
145-
modify_accum = set()
146-
reasoning_accum = []
147-
repo_tree = None
148-
smart_search_attempts = 0
149-
max_smart_search_attempts = 3
150-
done = False
151-
previous_reason = None
152-
153-
while not done:
154-
expand_paths = ["./"]
155-
# 1. SmartCodeSearch to filter repo tree
156-
if repo_tree is None or smart_search_attempts > 0:
157-
repo_history = self.history
158-
if previous_reason:
159-
repo_history += [previous_reason]
142+
if self._direct_mode:
143+
self.contextIdentifiers = None
144+
exact_matches = autocomplete.extract_words_from_text(self.history[-1], max_matches_per_word=1)["all_found_words"]
145+
self.modifyIdentifiers = self.tide._as_file_paths(exact_matches)
146+
codeIdentifiers = self.modifyIdentifiers
147+
self._direct_mode = False
148+
149+
else:
150+
matches = autocomplete.extract_words_from_text("\n\n".join(self.history), max_matches_per_word=1)
151+
152+
# --- Begin Unified Identifier Retrieval ---
153+
identifiers_accum = set(matches["all_found_words"]) if codeIdentifiers is None else set(codeIdentifiers + matches["all_found_words"])
154+
modify_accum = set()
155+
reasoning_accum = []
156+
repo_tree = None
157+
smart_search_attempts = 0
158+
max_smart_search_attempts = 3
159+
done = False
160+
previous_reason = None
161+
162+
while not done:
163+
expand_paths = ["./"]
164+
# 1. SmartCodeSearch to filter repo tree
165+
if repo_tree is None or smart_search_attempts > 0:
166+
repo_history = self.history
167+
if previous_reason:
168+
repo_history += [previous_reason]
169+
170+
repo_tree = await self.get_repo_tree_from_user_prompt(self.history, include_modules=bool(smart_search_attempts), expand_paths=expand_paths)
171+
172+
# 2. Single LLM call with unified prompt
173+
# Pass accumulated identifiers for context if this isn't the first iteration
174+
accumulated_context = "\n".join(
175+
sorted((identifiers_accum or set()) | (modify_accum or set()))
176+
) if (identifiers_accum or modify_accum) else ""
177+
178+
unified_response = await self.llm.acomplete(
179+
self.history,
180+
system_prompt=[GET_CODE_IDENTIFIERS_UNIFIED_PROMPT.format(
181+
DATE=TODAY,
182+
SUPPORTED_LANGUAGES=SUPPORTED_LANGUAGES,
183+
IDENTIFIERS=accumulated_context
184+
)],
185+
prefix_prompt=repo_tree,
186+
stream=False
187+
)
188+
print(f"{unified_response=}")
189+
190+
# Parse the unified response
191+
contextIdentifiers = parse_blocks(unified_response, block_word="Context Identifiers", multiple=False)
192+
modifyIdentifiers = parse_blocks(unified_response, block_word="Modify Identifiers", multiple=False)
193+
expandPaths = parse_blocks(unified_response, block_word="Expand Paths", multiple=False)
194+
195+
# Extract reasoning (everything before the first "*** Begin")
196+
reasoning_parts = unified_response.split("*** Begin")
197+
if reasoning_parts:
198+
reasoning_accum.append(reasoning_parts[0].strip())
199+
previous_reason = reasoning_accum[-1]
160200

161-
repo_tree = await self.get_repo_tree_from_user_prompt(self.history, include_modules=bool(smart_search_attempts), expand_paths=expand_paths)
162-
163-
# 2. Single LLM call with unified prompt
164-
# Pass accumulated identifiers for context if this isn't the first iteration
165-
accumulated_context = "\n".join(
166-
sorted((identifiers_accum or set()) | (modify_accum or set()))
167-
) if (identifiers_accum or modify_accum) else ""
168-
169-
unified_response = await self.llm.acomplete(
170-
self.history,
171-
system_prompt=[GET_CODE_IDENTIFIERS_UNIFIED_PROMPT.format(
172-
DATE=TODAY,
173-
SUPPORTED_LANGUAGES=SUPPORTED_LANGUAGES,
174-
IDENTIFIERS=accumulated_context
175-
)],
176-
prefix_prompt=repo_tree,
177-
stream=False
178-
)
179-
180-
# Parse the unified response
181-
contextIdentifiers = parse_blocks(unified_response, block_word="Context Identifiers", multiple=False)
182-
modifyIdentifiers = parse_blocks(unified_response, block_word="Modify Identifiers", multiple=False)
183-
expandPaths = parse_blocks(unified_response, block_word="Expand Paths", multiple=False)
184-
185-
# Extract reasoning (everything before the first "*** Begin")
186-
reasoning_parts = unified_response.split("*** Begin")
187-
if reasoning_parts:
188-
reasoning_accum.append(reasoning_parts[0].strip())
189-
previous_reason = reasoning_accum[-1]
190-
191-
# Accumulate identifiers
192-
if contextIdentifiers:
193-
if smart_search_attempts == 0:
194-
### clean wrongly mismtatched idenitifers
195-
identifiers_accum = set()
196-
for ident in contextIdentifiers.splitlines():
197-
if ident := self.get_valid_identifier(autocomplete, ident.strip()):
198-
identifiers_accum.add(ident)
199-
200-
if modifyIdentifiers:
201-
for ident in modifyIdentifiers.splitlines():
202-
if ident := self.get_valid_identifier(autocomplete, ident.strip()):
203-
modify_accum.add(ident.strip())
204-
205-
if expandPaths:
206-
expand_paths = [
207-
path for ident in expandPaths if (path := self.get_valid_identifier(autocomplete, ident.strip()))
208-
]
209-
210-
# Check if we have enough identifiers (unified prompt includes this decision)
211-
if "ENOUGH_IDENTIFIERS: TRUE" in unified_response.upper():
212-
done = True
213-
else:
214-
smart_search_attempts += 1
215-
if smart_search_attempts >= max_smart_search_attempts:
201+
# Accumulate identifiers
202+
if contextIdentifiers:
203+
if smart_search_attempts == 0:
204+
### clean wrongly mismatched identifiers
205+
identifiers_accum = set()
206+
for ident in contextIdentifiers.splitlines():
207+
if ident := self.get_valid_identifier(autocomplete, ident.strip()):
208+
identifiers_accum.add(ident)
209+
210+
if modifyIdentifiers:
211+
for ident in modifyIdentifiers.splitlines():
212+
if ident := self.get_valid_identifier(autocomplete, ident.strip()):
213+
modify_accum.add(ident.strip())
214+
215+
if expandPaths:
216+
expand_paths = [
217+
path for ident in expandPaths if (path := self.get_valid_identifier(autocomplete, ident.strip()))
218+
]
219+
220+
# Check if we have enough identifiers (unified prompt includes this decision)
221+
if "ENOUGH_IDENTIFIERS: TRUE" in unified_response.upper():
216222
done = True
217-
218-
# Finalize identifiers
219-
self.reasoning = "\n\n".join(reasoning_accum)
220-
self.contextIdentifiers = list(identifiers_accum) if identifiers_accum else None
221-
self.modifyIdentifiers = list(modify_accum) if modify_accum else None
222-
223-
codeIdentifiers = self.contextIdentifiers or []
224-
if self.modifyIdentifiers:
225-
self.modifyIdentifiers = self.tide._as_file_paths(self.modifyIdentifiers)
226-
codeIdentifiers.extend(self.modifyIdentifiers)
223+
else:
224+
smart_search_attempts += 1
225+
if smart_search_attempts >= max_smart_search_attempts:
226+
done = True
227+
228+
# Finalize identifiers
229+
self.reasoning = "\n\n".join(reasoning_accum)
230+
self.contextIdentifiers = list(identifiers_accum) if identifiers_accum else None
231+
self.modifyIdentifiers = list(modify_accum) if modify_accum else None
232+
233+
codeIdentifiers = self.contextIdentifiers or []
234+
if self.modifyIdentifiers:
235+
self.modifyIdentifiers = self.tide._as_file_paths(self.modifyIdentifiers)
236+
codeIdentifiers.extend(self.modifyIdentifiers)
237+
# TODO: preserve identifiers passed by the user
238+
codeIdentifiers += matches["all_found_words"]
227239

228240
# --- End Unified Identifier Retrieval ---
229241
if codeIdentifiers:
@@ -232,7 +244,7 @@ async def agent_loop(self, codeIdentifiers :Optional[List[str]]=None):
232244

233245
if not codeContext:
234246
codeContext = REPO_TREE_CONTEXT_PROMPT.format(REPO_TREE=self.tide.codebase.get_tree_view())
235-
readmeFile = self.tide.get("README.md", as_string_list=True)
247+
readmeFile = self.tide.get(["README.md"] + matches["all_found_words"] , as_string_list=True)
236248
if readmeFile:
237249
codeContext = "\n".join([codeContext, README_CONTEXT_PROMPT.format(README=readmeFile)])
238250

@@ -431,5 +443,7 @@ async def _handle_commands(self, command :str) -> str:
431443
context = ""
432444
if command == "commit":
433445
context = await self.prepare_commit()
446+
elif command == "direct_mode":
447+
self._direct_mode = True
434448

435449
return context

codetide/agents/tide/prompts.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -457,8 +457,12 @@
457457
- Code identifiers should use dot notation (e.g., `module.submodule.Class.method`) without file extensions
458458
459459
2. **Identifier Categories:**
460-
- **Context Identifiers:** Elements needed to understand or provide context for the request, but not directly modified
461-
- **Modify Identifiers:** Elements that will likely require direct modification to fulfill the request
460+
- **Context Identifiers:** Only include identifiers that correspond to functions, classes, methods, variables, or attributes defined in the codebase. Do **not** include package names, import statements, or dependencies based solely on import/package presence—even if they are present in the accumulated context.
461+
- **Modify Identifiers:** Only include identifiers that correspond to functions, classes, methods, variables, or attributes that will likely require direct modification. Do **not** include package names, import statements, or dependencies based solely on import/package presence—even if they are present in the accumulated context.
462+
463+
3. **ABSOLUTE PROHIBITION ON DEPENDENCY INCLUSION:**
464+
- Never include identifiers in the Context Identifiers or Modify Identifiers sections that represent only package imports, external dependencies, or modules that are not actual code elements (functions, classes, methods, variables, or attributes) defined in the codebase.
465+
- Even if a package or import name is present in the accumulated context, do not include it unless it refers to a concrete function, class, method, variable, or attribute in the codebase.
462466
463467
**UNIFIED ANALYSIS PROTOCOL**
464468

codetide/agents/tide/ui/agent_tide_ui.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ def __init__(self, project_path: Path = Path("./"), history :Optional[list]=None
4343
"review": CMD_CODE_REVIEW_PROMPT,
4444
"test": CMD_WRITE_TESTS_PROMPT,
4545
"commit": CMD_COMMIT_PROMPT,
46-
"brainstorm": CMD_BRAINSTORM_PROMPT
46+
"brainstorm": CMD_BRAINSTORM_PROMPT,
47+
"direct_mode": ""
4748
}
4849
self.session_id = session_id if session_id else ulid()
4950

@@ -52,7 +53,8 @@ def __init__(self, project_path: Path = Path("./"), history :Optional[list]=None
5253
{"id": "test", "icon": "flask-conical", "description": "Test file(s) or object(s)"},
5354
{"id": "commit", "icon": "git-commit", "description": "Commit changed files"},
5455
{"id": "plan", "icon": "notepad-text-dashed", "description": "Create a step-by-step task plan"},
55-
{"id": "brainstorm", "icon": "brain-circuit", "description": "Brainstorm and discuss solutions (no code generation)"}
56+
{"id": "brainstorm", "icon": "brain-circuit", "description": "Brainstorm and discuss solutions (no code generation)"},
57+
{"id": "direct_mode", "icon": "search-code", "description": "Skip repository analysis and jump straight into code generation with the specified context (identifiers or paths)"}
5658
]
5759

5860
async def load(self):
@@ -133,4 +135,5 @@ def settings(self):
133135

134136
async def get_command_prompt(self, command :str)->Optional[str]:
135137
context = await self.agent_tide._handle_commands(command)
136-
return f"{self.commands_prompts.get(command)} {context}"
138+
return f"{self.commands_prompts.get(command)} {context}".strip()
139+

codetide/autocomplete.py

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -170,15 +170,24 @@ def validate_paths(self, file_paths):
170170
raise ValueError(f"Invalid file path: '{path}'")
171171
return valid_paths
172172

173-
def extract_words_from_text(self, text: str, similarity_threshold: float = 0.6, case_sensitive: bool = False) -> dict:
173+
def extract_words_from_text(
174+
self,
175+
text: str,
176+
similarity_threshold: float = 0.6,
177+
case_sensitive: bool = False,
178+
max_matches_per_word: int = None
179+
) -> dict:
174180
"""
175181
Extract words from the word list that are present in the given text, including similar words (potential typos).
176-
182+
Optionally limit the number of matches returned per word found in the text.
183+
177184
Args:
178185
text (str): The input text to analyze
179186
similarity_threshold (float): Minimum similarity score for fuzzy matching (0.0 to 1.0)
180187
case_sensitive (bool): Whether matching should be case sensitive
181-
188+
max_matches_per_word (int, optional): Maximum number of matches to return per word in the text.
189+
If None, all matches are returned. If 1, only the top match per word is returned.
190+
182191
Returns:
183192
dict: Dictionary containing:
184193
- 'exact_matches': List of words found exactly in the text
@@ -191,64 +200,71 @@ def extract_words_from_text(self, text: str, similarity_threshold: float = 0.6,
191200
'fuzzy_matches': [],
192201
'all_found_words': []
193202
}
194-
203+
195204
# Split text into words (remove punctuation and split by whitespace)
196205
text_words = re.findall(r'\b\w+\b', text)
197-
206+
198207
exact_matches = []
199208
fuzzy_matches = []
200209
all_found_words = set()
201-
210+
202211
# Convert to appropriate case for comparison
203212
if case_sensitive:
204213
text_words_search = text_words
205214
word_list_search = self.words
206215
else:
207216
text_words_search = [word.lower() for word in text_words]
208217
word_list_search = [word.lower() for word in self.words]
209-
218+
210219
# Find exact matches
211220
for i, text_word in enumerate(text_words_search):
221+
per_word_matches = 0
212222
for j, list_word in enumerate(word_list_search):
213223
if text_word == list_word:
214224
original_word = self.words[j]
215225
if original_word not in all_found_words:
216226
exact_matches.append(original_word)
217227
all_found_words.add(original_word)
218-
228+
per_word_matches += 1
229+
if max_matches_per_word is not None and per_word_matches >= max_matches_per_word:
230+
break
231+
219232
# Find fuzzy matches for words that didn't match exactly
220233
matched_text_words = set()
221234
for match in exact_matches:
222235
search_match = match if case_sensitive else match.lower()
223236
for i, text_word in enumerate(text_words_search):
224237
if text_word == search_match:
225-
matched_text_words.add(i)
226-
238+
matched_text_words.add(i)
239+
227240
# Check remaining text words for fuzzy matches
228241
for i, text_word in enumerate(text_words_search):
229242
if i in matched_text_words:
230243
continue
231-
232-
# Find the most similar word from our word list
244+
245+
# Find the most similar word(s) from our word list
233246
best_matches = []
234247
for j, list_word in enumerate(word_list_search):
235248
similarity = difflib.SequenceMatcher(None, text_word, list_word).ratio()
236249
if similarity >= similarity_threshold:
237250
best_matches.append((self.words[j], text_words[i], similarity))
238-
239-
# Sort by similarity and add to results
251+
252+
# Sort by similarity and add up to max_matches_per_word to results
240253
if best_matches:
241254
best_matches.sort(key=lambda x: x[2], reverse=True)
242-
for match in best_matches:
255+
matches_to_add = best_matches
256+
if max_matches_per_word is not None:
257+
matches_to_add = best_matches[:max_matches_per_word]
258+
for match in matches_to_add:
243259
word_from_list, word_in_text, score = match
244260
if word_from_list not in all_found_words:
245261
fuzzy_matches.append((word_from_list, word_in_text, score))
246262
all_found_words.add(word_from_list)
247-
263+
248264
# Sort results
249265
exact_matches.sort()
250266
fuzzy_matches.sort(key=lambda x: x[2], reverse=True) # Sort by similarity score
251-
267+
252268
return {
253269
'exact_matches': exact_matches,
254270
'fuzzy_matches': fuzzy_matches,

0 commit comments

Comments
 (0)