# Source: CodeTide/codetide/autocomplete.py (BrunoV21/CodeTide @ dc4c0aa01bb7c682b12380e3654d5777cb0f8f2c)
import difflib
import os
import re
from typing import Any, Dict, List, Optional

class AutoComplete:
    """Prefix, substring and similarity lookups over a sorted list of strings.

    The instance keeps its own sorted copy of the word list in ``self.words``;
    every method reads that list on each call, so it holds no derived caches
    that could go stale if ``self.words`` is modified later.
    """

    # Lowest difflib similarity ratio at which a word is still offered as a
    # near match for an unknown identifier.
    _CLOSE_MATCH_CUTOFF = 0.3

    # A token of free text is a maximal run of ``\w`` characters, so
    # punctuation such as '.' or '/' always splits tokens.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b')

    def __init__(self, word_list: List[str]) -> None:
        """Initialize with a list of strings to search from.

        Args:
            word_list: Candidate strings. The list itself is not modified.
        """
        # sorted() builds a new list, so the caller's list keeps its order.
        self.words = sorted(word_list)

    @staticmethod
    def _fold(text: str, case_sensitive: bool) -> str:
        """Return ``text`` in the form used for comparisons in the given case mode."""
        return text if case_sensitive else text.lower()

    def _collect(self, query: str, max_suggestions: int, case_sensitive: bool, is_match) -> List[str]:
        """Return up to ``max_suggestions`` words for which ``is_match(word, query)`` holds.

        Both arguments handed to ``is_match`` are already case-folded when
        ``case_sensitive`` is False. Words are visited in ``self.words`` order
        and the scan stops as soon as the limit is reached.
        """
        # An empty query or a non-positive limit can never produce suggestions.
        if not query or max_suggestions <= 0:
            return []

        needle = self._fold(query, case_sensitive)
        hits: List[str] = []
        for word in self.words:
            if is_match(self._fold(word, case_sensitive), needle):
                hits.append(word)
                if len(hits) >= max_suggestions:
                    break
        return hits

    def get_suggestions(self, prefix: str, max_suggestions: int = 10, case_sensitive: bool = False) -> List[str]:
        """
        Get autocomplete suggestions based on prefix

        Args:
            prefix (str): The text every suggestion must start with
            max_suggestions (int): Maximum number of suggestions to return;
                zero or a negative value yields an empty list
            case_sensitive (bool): Whether matching should be case sensitive

        Returns:
            list: Matching words in word-list (sorted) order; empty if
                prefix is empty
        """
        return self._collect(prefix, max_suggestions, case_sensitive, str.startswith)

    def get_fuzzy_suggestions(self, prefix: str, max_suggestions: int = 10, case_sensitive: bool = False) -> List[str]:
        """
        Get suggestions that contain the prefix anywhere in the string

        Args:
            prefix (str): The text to search for as a substring
            max_suggestions (int): Maximum number of suggestions to return;
                zero or a negative value yields an empty list
            case_sensitive (bool): Whether matching should be case sensitive

        Returns:
            list: Matching words in word-list (sorted) order; empty if
                prefix is empty
        """
        return self._collect(
            prefix,
            max_suggestions,
            case_sensitive,
            lambda haystack, needle: needle in haystack,
        )

    def validate_code_identifier(
        self,
        code_identifier: str,
        max_suggestions: int = 5,
        case_sensitive: bool = False
    ) -> Dict[str, Any]:
        """
        Validate a code identifier and return similar matches if not found

        Args:
            code_identifier (str): The code identifier to validate
            max_suggestions (int): Maximum number of similar suggestions to
                return; zero or a negative value yields no suggestions
            case_sensitive (bool): Whether matching should be case sensitive.
                Applies to the exact lookup and to the similarity ranking.

        Returns:
            dict: Dictionary with the keys
                - 'code_identifier': the identifier as passed in
                - 'is_valid': True when the identifier is in the word list
                - 'matching_identifiers': similar words, most similar first;
                  always empty when the identifier is valid or empty
        """
        result: Dict[str, Any] = {
            "code_identifier": code_identifier,
            "is_valid": False,
            "matching_identifiers": []
        }
        if not code_identifier:
            return result

        target = self._fold(code_identifier, case_sensitive)

        # Group words by comparison key so similarity is computed in the
        # requested case mode while the original spellings are returned.
        words_by_key: Dict[str, List[str]] = {}
        for word in self.words:
            words_by_key.setdefault(self._fold(word, case_sensitive), []).append(word)

        # Check for perfect match
        if target in words_by_key:
            result["is_valid"] = True
            return result

        # difflib.get_close_matches rejects n <= 0, and no suggestions were
        # asked for anyway.
        if max_suggestions <= 0:
            return result

        close_keys = difflib.get_close_matches(
            target,
            list(words_by_key),
            n=max_suggestions,
            cutoff=self._CLOSE_MATCH_CUTOFF
        )
        # One key can stand for several spellings; expand, then re-apply the limit.
        candidates = [word for key in close_keys for word in words_by_key[key]][:max_suggestions]

        # If we don't have enough close matches, supplement with substring matches
        if len(candidates) < max_suggestions:
            already_listed = set(candidates)
            substring_hits = self.get_fuzzy_suggestions(
                code_identifier,
                max_suggestions * 2,  # Get more to filter
                case_sensitive
            )
            for word in substring_hits:
                if len(candidates) >= max_suggestions:
                    break
                if word not in already_listed:
                    candidates.append(word)
                    already_listed.add(word)

        # Most similar first; the sort is stable, so ties keep their order.
        candidates.sort(
            key=lambda word: difflib.SequenceMatcher(
                None, target, self._fold(word, case_sensitive)
            ).ratio(),
            reverse=True
        )

        result["matching_identifiers"] = candidates
        return result

    @staticmethod
    def _path_candidates(path: str) -> List[str]:
        """Return alternative spellings of ``path`` to look up, in priority order."""
        stripped = path.strip()
        candidates: List[str] = []
        # os.sep first; '/' as well so POSIX-style entries still resolve when
        # os.sep is a backslash. Duplicates are harmless for a lookup.
        for sep in (os.sep, "/"):
            unified = stripped.replace("\\", sep).replace("/", sep)
            # Dotted module notation ("pkg.mod") first, then the same path
            # with its dots (e.g. the file extension) left alone.
            candidates.append(unified.replace(".", sep))
            candidates.append(unified)
        # The merely whitespace-stripped path ranks right after the dotted form.
        candidates.insert(1, stripped)
        return candidates

    def validate_paths(self, file_paths: List[str]) -> List[str]:
        """
        Validate a list of file paths. For each path, check if it is valid; if not, try to match it to a valid one using autocomplete logic.

        Resolution order per path: exact entry, normalized spellings
        (whitespace stripped, separators unified, dots read as separators),
        then the first word-list entry containing the path as a
        case-insensitive substring.

        Args:
            file_paths (list of str): List of file paths to validate.
        Returns:
            list of str: List of valid file paths (matched or original), one per input path, in input order.
        Raises:
            ValueError: If a path cannot be matched to a valid entry.
        """
        valid_set = set(self.words)
        valid_paths: List[str] = []
        for path in file_paths:
            # Direct match
            if path in valid_set:
                valid_paths.append(path)
                continue

            # Normalized spellings
            normalized = next(
                (candidate for candidate in self._path_candidates(path) if candidate in valid_set),
                None
            )
            if normalized is not None:
                valid_paths.append(normalized)
                continue

            # Last resort: substring lookup, keeping the first hit.
            suggestions = self.get_fuzzy_suggestions(path.strip(), 1)
            if not suggestions:
                raise ValueError(f"Invalid file path: '{path}'")
            valid_paths.append(suggestions[0])
        return valid_paths

    def extract_words_from_text(
        self,
        text: str,
        similarity_threshold: float = 0.6,
        case_sensitive: bool = False,
        max_matches_per_word: Optional[int] = None
    ) -> dict:
        """
        Extract words from the word list that are present in the given text, including similar words (potential typos).
        Optionally limit the number of matches returned per word found in the text.

        The text is split into runs of word characters, so a word-list entry
        containing punctuation (for example '.' or '/') cannot match exactly
        and is only reachable through the similarity comparison.

        Args:
            text (str): The input text to analyze
            similarity_threshold (float): Minimum similarity score for fuzzy matching (0.0 to 1.0)
            case_sensitive (bool): Whether matching should be case sensitive
            max_matches_per_word (int, optional): Maximum number of matches to return per word in the text.
                If None, all matches are returned. If 1, only the top match per word is returned.

        Returns:
            dict: Dictionary containing:
                - 'exact_matches': Sorted list of words found exactly in the text
                - 'fuzzy_matches': List of tuples (word_from_list, similar_word_in_text, similarity_score),
                  highest score first
                - 'all_found_words': Sorted, combined list of all matched words from the word list
        """
        if not text:
            return {
                'exact_matches': [],
                'fuzzy_matches': [],
                'all_found_words': []
            }

        tokens = self._TOKEN_PATTERN.findall(text)
        token_keys = [self._fold(token, case_sensitive) for token in tokens]
        word_keys = [self._fold(word, case_sensitive) for word in self.words]

        # comparison key -> words sharing it, in word-list order; replaces a
        # full scan of the word list for every token.
        words_by_key: Dict[str, List[str]] = {}
        for word, word_key in zip(self.words, word_keys):
            words_by_key.setdefault(word_key, []).append(word)

        exact_matches: List[str] = []
        fuzzy_matches = []
        all_found_words = set()

        # Exact matches
        for token_key in token_keys:
            added_for_token = 0
            for word in words_by_key.get(token_key, ()):
                if word in all_found_words:
                    continue
                exact_matches.append(word)
                all_found_words.add(word)
                added_for_token += 1
                if max_matches_per_word is not None and added_for_token >= max_matches_per_word:
                    break

        # Fuzzy matches for tokens with no exact counterpart. A repeated
        # token can only re-propose words its first occurrence already
        # recorded, so each distinct token is scored once.
        scored_keys = set()
        for token, token_key in zip(tokens, token_keys):
            if token_key in words_by_key or token_key in scored_keys:
                continue
            scored_keys.add(token_key)

            best_matches = []
            for word, word_key in zip(self.words, word_keys):
                matcher = difflib.SequenceMatcher(None, token_key, word_key)
                # real_quick_ratio() and quick_ratio() are cheap upper bounds
                # on ratio(); failing either rules the pair out early.
                if (matcher.real_quick_ratio() < similarity_threshold
                        or matcher.quick_ratio() < similarity_threshold):
                    continue
                similarity = matcher.ratio()
                if similarity >= similarity_threshold:
                    best_matches.append((word, token, similarity))

            # Best first, then keep at most max_matches_per_word of them.
            best_matches.sort(key=lambda item: item[2], reverse=True)
            if max_matches_per_word is not None:
                best_matches = best_matches[:max_matches_per_word]

            for word_from_list, word_in_text, score in best_matches:
                if word_from_list not in all_found_words:
                    fuzzy_matches.append((word_from_list, word_in_text, score))
                    all_found_words.add(word_from_list)

        # Sort results
        exact_matches.sort()
        fuzzy_matches.sort(key=lambda item: item[2], reverse=True)  # Sort by similarity score

        return {
            'exact_matches': exact_matches,
            'fuzzy_matches': fuzzy_matches,
            'all_found_words': sorted(all_found_words)
        }