Skip to content

Commit 314288e

Browse files
committed
fix: resolve all SonarCloud issues — ReDoS, complexity, regex style
1 parent 509a725 commit 314288e

7 files changed

Lines changed: 198 additions & 142 deletions

File tree

src/treemapper/diffctx/__init__.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,30 @@ def _truncate_generated_fragments(file_frags: list[Fragment]) -> list[Fragment]:
214214
return truncated
215215

216216

217+
def _dedup_fragments(raw_frags: list[Fragment]) -> list[Fragment]:
218+
seen: set[FragmentId] = set()
219+
result: list[Fragment] = []
220+
for f in raw_frags:
221+
if f.id not in seen:
222+
result.append(f)
223+
seen.add(f.id)
224+
return result
225+
226+
227+
def _cap_fragments(file_frags: list[Fragment], cap: int, file_path: Path, is_generated: bool) -> list[Fragment]:
228+
if len(file_frags) <= cap:
229+
return file_frags
230+
file_frags.sort(key=lambda f: f.line_count, reverse=True)
231+
file_frags = file_frags[:cap]
232+
logger.debug(
233+
"diffctx: capped %s to %d fragments%s",
234+
file_path.name,
235+
cap,
236+
" (generated)" if is_generated else "",
237+
)
238+
return file_frags
239+
240+
217241
def _process_files_for_fragments(
218242
files: list[Path],
219243
root_dir: Path,
@@ -227,25 +251,11 @@ def _process_files_for_fragments(
227251
if content is None:
228252
continue
229253
raw_frags = [f for f in fragment_file(file_path, content) if f.id not in seen_frag_ids]
230-
local_seen: set[FragmentId] = set()
231-
file_frags = []
232-
for f in raw_frags:
233-
if f.id not in local_seen:
234-
file_frags.append(f)
235-
local_seen.add(f.id)
254+
file_frags = _dedup_fragments(raw_frags)
236255

237256
is_generated = _is_generated_file(file_path, content)
238257
cap = _MAX_GENERATED_FRAGMENTS if is_generated else max_frags
239-
240-
if len(file_frags) > cap:
241-
file_frags.sort(key=lambda f: f.line_count, reverse=True)
242-
file_frags = file_frags[:cap]
243-
logger.debug(
244-
"diffctx: capped %s to %d fragments%s",
245-
file_path.name,
246-
cap,
247-
" (generated)" if is_generated else "",
248-
)
258+
file_frags = _cap_fragments(file_frags, cap, file_path, is_generated)
249259

250260
if is_generated:
251261
file_frags = _truncate_generated_fragments(file_frags)

src/treemapper/diffctx/edges/semantic/go.py

Lines changed: 76 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,31 @@ def _get_package_name_from_content(content: str, path: Path) -> str:
113113
return path.parent.name
114114

115115

116+
def _resolve_bases(pattern_str: str, parent: Path, repo_root: Path | None) -> list[Path]:
117+
base_pattern = pattern_str.split("*")[0].rstrip("/")
118+
candidate_bases = [parent / base_pattern]
119+
if repo_root:
120+
candidate_bases.append(repo_root / base_pattern)
121+
dirs: list[Path] = []
122+
for base in candidate_bases:
123+
try:
124+
dirs.append(base.resolve())
125+
except (OSError, ValueError):
126+
pass
127+
return dirs
128+
129+
130+
def _any_dir_matches(dirs_to_check: set[Path], embed_dirs: list[Path]) -> bool:
131+
for d in dirs_to_check:
132+
try:
133+
resolved = d.resolve()
134+
if any(resolved == ed or resolved.is_relative_to(ed) for ed in embed_dirs):
135+
return True
136+
except (ValueError, OSError):
137+
continue
138+
return False
139+
140+
116141
class GoEdgeBuilder(EdgeBuilder):
117142
weight = 0.75
118143
import_weight = EDGE_WEIGHTS["go_import"].forward
@@ -127,55 +152,55 @@ def discover_related_files(
127152
all_candidate_files: list[Path],
128153
repo_root: Path | None = None,
129154
) -> list[Path]:
130-
go_changed = [f for f in changed_files if _is_go_file(f)]
131155
changed_set = set(changed_files)
132156
discovered: set[Path] = set()
157+
candidates = [c for c in all_candidate_files if c not in changed_set and _is_go_file(c)]
133158

159+
go_changed = [f for f in changed_files if _is_go_file(f)]
134160
if go_changed:
135-
changed_pkg_dirs = {f.parent for f in go_changed}
136-
for candidate in all_candidate_files:
137-
if candidate not in changed_set and _is_go_file(candidate) and candidate.parent in changed_pkg_dirs:
138-
discovered.add(candidate)
161+
self._discover_same_package(go_changed, candidates, discovered)
162+
163+
embed_go_files = self._discover_embed_files(changed_files, candidates, discovered, repo_root)
164+
self._discover_package_peers(embed_go_files, candidates, discovered)
165+
166+
return list(discovered)
167+
168+
def _discover_same_package(self, go_changed: list[Path], candidates: list[Path], discovered: set[Path]) -> None:
169+
pkg_dirs = {f.parent for f in go_changed}
170+
for c in candidates:
171+
if c.parent in pkg_dirs:
172+
discovered.add(c)
139173

174+
def _discover_embed_files(
175+
self,
176+
changed_files: list[Path],
177+
candidates: list[Path],
178+
discovered: set[Path],
179+
repo_root: Path | None,
180+
) -> set[Path]:
140181
changed_dirs = {f.parent for f in changed_files}
141182
embed_go_files: set[Path] = set()
142-
for candidate in all_candidate_files:
143-
if candidate not in changed_set and _is_go_file(candidate):
144-
if self._embeds_any_changed_dir(candidate, changed_dirs, repo_root):
145-
discovered.add(candidate)
146-
embed_go_files.add(candidate)
183+
for c in candidates:
184+
if self._embeds_any_changed_dir(c, changed_dirs, repo_root):
185+
discovered.add(c)
186+
embed_go_files.add(c)
187+
return embed_go_files
147188

189+
def _discover_package_peers(self, embed_go_files: set[Path], candidates: list[Path], discovered: set[Path]) -> None:
148190
embed_dirs = {f.parent for f in embed_go_files}
149-
for candidate in all_candidate_files:
150-
if candidate not in changed_set and _is_go_file(candidate) and candidate not in discovered:
151-
if candidate.parent in embed_dirs:
152-
discovered.add(candidate)
153-
154-
return list(discovered)
191+
for c in candidates:
192+
if c not in discovered and c.parent in embed_dirs:
193+
discovered.add(c)
155194

156195
def _embeds_any_changed_dir(self, go_file: Path, changed_dirs: set[Path], repo_root: Path | None = None) -> bool:
157196
try:
158197
content = go_file.read_text(encoding="utf-8")
159198
except (OSError, UnicodeDecodeError):
160199
return False
161200
for match in _GO_EMBED_RE.finditer(content):
162-
embed_pattern = match.group(1)
163-
base_pattern = embed_pattern.split("*")[0].rstrip("/")
164-
candidate_bases = [go_file.parent / base_pattern]
165-
if repo_root:
166-
candidate_bases.append(repo_root / base_pattern)
167-
for base in candidate_bases:
168-
try:
169-
embed_dir = base.resolve()
170-
except (OSError, ValueError):
171-
continue
172-
for changed_dir in changed_dirs:
173-
try:
174-
resolved = changed_dir.resolve()
175-
if resolved == embed_dir or resolved.is_relative_to(embed_dir):
176-
return True
177-
except (ValueError, OSError):
178-
continue
201+
embed_dirs = _resolve_bases(match.group(1), go_file.parent, repo_root)
202+
if _any_dir_matches(changed_dirs, embed_dirs):
203+
return True
179204
return False
180205

181206
def build(self, fragments: list[Fragment], repo_root: Path | None = None) -> EdgeDict:
@@ -199,28 +224,26 @@ def _build_embed_edges(
199224
edges: EdgeDict,
200225
repo_root: Path | None = None,
201226
) -> None:
227+
non_go_frags = [f for f in all_frags if not _is_go_file(f.path)]
202228
for gf in go_frags:
203229
for match in _GO_EMBED_RE.finditer(gf.content):
204-
embed_pattern = match.group(1)
205-
base_pattern = embed_pattern.split("*")[0].rstrip("/")
206-
candidate_bases = [gf.path.parent / base_pattern]
207-
if repo_root:
208-
candidate_bases.append(repo_root / base_pattern)
209-
embed_dirs: list[Path] = []
210-
for base in candidate_bases:
211-
try:
212-
embed_dirs.append(base.resolve())
213-
except (OSError, ValueError):
214-
pass
215-
for frag in all_frags:
216-
if _is_go_file(frag.path):
217-
continue
218-
try:
219-
frag_resolved = frag.path.resolve()
220-
if any(frag_resolved.is_relative_to(ed) for ed in embed_dirs):
221-
self.add_edge(edges, gf.id, frag.id, self.weight * 0.8)
222-
except (ValueError, OSError):
223-
continue
230+
embed_dirs = _resolve_bases(match.group(1), gf.path.parent, repo_root)
231+
self._link_embed_targets(gf, non_go_frags, embed_dirs, edges)
232+
233+
def _link_embed_targets(
234+
self,
235+
gf: Fragment,
236+
non_go_frags: list[Fragment],
237+
embed_dirs: list[Path],
238+
edges: EdgeDict,
239+
) -> None:
240+
for frag in non_go_frags:
241+
try:
242+
frag_resolved = frag.path.resolve()
243+
if any(frag_resolved.is_relative_to(ed) for ed in embed_dirs):
244+
self.add_edge(edges, gf.id, frag.id, self.weight * 0.8)
245+
except (ValueError, OSError):
246+
continue
224247

225248
def _build_indices(
226249
self, go_frags: list[Fragment], repo_root: Path | None

src/treemapper/diffctx/edges/semantic/javascript.py

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -286,28 +286,39 @@ def _add_import_edges(
286286
file_to_frags[f.path].append(f.id)
287287

288288
fragment_paths = set(file_to_frags.keys())
289-
290-
file_imports: dict[Path, set[str]] = defaultdict(set)
291-
for f in js_frags:
292-
for import_source in info_cache[f.id].imports:
293-
if import_source.startswith("."):
294-
file_imports[f.path].add(import_source)
289+
file_imports = self._collect_relative_imports(js_frags, info_cache)
295290

296291
for src_path, import_sources in file_imports.items():
297292
for import_source in import_sources:
298293
resolved = _resolve_relative_import(src_path, import_source, fragment_paths)
299294
if resolved is None or resolved == src_path:
300295
continue
301296
target_ids = file_to_frags.get(resolved, [])
302-
if not target_ids:
303-
continue
304-
for src_id in file_to_frags[src_path]:
305-
for target_id in target_ids:
306-
if target_id == src_id:
307-
continue
308-
w = self._IMPORT_WEIGHT
309-
edges[(src_id, target_id)] = max(edges.get((src_id, target_id), 0.0), w)
310-
edges[(target_id, src_id)] = max(edges.get((target_id, src_id), 0.0), w * self.reverse_weight_factor)
297+
if target_ids:
298+
self._link_import_pairs(file_to_frags[src_path], target_ids, edges)
299+
300+
@staticmethod
301+
def _collect_relative_imports(js_frags: list[Fragment], info_cache: dict[FragmentId, JsFragmentInfo]) -> dict[Path, set[str]]:
302+
file_imports: dict[Path, set[str]] = defaultdict(set)
303+
for f in js_frags:
304+
for import_source in info_cache[f.id].imports:
305+
if import_source.startswith("."):
306+
file_imports[f.path].add(import_source)
307+
return file_imports
308+
309+
def _link_import_pairs(
310+
self,
311+
src_ids: list[FragmentId],
312+
target_ids: list[FragmentId],
313+
edges: EdgeDict,
314+
) -> None:
315+
w = self._IMPORT_WEIGHT
316+
rev_w = w * self.reverse_weight_factor
317+
for src_id in src_ids:
318+
for target_id in target_ids:
319+
if target_id != src_id:
320+
edges[(src_id, target_id)] = max(edges.get((src_id, target_id), 0.0), w)
321+
edges[(target_id, src_id)] = max(edges.get((target_id, src_id), 0.0), rev_w)
311322

312323
def _discover_forward_imports(
313324
self,

src/treemapper/diffctx/edges/semantic/php.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@
1010

1111
_PHP_EXTS = {".php", ".phtml", ".php3", ".php4", ".php5", ".php7", ".phps"}
1212

13-
_PHP_USE_RE = re.compile(r"^\s*use\s+([A-Z][a-zA-Z0-9_]*\\[a-zA-Z0-9_\\]*(?:\s+as\s+\w+)?)\s*;", re.MULTILINE)
14-
_PHP_NAMESPACE_RE = re.compile(r"^\s*namespace\s+([A-Z][a-zA-Z0-9_\\]*)\s*;", re.MULTILINE)
13+
# Matches a line-anchored `use Name\Sub... [as Alias];` statement and captures everything between
# `use` and `;`. The name must start with an uppercase ASCII letter and contain at least one
# backslash. No re.ASCII flag is set, so `\w` also accepts non-ASCII word characters.
_PHP_USE_RE = re.compile(r"^\s*use\s+([A-Z]\w*\\[\w\\]*(?:\s+as\s+\w+)?)\s*;", re.MULTILINE)
14+
# Matches a line-anchored `namespace Name\Sub;` declaration and captures the namespace, which must
# start with an uppercase ASCII letter. No re.ASCII flag is set, so `\w` is Unicode-aware.
_PHP_NAMESPACE_RE = re.compile(r"^\s*namespace\s+([A-Z][\w\\]*)\s*;", re.MULTILINE)
1515
_PHP_REQUIRE_RE = re.compile(r"^\s*(?:require|require_once|include|include_once)\s*\(?['\"]([^'\"]+)['\"]", re.MULTILINE)
1616

1717
_PHP_CLASS_RE = re.compile(r"^\s*(?:abstract\s+)?(?:final\s+)?class\s+([A-Z]\w*)", re.MULTILINE)

src/treemapper/diffctx/edges/semantic/rust.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
_RUST_STRUCT_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?struct\s+([A-Z]\w*)", re.MULTILINE)
1717
_RUST_ENUM_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?enum\s+([A-Z]\w*)", re.MULTILINE)
1818
_RUST_TRAIT_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?trait\s+([A-Z]\w*)", re.MULTILINE)
19-
_RUST_IMPL_RE = re.compile(r"^\s*impl(?:<[^<>]*(?:<[^<>]*>)*[^<>]*>)?\s+(?:\w+\s+for\s+)?([A-Z]\w*)", re.MULTILINE)
19+
_RUST_IMPL_RE = re.compile(r"^\s*impl(?:<[^>\n]*>)?\s+(?:\w+\s+for\s+)?([A-Z]\w*)", re.MULTILINE)
2020
_RUST_TYPE_ALIAS_RE = re.compile(r"^\s*(?:pub(?:\([^)]*\))?\s+)?type\s+([A-Z]\w*)", re.MULTILINE)
2121

2222
_RUST_TYPE_REF_RE = re.compile(r"(?<![a-z_])([A-Z]\w*)\b")

src/treemapper/diffctx/parsers/base.py

Lines changed: 29 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -104,35 +104,23 @@ def _find_first_balanced_point(lines: list[str], start_idx: int, target_end_idx:
104104
return None
105105

106106

107-
def find_balanced_end_line(
108-
lines: list[str], start_idx: int, target_end_idx: int, max_extension: int = _GENERIC_MAX_EXTENSION
109-
) -> int:
110-
if target_end_idx >= len(lines):
111-
target_end_idx = len(lines) - 1
112-
113-
state = _BracketState()
114-
target_state: _BracketState | None = None
115-
for idx in range(start_idx, target_end_idx + 1):
107+
def _feed_lines(state: _BracketState, lines: list[str], start_idx: int, end_idx: int) -> None:
108+
for idx in range(start_idx, end_idx + 1):
116109
if idx > start_idx:
117110
state.feed("\n")
118111
state.feed(lines[idx])
119-
target_state = state
120-
121-
if target_state.depth == 0:
122-
first_balanced = _find_first_balanced_point(lines, start_idx, target_end_idx)
123-
if first_balanced is not None and first_balanced < target_end_idx:
124-
return first_balanced
125-
return target_end_idx
126112

127-
max_end = min(len(lines) - 1, target_end_idx + max_extension)
128113

129-
state = target_state.copy()
130-
for end_idx in range(target_end_idx + 1, max_end + 1):
114+
def _scan_forward_for_balance(state: _BracketState, lines: list[str], start: int, end: int) -> int | None:
115+
for end_idx in range(start, end + 1):
131116
state.feed("\n")
132117
state.feed(lines[end_idx])
133118
if state.depth == 0:
134119
return end_idx
120+
return None
121+
135122

123+
def _scan_backward_for_balance(lines: list[str], start_idx: int, target_end_idx: int) -> int | None:
136124
state = _BracketState()
137125
last_balanced_idx = None
138126
for idx in range(start_idx, target_end_idx):
@@ -141,11 +129,30 @@ def find_balanced_end_line(
141129
state.feed(lines[idx])
142130
if state.depth == 0:
143131
last_balanced_idx = idx
132+
return last_balanced_idx
144133

145-
if last_balanced_idx is not None:
146-
return last_balanced_idx
147134

148-
return target_end_idx
135+
def find_balanced_end_line(
    lines: list[str], start_idx: int, target_end_idx: int, max_extension: int = _GENERIC_MAX_EXTENSION
) -> int:
    """Pick an end line for a span starting at ``start_idx`` that leaves brackets balanced.

    Args:
        lines: The lines being scanned.
        start_idx: Index of the first line of the span.
        target_end_idx: Preferred last line; clamped to the final index of ``lines``.
        max_extension: How many lines past ``target_end_idx`` the forward scan may go.

    Returns:
        * If the span up to ``target_end_idx`` is balanced: the index from
          ``_find_first_balanced_point`` when it lies before the target,
          otherwise ``target_end_idx``.
        * Otherwise the first line within ``max_extension`` lines after the
          target at which the bracket depth returns to zero.
        * Otherwise the index from ``_scan_backward_for_balance``, or
          ``target_end_idx`` when that helper returns ``None``.
    """
    if target_end_idx >= len(lines):
        target_end_idx = len(lines) - 1

    target_state = _BracketState()
    _feed_lines(target_state, lines, start_idx, target_end_idx)

    if target_state.depth == 0:
        first_balanced = _find_first_balanced_point(lines, start_idx, target_end_idx)
        if first_balanced is not None and first_balanced < target_end_idx:
            return first_balanced
        return target_end_idx

    max_end = min(len(lines) - 1, target_end_idx + max_extension)
    # Scan on a copy so the state accumulated up to the target is left untouched.
    forward = _scan_forward_for_balance(target_state.copy(), lines, target_end_idx + 1, max_end)
    if forward is not None:
        return forward

    backward = _scan_backward_for_balance(lines, start_idx, target_end_idx)
    # Compare against None explicitly: line index 0 is a valid balanced point
    # but is falsy, so `backward or target_end_idx` would wrongly discard it.
    if backward is not None:
        return backward
    return target_end_idx
149156

150157

151158
_SENTENCE_ENDINGS = (".", "?", "!", '."', '?"', '!"', ".'", "?'", "!'")

0 commit comments

Comments
 (0)