Skip to content

Commit 2f3aab3

Browse files
DvirDukhan and Copilot committed
feat(analyzers): syntactic IMPORTS edges + derived OVERRIDES
Add language-agnostic File->File IMPORTS edges via per-analyzer import resolution (Python: dotted-module index) and derive OVERRIDES edges from the EXTENDS+DEFINES hierarchy. Wired into the analysis pipeline. Improves the graph for all consumers (HTTP API + MCP) and feeds search_code centrality. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 070d373 commit 2f3aab3

4 files changed

Lines changed: 201 additions & 0 deletions

File tree

api/analyzers/analyzer.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,39 @@ def needs_lsp(self) -> bool:
7676
"""
7777
return True
7878

79+
def build_import_index(self, files: dict[Path, File], root: Path) -> object:
    """
    Prepare the lookup structure that ``resolve_imports`` uses to map
    import statements onto files of the analyzed repository.

    This base implementation performs no import resolution and always
    yields ``None``; analyzers that support import resolution override it.

    Args:
        files (dict[Path, File]): All parsed files keyed by absolute path.
        root (Path): The analyzed repository root.

    Returns:
        object: An opaque, analyzer-specific index; ``None`` here, meaning
            import resolution is unsupported for this language.
    """

    # No index by default: the language does not resolve imports.
    unsupported = None
    return unsupported
94+
95+
def resolve_imports(self, file: File, root: Path, index: object) -> list[File]:
    """
    Map the import statements of ``file`` onto the in-repo files it
    depends on. The orchestrator links each returned File to ``file``
    with an ``IMPORTS`` edge.

    This base implementation resolves nothing and returns an empty list;
    it is meant to be overridden with a purely syntactic (no LSP) resolver.

    Args:
        file (File): The importing file (already parsed; ``file.tree`` set).
        root (Path): The analyzed repository root.
        index (object): The structure returned by ``build_import_index``.

    Returns:
        list[File]: In-repo files imported by ``file`` (deduped, self
            excluded); always empty for this default implementation.
    """

    return list()
111+
79112
@abstractmethod
80113
def add_dependencies(self, path: Path, files: list[Path]):
81114
"""

api/analyzers/python/analyzer.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,110 @@ def add_symbols(self, entity: Entity) -> None:
136136
def is_dependency(self, file_path: str) -> bool:
    """Report whether ``file_path`` contains the substring ``venv``.

    Args:
        file_path (str): Path of the file, as a string.

    Returns:
        bool: ``True`` when ``"venv"`` occurs anywhere in the path text.
    """
    # Plain substring test on the whole path string, not on path components.
    return file_path.find("venv") != -1
138138

139+
def _module_parts(self, file_path: Path, root: Path) -> Optional[list[str]]:
    """Split ``file_path`` into dotted-module components relative to ``root``.

    The file suffix is dropped, and a trailing ``__init__`` component is
    removed so a package's ``__init__`` file maps to the package itself.

    Args:
        file_path (Path): Path of the source file.
        root (Path): Directory the module path is computed from.

    Returns:
        Optional[list[str]]: The module components, or ``None`` when
            ``file_path`` is not located under ``root``.
    """
    try:
        relative = file_path.relative_to(root)
    except ValueError:
        # relative_to() raises when file_path is not under root.
        return None

    components = [*relative.with_suffix('').parts]
    if components[-1:] == ['__init__']:
        del components[-1]
    return components
149+
150+
def build_import_index(self, files: dict[Path, File], root: Path) -> object:
    """Index in-repo Python files by dotted module name.

    Two maps are built: ``exact`` keyed by the full dotted path from
    ``root`` and ``suffix`` keyed by every trailing sub-path (first file
    wins in both). The suffix map tolerates ``src/``/``lib/`` layouts where
    the import name (``matplotlib.axes``) differs from the path-from-root
    (``lib.matplotlib.axes``).

    Args:
        files (dict[Path, File]): Parsed files keyed by path. The mapping
            may contain files of other languages; only ``.py`` files are
            indexed.
        root (Path): The analyzed repository root.

    Returns:
        object: ``{'exact': {...}, 'suffix': {...}}`` mapping dotted module
            names to ``File`` objects.
    """
    exact: dict[str, File] = {}
    suffix: dict[str, File] = {}
    for fpath, file in files.items():
        # The caller passes every analyzed file. Without this filter a
        # ``utils.js`` or ``Foo.java`` would be indexed as a Python module
        # (the suffix is stripped) and could become an import target.
        if fpath.suffix != '.py':
            continue
        if self.is_dependency(str(fpath)):
            continue
        parts = self._module_parts(fpath, root)
        if not parts:
            # Outside ``root`` (None) or the root-level ``__init__`` ([]).
            continue
        for start in range(len(parts)):
            dotted = '.'.join(parts[start:])
            if start == 0:
                # The untrimmed path is the exact module name.
                exact.setdefault(dotted, file)
            suffix.setdefault(dotted, file)
    return {'exact': exact, 'suffix': suffix}
171+
172+
def _resolve_dotted(self, dotted: str, index: dict) -> Optional[File]:
    """Look up a dotted import name in the module index.

    The name is tried as-is first (``exact`` map, then ``suffix`` map). If
    that misses and the name contains a dot, the last component is assumed
    to be a symbol inside a module and the lookup is retried on the parent.

    Args:
        dotted (str): Dotted name to resolve; empty resolves to ``None``.
        index (dict): Mapping with ``'exact'`` and ``'suffix'`` sub-maps.

    Returns:
        Optional[File]: The matching file, or ``None`` when nothing matches.
    """
    if not dotted:
        return None

    def lookup(name):
        # Exact path-from-root match wins over a trailing-path match.
        return index['exact'].get(name) or index['suffix'].get(name)

    match = lookup(dotted)
    if match is not None:
        return match

    # ``pkg.mod.Symbol`` -> retry with ``pkg.mod``.
    parent, dot, _symbol = dotted.rpartition('.')
    return lookup(parent) if dot else match
181+
182+
def _import_requests(self, file: File) -> list[tuple[str, int]]:
    """Collect ``(dotted, level)`` resolution requests from import statements.

    ``level`` is 0 for absolute imports and the length of the leading-dot
    prefix for relative ones. A ``from`` import yields one request for the
    module itself plus one per imported name (``module.name``), since an
    imported name may be a submodule.

    Args:
        file (File): Parsed file; ``file.tree.root_node`` is queried.

    Returns:
        list[tuple[str, int]]: Requests in source-capture order, ``import``
            statements first, then ``from ... import`` statements.
    """

    def text_of(node) -> str:
        return node.text.decode('utf-8')

    def unaliased(node):
        # ``x as y`` wraps the real name inside an aliased_import node.
        if node.type == 'aliased_import':
            return node.child_by_field_name('name')
        return node

    def first_of_type(nodes, wanted):
        for candidate in nodes:
            if candidate.type == wanted:
                return candidate
        return None

    captures = self._captures(
        "(import_statement) @i (import_from_statement) @f",
        file.tree.root_node,
    )
    found: list[tuple[str, int]] = []

    # ``import a.b`` / ``import a.b as c`` are always absolute (level 0).
    for statement in captures.get('i', []):
        for child in statement.named_children:
            name = unaliased(child)
            if name is None or name.type != 'dotted_name':
                continue
            found.append((text_of(name), 0))

    for statement in captures.get('f', []):
        module = statement.child_by_field_name('module_name')
        level, base = 0, ''
        if module is not None and module.type == 'relative_import':
            dots = first_of_type(module.children, 'import_prefix')
            level = 1 if dots is None else len(text_of(dots))
            tail = first_of_type(module.named_children, 'dotted_name')
            base = '' if tail is None else text_of(tail)
        elif module is not None:
            base = text_of(module)

        # The module itself, then each imported name as a possible submodule.
        found.append((base, level))
        for imported in statement.children_by_field_name('name'):
            leaf = unaliased(imported)
            if leaf is None:
                continue
            leaf_text = text_of(leaf)
            found.append((f"{base}.{leaf_text}" if base else leaf_text, level))
    return found
217+
218+
def resolve_imports(self, file: File, root: Path, index: object) -> list[File]:
219+
if not index:
220+
return []
221+
package_parts = self._module_parts(file.path, root)
222+
if package_parts is None:
223+
return []
224+
# Package of the importing file = its parent dotted path.
225+
package_parts = package_parts[:-1] if package_parts else []
226+
seen: set[Path] = set()
227+
targets: list[File] = []
228+
for dotted, level in self._import_requests(file):
229+
if level:
230+
base = package_parts[: len(package_parts) - (level - 1)] if level > 1 else list(package_parts)
231+
full = '.'.join([*base, dotted]) if dotted else '.'.join(base)
232+
else:
233+
full = dotted
234+
resolved = self._resolve_dotted(full, index)
235+
if resolved is None or resolved.path == file.path or resolved.path in seen:
236+
continue
237+
if self.is_dependency(str(resolved.path)):
238+
continue
239+
seen.add(resolved.path)
240+
targets.append(resolved)
241+
return targets
242+
139243
def _extract_type_target(self, node: Node) -> Optional[Node]:
140244
if node.type == 'attribute':
141245
return node.child_by_field_name('attribute')

api/analyzers/source_analyzer.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,19 +315,49 @@ def _resolve_file(file_path: Path) -> Path:
315315
elif key == "parameters":
316316
graph.connect_entities("PARAMETERS", entity.id, resolved.id)
317317

318+
def link_imports(self, graph: Graph, root: Path) -> None:
    """Add ``IMPORTS`` edges (File -> File) via per-language resolution.

    For every known file the analyzer registered for its suffix is asked to
    resolve the file's imports; each resolved target is connected to the
    importing file in ``graph``. The import index is built lazily, once per
    file suffix. Files without an analyzer, and languages whose analyzer
    returns a falsy index, are silently skipped, as are pairs where either
    file has no graph ``id``.

    Args:
        graph (Graph): Graph receiving the ``IMPORTS`` edges.
        root (Path): The analyzed repository root.
    """
    index_by_suffix: dict[str, object] = {}
    for source_path, source in self.files.items():
        suffix = source_path.suffix
        analyzer = analyzers.get(suffix)
        if analyzer is None:
            continue

        # Build the index the first time a suffix is encountered.
        try:
            index = index_by_suffix[suffix]
        except KeyError:
            index = index_by_suffix[suffix] = analyzer.build_import_index(self.files, root)
        if not index:
            continue

        for imported in analyzer.resolve_imports(source, root, index):
            ids_known = (
                getattr(source, "id", None) is not None
                and getattr(imported, "id", None) is not None
            )
            if ids_known:
                graph.connect_entities("IMPORTS", source.id, imported.id)
339+
318340
def analyze_files(self, files: list[Path], path: Path, graph: Graph) -> None:
    """Run the analysis pipeline over an explicit list of files.

    Steps, in order: first pass (with an empty ignore list), import
    linking, second pass, then derivation of override edges on ``graph``.

    Args:
        files (list[Path]): Files to analyze.
        path (Path): Root path handed to each pipeline step.
        graph (Graph): Graph populated by the pipeline.
    """
    nothing_ignored: list[str] = []
    self.first_pass(path, files, nothing_ignored, graph)
    self.link_imports(graph, path)
    self.second_pass(graph, files, path)
    graph.derive_overrides()
321345

322346
def analyze_sources(self, path: Path, ignore: list[str], graph: Graph) -> None:
    """Discover source files under ``path`` and run the analysis pipeline.

    Files are collected recursively by extension, in this order: Java,
    Python, C#, JavaScript (skipping anything under ``node_modules``),
    Kotlin, and Kotlin script. The pipeline then runs the first pass,
    import linking, the second pass, and override derivation.

    Args:
        path (Path): Directory to analyze; resolved before use.
        ignore (list[str]): Ignore list forwarded to the first pass.
        graph (Graph): Graph populated by the pipeline.
    """
    path = path.resolve()

    files: list[Path] = []
    for pattern in ("*.java", "*.py", "*.cs", "*.js", "*.kt", "*.kts"):
        matches = path.rglob(pattern)
        if pattern == "*.js":
            # Vendored JavaScript packages are not part of the sources.
            matches = (m for m in matches if "node_modules" not in m.parts)
        files.extend(matches)

    # First pass analysis of the source code
    self.first_pass(path, files, ignore, graph)

    # Link import edges (syntactic, language-specific, no LSP)
    self.link_imports(graph, path)

    # Second pass analysis of the source code
    self.second_pass(graph, files, path)

    # Derive override edges from the resolved class hierarchy
    graph.derive_overrides()
360+
331361
def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
332362
"""
333363
Analyze path.

api/graph.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,40 @@ def connect_entities(self, relation: str, src_id: int, dest_id: int, properties:
612612
params = {'src_id': src_id, 'dest_id': dest_id, "properties": properties}
613613
self._query(q, params)
614614

615+
def derive_overrides(self, max_depth: int = 3) -> int:
    """
    Derive ``OVERRIDES`` edges from the existing class hierarchy.

    A method ``m`` on a subclass overrides method ``m2`` on an ancestor
    class when they share a name. Pure graph derivation over existing
    ``EXTENDS`` + ``DEFINES`` edges, so it is language-agnostic. The edge
    carries ``depth`` (inheritance distance) for downstream filtering.

    The whole operation is best-effort: a failure of either the derivation
    query or the follow-up count query is logged and reported as ``0``
    rather than raised.

    Args:
        max_depth (int): Maximum inheritance distance to bridge.

    Returns:
        int: Number of OVERRIDES edges after derivation, or ``0`` when a
            query failed or returned no rows.
    """

    # ``int()`` keeps the interpolated bound numeric, so nothing but a
    # number can reach the query text.
    q = f"""MATCH (sub:Class)-[x:EXTENDS*1..{int(max_depth)}]->(sup:Class)
            WHERE ID(sub) <> ID(sup)
            WITH DISTINCT sub, sup, length(x) AS depth
            MATCH (sub)-[:DEFINES]->(m:Function)
            MATCH (sup)-[:DEFINES]->(m2:Function)
            WHERE m.name = m2.name AND ID(m) <> ID(m2)
            MERGE (m)-[e:OVERRIDES]->(m2)
            ON CREATE SET e.depth = depth"""

    try:
        self._query(q)
        # Counted inside the same guard: a failing count must not abort
        # the caller's pipeline any more than a failing derivation does.
        res = self._query("MATCH ()-[e:OVERRIDES]->() RETURN count(e)").result_set
    except Exception as exc:  # noqa: BLE001 — derivation is best-effort
        logging.warning("derive_overrides failed: %s", exc)
        return 0

    return int(res[0][0]) if res else 0
648+
615649
def function_calls_function(self, caller_id: int, callee_id: int, pos: int) -> None:
616650
"""
617651
Establish a 'CALLS' relationship between two function nodes.

0 commit comments

Comments
 (0)