Skip to content

Commit 7d12be1

Browse files
DvirDukhan and Copilot committed
feat(analyzers): syntactic IMPORTS edges + derived OVERRIDES
Add language-agnostic File->File IMPORTS edges via per-analyzer import resolution (Python: dotted-module index) and derive OVERRIDES edges from the EXTENDS+DEFINES hierarchy. Wired into the analysis pipeline. Improves the graph for all consumers (HTTP API + MCP) and feeds search_code centrality. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 23e382c commit 7d12be1

4 files changed

Lines changed: 201 additions & 0 deletions

File tree

api/analyzers/analyzer.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,39 @@ def needs_lsp(self) -> bool:
7070
"""
7171
return True
7272

73+
def build_import_index(self, files: dict[Path, File], root: Path) -> object:
    """
    Prepare the lookup structure this language needs to map import
    statements onto files of the analyzed repository.

    The base implementation opts out of import resolution and returns
    ``None``. Analyzers that support imports override this method and
    return an opaque object that is consumed by ``resolve_imports``.

    Args:
        files (dict[Path, File]): Every parsed file, keyed by absolute path.
        root (Path): Root directory of the repository under analysis.

    Returns:
        object: The language-specific index, or ``None`` when the language
        has no import resolution.
    """

    # No index by default: import resolution is opt-in per language.
    return None
88+
89+
def resolve_imports(self, file: File, root: Path, index: object) -> list[File]:
    """
    Map the import statements of ``file`` onto the in-repo files it
    depends on.

    Resolution is meant to be purely syntactic (no LSP). The orchestrator
    links ``file`` to every returned File with an ``IMPORTS`` edge. The
    base implementation resolves nothing.

    Args:
        file (File): The importing file (already parsed; ``file.tree`` set).
        root (Path): Root directory of the repository under analysis.
        index (object): Whatever ``build_import_index`` returned.

    Returns:
        list[File]: In-repo files imported by ``file`` (deduped, self
        excluded). Always empty in the base implementation.
    """

    # Default: this language contributes no IMPORTS edges.
    return list()
105+
73106
@abstractmethod
74107
def add_dependencies(self, path: Path, files: list[Path]):
75108
"""

api/analyzers/python/analyzer.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,110 @@ def add_symbols(self, entity: Entity) -> None:
136136
def is_dependency(self, file_path: str) -> bool:
    """Report whether ``file_path`` contains the substring ``venv``.

    This is a plain substring test over the whole path string, so any
    path component containing ``venv`` (``venv``, ``.venv``, ...) matches.
    """
    contains_venv = "venv" in file_path
    return contains_venv
138138

139+
def _module_parts(self, file_path: Path, root: Path) -> Optional[list[str]]:
140+
"""Dotted module path components for ``file_path`` relative to ``root``."""
141+
try:
142+
rel = file_path.relative_to(root)
143+
except ValueError:
144+
return None
145+
parts = list(rel.with_suffix('').parts)
146+
if parts and parts[-1] == '__init__':
147+
parts = parts[:-1]
148+
return parts
149+
150+
def build_import_index(self, files: dict[Path, File], root: Path) -> object:
    """Index in-repo Python files by dotted module name.

    Two maps are built:

    * ``exact``  - keyed by the full dotted path from ``root``.
    * ``suffix`` - keyed by every trailing sub-path (first file wins). This
      tolerates ``src/``/``lib/`` layouts where the import name
      (``matplotlib.axes``) differs from the path-from-root
      (``lib.matplotlib.axes``).

    Only ``.py`` files are indexed. ``files`` is documented as *all* parsed
    files, so it may also contain sources of other languages; without the
    filter a ``utils.js`` would be registered as module ``utils`` and a
    Python import could be "resolved" to it.

    Args:
        files (dict[Path, File]): Parsed files keyed by path.
        root (Path): The analyzed repository root.

    Returns:
        object: ``{'exact': {...}, 'suffix': {...}}`` mapping dotted names
        to File objects.
    """
    exact: dict[str, File] = {}
    suffix: dict[str, File] = {}
    for fpath, file in files.items():
        # A Python import can only ever target a Python module.
        if fpath.suffix != '.py':
            continue
        if self.is_dependency(str(fpath)):
            continue
        parts = self._module_parts(fpath, root)
        if not parts:
            # Outside ``root`` (None) or the root ``__init__.py`` (empty).
            continue
        exact.setdefault('.'.join(parts), file)
        for start in range(len(parts)):
            suffix.setdefault('.'.join(parts[start:]), file)
    return {'exact': exact, 'suffix': suffix}
171+
172+
def _resolve_dotted(self, dotted: str, index: dict) -> Optional[File]:
173+
if not dotted:
174+
return None
175+
f = index['exact'].get(dotted) or index['suffix'].get(dotted)
176+
if f is None and '.' in dotted:
177+
# imported name may be a symbol inside a module; drop the last part.
178+
parent = dotted.rsplit('.', 1)[0]
179+
f = index['exact'].get(parent) or index['suffix'].get(parent)
180+
return f
181+
182+
def _import_requests(self, file: File) -> list[tuple[str, int]]:
    """Collect ``(dotted, level)`` resolution requests from import statements.

    ``level`` is 0 for absolute imports. For a relative import it is the
    length of the statement's ``import_prefix`` text (presumably its
    leading dots), or 1 when no prefix node is found. A ``from`` import
    yields one request for the module itself plus one
    ``module.name`` request per imported name.
    """
    def text_of(node) -> str:
        return node.text.decode('utf-8')

    def unalias(node):
        # ``x as y`` wraps the real name in an ``aliased_import`` node.
        if node.type == 'aliased_import':
            return node.child_by_field_name('name')
        return node

    found = self._captures(
        "(import_statement) @i (import_from_statement) @f",
        file.tree.root_node,
    )
    requests: list[tuple[str, int]] = []

    # ``import a.b`` / ``import a.b as c``: always requested at level 0.
    for statement in found.get('i', []):
        for child in statement.named_children:
            target = unalias(child)
            if target is not None and target.type == 'dotted_name':
                requests.append((text_of(target), 0))

    # ``from <module> import <names>``.
    for statement in found.get('f', []):
        base, level = '', 0
        module = statement.child_by_field_name('module_name')
        if module is not None and module.type == 'relative_import':
            prefixes = [c for c in module.children if c.type == 'import_prefix']
            level = len(text_of(prefixes[0])) if prefixes else 1
            dotted_nodes = [c for c in module.named_children if c.type == 'dotted_name']
            base = text_of(dotted_nodes[0]) if dotted_nodes else ''
        elif module is not None:
            base = text_of(module)
        # The module itself is always a candidate target...
        requests.append((base, level))
        # ...and so is every imported name, which may be a submodule.
        for name_node in statement.children_by_field_name('name'):
            leaf = unalias(name_node)
            if leaf is None:
                continue
            name_txt = text_of(leaf)
            requests.append((f"{base}.{name_txt}" if base else name_txt, level))
    return requests
217+
218+
def resolve_imports(self, file: File, root: Path, index: object) -> list[File]:
    """Resolve the imports of ``file`` to in-repo files using ``index``.

    Absolute requests are looked up as-is. Relative requests are first
    anchored at the importing file's package: level 1 is the package
    itself and each additional level climbs one package up.

    Args:
        file (File): The importing file; ``file.path`` must be a path
            object and ``file.tree`` must be set.
        root (Path): The analyzed repository root.
        index (object): The structure returned by ``build_import_index``.

    Returns:
        list[File]: Imported in-repo files in first-seen order, without
        duplicates, without ``file`` itself and without dependency files.
    """
    if not index:
        return []
    module_parts = self._module_parts(file.path, root)
    if module_parts is None:
        return []
    # ``_module_parts`` already strips a trailing ``__init__``, so for a
    # package's ``__init__.py`` the result *is* the package. Only a plain
    # module needs its own name dropped to obtain its package.
    if file.path.stem == '__init__':
        package_parts = list(module_parts)
    else:
        package_parts = module_parts[:-1]
    seen: set[Path] = set()
    targets: list[File] = []
    for dotted, level in self._import_requests(file):
        if level:
            climb = level - 1
            if climb > len(package_parts):
                # Relative import beyond the repository root. A negative
                # slice end would silently select a wrong ancestor, so the
                # request is treated as unresolvable instead.
                continue
            base = package_parts[: len(package_parts) - climb]
            full = '.'.join([*base, dotted]) if dotted else '.'.join(base)
        else:
            full = dotted
        resolved = self._resolve_dotted(full, index)
        if resolved is None or resolved.path == file.path or resolved.path in seen:
            continue
        if self.is_dependency(str(resolved.path)):
            continue
        seen.add(resolved.path)
        targets.append(resolved)
    return targets
242+
139243
def _extract_type_target(self, node: Node) -> Optional[Node]:
140244
if node.type == 'attribute':
141245
return node.child_by_field_name('attribute')

api/analyzers/source_analyzer.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,19 +183,49 @@ def second_pass(self, graph: Graph, files: list[Path], path: Path) -> None:
183183
elif key == "parameters":
184184
graph.connect_entities("PARAMETERS", entity.id, resolved.id)
185185

186+
def link_imports(self, graph: Graph, root: Path) -> None:
    """Create File -> File ``IMPORTS`` edges using per-language resolution.

    Resolution is purely syntactic for Python (no LSP), so this runs after
    ``first_pass``, once every file has a graph id. Files whose suffix has
    no analyzer, or whose analyzer builds no import index, are skipped
    silently.

    Args:
        graph (Graph): Graph that receives the ``IMPORTS`` edges.
        root (Path): The analyzed repository root.
    """
    # One import index per file suffix, built on first use.
    index_by_suffix: dict[str, object] = {}
    for source_path, source in self.files.items():
        ext = source_path.suffix
        analyzer = analyzers.get(ext)
        if analyzer is None:
            continue
        try:
            index = index_by_suffix[ext]
        except KeyError:
            index = index_by_suffix[ext] = analyzer.build_import_index(self.files, root)
        if not index:
            continue
        for imported in analyzer.resolve_imports(source, root, index):
            # Both endpoints need a graph id before an edge can be created.
            both_have_ids = (
                getattr(source, "id", None) is not None
                and getattr(imported, "id", None) is not None
            )
            if both_have_ids:
                graph.connect_entities("IMPORTS", source.id, imported.id)
207+
186208
def analyze_files(self, files: list[Path], path: Path, graph: Graph) -> None:
    """Run the full analysis pipeline over an explicit list of files.

    Steps, in order: first pass (with an empty ignore list), import
    linking, second pass, then derivation of ``OVERRIDES`` edges.

    Args:
        files (list[Path]): Files to analyze.
        path (Path): Root path the files belong to.
        graph (Graph): Graph to populate.
    """
    no_ignores: list[str] = []
    self.first_pass(path, files, no_ignores, graph)

    # Import edges are linked between the two passes.
    self.link_imports(graph, path)

    self.second_pass(graph, files, path)

    # Overrides are derived last, after the second pass has run.
    graph.derive_overrides()
189213

190214
def analyze_sources(self, path: Path, ignore: list[str], graph: Graph) -> None:
    """Discover supported source files under ``path`` and analyze them.

    Files are collected recursively in this order: ``*.java``, ``*.py``,
    ``*.cs``, ``*.js`` (skipping anything with a ``node_modules`` path
    component), ``*.kt`` and ``*.kts``. They are then run through the
    first pass, import linking, the second pass and override derivation.

    Args:
        path (Path): Directory to analyze; resolved to an absolute path.
        ignore (list[str]): Ignore list forwarded to ``first_pass``.
        graph (Graph): Graph to populate.
    """
    path = path.resolve()

    files: list[Path] = []
    for pattern in ("*.java", "*.py", "*.cs"):
        files.extend(path.rglob(pattern))
    files.extend(f for f in path.rglob("*.js") if "node_modules" not in f.parts)
    for pattern in ("*.kt", "*.kts"):
        files.extend(path.rglob(pattern))

    # First pass analysis of the source code
    self.first_pass(path, files, ignore, graph)

    # Link import edges (syntactic, language-specific, no LSP)
    self.link_imports(graph, path)

    # Second pass analysis of the source code
    self.second_pass(graph, files, path)

    # Derive override edges from the resolved class hierarchy
    graph.derive_overrides()
228+
199229
def analyze_local_folder(self, path: str, g: Graph, ignore: Optional[list[str]] = []) -> None:
200230
"""
201231
Analyze path.

api/graph.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,40 @@ def connect_entities(self, relation: str, src_id: int, dest_id: int, properties:
603603
params = {'src_id': src_id, 'dest_id': dest_id, "properties": properties}
604604
self._query(q, params)
605605

606+
def derive_overrides(self, max_depth: int = 3) -> int:
    """
    Derive ``OVERRIDES`` edges from the existing class hierarchy.

    A method ``m`` on a subclass overrides method ``m2`` on an ancestor
    class when they share a name. This is a pure graph derivation over
    existing ``EXTENDS`` + ``DEFINES`` edges, so it is language-agnostic.
    Each new edge carries ``depth``: the shortest inheritance distance
    between the two classes, for downstream filtering.

    The whole operation is best-effort: if either the derivation query or
    the follow-up count query fails, a warning is logged and 0 is returned
    instead of aborting the analysis.

    Args:
        max_depth (int): Maximum inheritance distance to bridge.

    Returns:
        int: Number of OVERRIDES edges after derivation, or 0 on failure.
    """

    # ``min(...)`` collapses multiple inheritance paths between the same two
    # classes into one row. Without it, ``ON CREATE SET`` would keep the
    # depth of whichever path happened to be processed first.
    q = f"""MATCH (sub:Class)-[x:EXTENDS*1..{int(max_depth)}]->(sup:Class)
            WHERE ID(sub) <> ID(sup)
            WITH sub, sup, min(length(x)) AS depth
            MATCH (sub)-[:DEFINES]->(m:Function)
            MATCH (sup)-[:DEFINES]->(m2:Function)
            WHERE m.name = m2.name AND ID(m) <> ID(m2)
            MERGE (m)-[e:OVERRIDES]->(m2)
            ON CREATE SET e.depth = depth"""

    try:
        self._query(q)
        # Counted inside the guard so a failing count cannot break the
        # best-effort contract either.
        res = self._query("MATCH ()-[e:OVERRIDES]->() RETURN count(e)").result_set
    except Exception as exc:  # noqa: BLE001 — derivation is best-effort
        logging.warning("derive_overrides failed: %s", exc)
        return 0

    return int(res[0][0]) if res else 0
639+
606640
def function_calls_function(self, caller_id: int, callee_id: int, pos: int) -> None:
607641
"""
608642
Establish a 'CALLS' relationship between two function nodes.

0 commit comments

Comments
 (0)