Skip to content

Commit f48d4ae

Browse files
committed
feat: Implement Tree-sitter parsers for JavaScript/TypeScript and Java, adding a base parser, temporal module, and updating documentation.
1 parent c9b7a00 commit f48d4ae

14 files changed

Lines changed: 1278 additions & 5 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ knowcode stats [--store <path>]
113113
## Supported Languages (MVP)
114114

115115
- **Python** (.py) - Full AST parsing with functions, classes, methods, calls, imports
116+
- **JavaScript / TypeScript** (.js, .ts) - Classes, functions, imports (via tree-sitter)
117+
- **Java** (.java) - Classes, methods, imports, inheritance (via tree-sitter)
116118
- **Markdown** (.md) - Document structure with heading hierarchy
117119
- **YAML** (.yaml, .yml) - Configuration keys with nested structure
118120

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ dependencies = [
99
"networkx>=3.0",
1010
"pyyaml>=6.0",
1111
"pathspec>=0.11",
12+
"tree-sitter==0.21.3",
13+
"tree-sitter-languages>=1.10.0",
14+
"GitPython>=3.1.0",
1215
]
1316

1417
[project.scripts]

src/knowcode/cli.py

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,25 @@ def main() -> None:
3434
multiple=True,
3535
help="Additional patterns to ignore",
3636
)
37-
def analyze(directory: str, output: str, ignore: tuple[str, ...]) -> None:
38-
"""Analyze a codebase and build the knowledge store.
37+
@click.option(
38+
"--temporal/--no-temporal",
39+
default=False,
40+
help="Analyze git history and add temporal context.",
41+
)
42+
def analyze(directory: str, output: str, ignore: tuple[str, ...], temporal: bool) -> None:
43+
"""Scan and analyze a codebase.
3944
4045
DIRECTORY: Path to the codebase to analyze.
4146
"""
4247
click.echo(f"Analyzing: {directory}")
48+
click.echo(f"Temporal analysis: {'Enabled' if temporal else 'Disabled'}")
4349

4450
# Build graph
4551
builder = GraphBuilder()
4652
builder.build_from_directory(
4753
root_dir=directory,
48-
additional_ignores=list(ignore) if ignore else None,
54+
additional_ignores=list(ignore),
55+
analyze_temporal=temporal,
4956
)
5057

5158
# Create store and save
@@ -315,3 +322,87 @@ def stats(store: str) -> None:
315322

316323
def _first_line(text: Optional[str]) -> str:
    """Return the first line of *text*, or "" when it is None or empty."""
    if not text:
        return ""
    lines = text.splitlines()
    return lines[0] if lines else ""


@main.command()
@click.argument("target", required=False)
@click.option(
    "--store", "-s",
    type=click.Path(exists=True),
    default=".",
    help="Path to knowledge store file or directory",
)
@click.option(
    "--limit", "-l",
    type=int,
    default=10,
    help="Limit number of revisions",
)
def history(target: Optional[str], store: str, limit: int) -> None:
    """Show history of the codebase or a specific entity.

    TARGET: Optional entity ID or search pattern. If omitted, shows commit log.
    """
    # NOTE: the docstring above is what click prints as the command's help
    # text, so it is kept user-facing; implementation notes live in comments.
    try:
        knowledge = KnowledgeStore.load(store)
    except FileNotFoundError:
        click.echo("Error: Knowledge store not found. Run 'knowcode analyze' first.", err=True)
        sys.exit(1)

    # A negative limit would make the slices below silently drop entries from
    # the END of the list and report a negative "showing" count.
    limit = max(limit, 0)

    if not target:
        # No target: print the most recent commits recorded in the store.
        # NOTE(review): the sort assumes every "timestamp" metadata value is
        # comparable with the "0" string fallback -- confirm against the
        # temporal analyzer that produces these entities.
        commits = sorted(
            knowledge.get_entities_by_kind("commit"),
            key=lambda c: c.metadata.get("timestamp", "0"),
            reverse=True,
        )

        click.echo(f"Recent History (showing {min(limit, len(commits))} of {len(commits)}):")
        for commit in commits[:limit]:
            date = commit.metadata.get("date", "Unknown date")

            # The author is the source of an incoming "authored" edge.
            author = "Unknown"
            for rel in knowledge.get_incoming_relationships(commit.id):
                if rel.kind == "authored":
                    author_entity = knowledge.get_entity(rel.source_id)
                    if author_entity:
                        author = author_entity.name

            click.echo(f"[{date}] {commit.name} - {author}")
            click.echo(f" {_first_line(commit.docstring)}")
        return

    # Target given: resolve it as an exact entity ID first, then fall back to
    # the first search hit.
    entity = knowledge.get_entity(target)
    if not entity:
        matches = knowledge.search(target)
        if matches:
            entity = matches[0]
            click.echo(f"Using: {entity.id}\n")

    if not entity:
        click.echo(f"Entity not found: {target}")
        return

    click.echo(f"History for {entity.qualified_name} ({entity.kind.value}):")

    # Build history from relationships: Entity -> CHANGED_BY -> Commit.
    changes = []
    for rel in knowledge.get_outgoing_relationships(entity.id):
        if rel.kind != "changed_by":
            continue
        commit = knowledge.get_entity(rel.target_id)
        if not commit:
            continue
        # Modification stats are stored on the edge, not on the commit.
        stats = f"(+{rel.metadata.get('insertions', 0)}/-{rel.metadata.get('deletions', 0)})"
        timestamp = commit.metadata.get("timestamp", "0")
        changes.append((timestamp, commit, stats))

    # Sort on the timestamp only, so ties never fall through to comparing
    # the commit entities themselves.
    changes.sort(key=lambda change: change[0], reverse=True)

    if not changes:
        click.echo(" No recorded history (scan with --temporal).")
        return

    for _, commit, stats in changes[:limit]:
        date = commit.metadata.get("date", "")
        # _first_line guards against commits with a missing or empty message,
        # which previously raised AttributeError / IndexError here.
        click.echo(f" {date} {commit.name} {stats}: {_first_line(commit.docstring)}")


# Keep the entry-point guard at the very end of the module: main() dispatches
# immediately, so every command must already be registered when it runs.
if __name__ == "__main__":
    main()

src/knowcode/graph_builder.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@
55

66
from knowcode.models import Entity, ParseResult, Relationship
77
from knowcode.parsers import MarkdownParser, PythonParser, YamlParser
8+
from knowcode.parsers.javascript_parser import JavaScriptParser
9+
from knowcode.parsers.java_parser import JavaParser
810
from knowcode.scanner import FileInfo, Scanner
11+
from knowcode.temporal import TemporalAnalyzer
912

1013

1114
class GraphBuilder:
@@ -16,6 +19,8 @@ def __init__(self) -> None:
1619
self.python_parser = PythonParser()
1720
self.markdown_parser = MarkdownParser()
1821
self.yaml_parser = YamlParser()
22+
self.js_parser = JavaScriptParser()
23+
self.java_parser = JavaParser()
1924

2025
self.entities: dict[str, Entity] = {}
2126
self.relationships: list[Relationship] = []
@@ -25,6 +30,7 @@ def build_from_directory(
2530
self,
2631
root_dir: str | Path,
2732
additional_ignores: Optional[list[str]] = None,
33+
analyze_temporal: bool = False,
2834
) -> "GraphBuilder":
2935
"""Build graph by scanning and parsing a directory.
3036
@@ -42,7 +48,17 @@ def build_from_directory(
4248
)
4349

4450
files = scanner.scan_all()
45-
return self.build_from_files(files)
51+
52+
# Static Analysis
53+
self.build_from_files(files)
54+
55+
# Temporal Analysis
56+
if analyze_temporal:
57+
temporal_analyzer = TemporalAnalyzer(root_dir)
58+
result = temporal_analyzer.analyze_history()
59+
self._merge_result(result)
60+
61+
return self
4662

4763
def build_from_files(self, files: list[FileInfo]) -> "GraphBuilder":
4864
"""Build graph from a list of files.
@@ -70,6 +86,10 @@ def _parse_file(self, file_info: FileInfo) -> ParseResult:
7086
return self.markdown_parser.parse_file(file_info.path)
7187
elif file_info.extension in {".yaml", ".yml"}:
7288
return self.yaml_parser.parse_file(file_info.path)
89+
elif file_info.extension in {".js", ".ts"}:
90+
return self.js_parser.parse_file(file_info.path)
91+
elif file_info.extension == ".java":
92+
return self.java_parser.parse_file(file_info.path)
7393
else:
7494
return ParseResult(
7595
file_path=str(file_info.path),

src/knowcode/models.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ class EntityKind(str, Enum):
1818
SECTION = "section"
1919
# Configuration entities
2020
CONFIG_KEY = "config_key"
21+
# Temporal entities
22+
COMMIT = "commit"
23+
AUTHOR = "author"
2124

2225

2326
class RelationshipKind(str, Enum):
@@ -28,6 +31,10 @@ class RelationshipKind(str, Enum):
2831
CONTAINS = "contains"
2932
INHERITS = "inherits"
3033
REFERENCES = "references"
34+
# Temporal relationships
35+
CHANGED_BY = "changed_by" # Entity -> Commit
36+
AUTHORED = "authored" # Author -> Commit
37+
MODIFIED = "modified" # Commit -> Entity
3138

3239

3340
@dataclass

src/knowcode/parsers/base.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Base parser using Tree-sitter."""
2+
3+
from pathlib import Path
4+
from typing import Any, Optional
5+
6+
from tree_sitter import Language, Parser, Tree
7+
import tree_sitter_languages
8+
9+
from knowcode.models import (
10+
Entity,
11+
EntityKind,
12+
Location,
13+
ParseResult,
14+
Relationship,
15+
RelationshipKind,
16+
)
17+
18+
19+
class TreeSitterParser:
    """Base class for parsers using Tree-sitter.

    ``parse_file`` reads a file, parses it, creates one MODULE entity for the
    file, and delegates all further extraction to ``_extract_entities``,
    which subclasses must implement. The remaining methods are helpers for
    building ``Location`` and ``Entity`` objects from syntax-tree nodes.
    """

    def __init__(self, language_name: str) -> None:
        """Initialize parser for a specific language.

        Args:
            language_name: Name of the language (e.g., 'python', 'javascript', 'java').
        """
        self.language_name = language_name
        self.language = tree_sitter_languages.get_language(language_name)
        self.parser = Parser()
        self.parser.set_language(self.language)

    @staticmethod
    def _split_rows(source_code: str) -> list[str]:
        """Split source text into lines, breaking only on newline characters.

        ``str.splitlines()`` also breaks on form feed, vertical tab, U+2028
        and similar characters. The node rows used for slicing in
        ``_create_entity`` are expected to advance only on a newline, so lines
        produced by ``splitlines()`` can drift out of step with them.

        Args:
            source_code: Full text of the source file.

        Returns:
            The lines without their terminators. A trailing newline does not
            produce an extra empty final line, and empty input yields an
            empty list (both matching ``str.splitlines()``).
        """
        if not source_code:
            return []
        rows = source_code.split("\n")
        if rows[-1] == "":
            # The final newline terminates the last line; it does not start one.
            rows.pop()
        # Strip the carriage return left behind by CRLF line endings.
        return [row.removesuffix("\r") for row in rows]

    def parse_file(self, file_path: str | Path) -> ParseResult:
        """Parse a source file.

        Failures to read or parse the file are reported through the returned
        result's ``errors`` list rather than raised.

        Args:
            file_path: Path to the source file.

        Returns:
            ParseResult with entities and relationships. On a read or parse
            failure the entity and relationship lists are empty.
        """
        file_path = Path(file_path)
        errors: list[str] = []

        try:
            source_code = file_path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            # Missing/unreadable file or content that is not valid UTF-8.
            return ParseResult(
                file_path=str(file_path),
                entities=[],
                relationships=[],
                errors=[f"Failed to read file: {e}"],
            )

        try:
            tree = self.parser.parse(source_code.encode("utf-8"))
        except Exception as e:
            # Boundary handler: any parser failure becomes a reported error
            # so one bad file does not abort a whole scan.
            return ParseResult(
                file_path=str(file_path),
                entities=[],
                relationships=[],
                errors=[f"Parse error: {e}"],
            )

        entities: list[Entity] = []
        relationships: list[Relationship] = []
        source_lines = self._split_rows(source_code)

        # Create module entity spanning the whole file
        module_name = file_path.stem
        module_id = f"{file_path}::{module_name}"
        module_entity = Entity(
            id=module_id,
            kind=EntityKind.MODULE,
            name=module_name,
            qualified_name=module_name,
            location=Location(
                file_path=str(file_path),
                line_start=1,
                line_end=len(source_lines),
            ),
        )
        entities.append(module_entity)

        # Delegate to language-specific extraction
        child_entities, child_rels = self._extract_entities(
            tree.root_node, file_path, module_id, source_code, source_lines
        )
        entities.extend(child_entities)
        relationships.extend(child_rels)

        # Handle errors from tree-sitter
        if tree.root_node.has_error:
            # We might want to be more specific here, but for now just flag it
            # Don't fail completely, as partial AST is often useful
            errors.append("Tree-sitter reported syntax errors in file")

        return ParseResult(
            file_path=str(file_path),
            entities=entities,
            relationships=relationships,
            errors=errors,
        )

    def _extract_entities(
        self,
        node: Any,
        file_path: Path,
        parent_id: str,
        source_code: str,
        source_lines: list[str],
    ) -> tuple[list[Entity], list[Relationship]]:
        """Extract entities from the AST. Must be implemented by subclasses.

        Args:
            node: Root node of the parsed tree.
            file_path: Path of the file being parsed.
            parent_id: ID of the module entity created by ``parse_file``.
            source_code: Full text of the file.
            source_lines: ``source_code`` split into lines.

        Returns:
            A ``(entities, relationships)`` tuple; ``parse_file`` appends both
            to its result.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def _get_text(self, node: Any, source_bytes: bytes) -> str:
        """Get text content of a node.

        Args:
            node: Syntax-tree node whose text is wanted.
            source_bytes: Unused here; the text is taken from ``node.text``.

        Returns:
            The node's text decoded as UTF-8.
        """
        return node.text.decode("utf8")

    def _get_location(self, node: Any, file_path: Path) -> Location:
        """Get location object for a node.

        Rows from ``start_point`` / ``end_point`` are shifted by one to give
        ``line_start`` / ``line_end``; columns are passed through unchanged.
        """
        # NOTE(review): tree-sitter columns are presumably byte offsets within
        # the row, not character offsets -- confirm before using them to
        # index into decoded text.
        return Location(
            file_path=str(file_path),
            line_start=node.start_point[0] + 1,
            line_end=node.end_point[0] + 1,
            column_start=node.start_point[1],
            column_end=node.end_point[1],
        )

    def _create_entity(
        self,
        node: Any,
        kind: EntityKind,
        name: str,
        qualified_name: str,
        file_path: Path,
        source_lines: list[str],
        docstring: Optional[str] = None,
        signature: Optional[str] = None,
    ) -> Entity:
        """Helper to create an entity.

        Args:
            node: Syntax-tree node the entity is built from.
            kind: Kind of entity to create.
            name: Short name of the entity.
            qualified_name: Qualified name; also used to build the entity ID
                as ``"<file_path>::<qualified_name>"``.
            file_path: Path of the file containing the node.
            source_lines: Lines of the file, used to slice out the source.
            docstring: Optional documentation text for the entity.
            signature: Optional signature text for the entity.

        Returns:
            The new Entity. Its ``source_code`` holds every full line from
            the node's first row through its last row, joined with newlines,
            so it may include text outside the node on those lines.
        """
        # Slice from the node's first row to its last row, inclusive
        start_line = node.start_point[0]
        end_line = node.end_point[0] + 1
        node_source = "\n".join(source_lines[start_line:end_line])

        return Entity(
            id=f"{file_path}::{qualified_name}",
            kind=kind,
            name=name,
            qualified_name=qualified_name,
            location=self._get_location(node, file_path),
            docstring=docstring,
            signature=signature,
            source_code=node_source,
        )

0 commit comments

Comments
 (0)