Skip to content

Commit ad2e1c0

Browse files
committed
feat: add DWARF parsing support for exporting local variables
1 parent b11f385 commit ad2e1c0

8 files changed

Lines changed: 2017 additions & 4 deletions

dwarf_parser.py

Lines changed: 471 additions & 0 deletions
Large diffs are not rendered by default.

evaluate_quality.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,24 @@ class FileMetrics:
6666
namespaces_found: List[str] = field(default_factory=list)
6767
source_file_refs: List[str] = field(default_factory=list)
6868

69+
# Debug info indicators (from DWARF)
70+
preserved_var_names: int = (
71+
0 # Variables with original names (not local_XX, param_X)
72+
)
73+
auto_generated_vars: int = 0 # Variables with auto-generated names
74+
has_debug_info_comment: bool = False # File has debug info comment
75+
6976
# Issues found
7077
issues: List[str] = field(default_factory=list)
7178

79+
@property
def debug_info_ratio(self) -> float:
    """Return the share of counted variables that kept a preserved name.

    The value is ``preserved_var_names`` divided by the sum of
    ``preserved_var_names`` and ``auto_generated_vars``. When both
    counters are zero there is nothing to compare, so 0.0 is returned
    instead of dividing by zero.
    """
    preserved = self.preserved_var_names
    counted = preserved + self.auto_generated_vars
    return preserved / counted if counted != 0 else 0.0
86+
7287
@property
7388
def quality_score(self) -> float:
7489
"""Calculate a quality score (0-100)"""
@@ -92,6 +107,16 @@ def quality_score(self) -> float:
92107
if self.source_file_refs:
93108
score += 2
94109

110+
# Debug info bonus (significant - up to 15 points)
111+
if self.preserved_var_names > 0:
112+
# Bonus based on ratio of preserved names
113+
debug_bonus = self.debug_info_ratio * 15
114+
score += debug_bonus
115+
116+
# Extra bonus if debug info comment is present
117+
if self.has_debug_info_comment:
118+
score += 2
119+
95120
return max(0, min(100, score))
96121

97122

@@ -111,6 +136,12 @@ class ProjectMetrics:
111136
total_undefined_types: int = 0
112137
total_excessive_casts: int = 0
113138

139+
# Debug info metrics
140+
files_with_debug_info: int = 0
141+
total_preserved_vars: int = 0
142+
total_auto_generated_vars: int = 0
143+
avg_debug_info_ratio: float = 0.0
144+
114145
# Summary
115146
avg_quality_score: float = 0.0
116147
min_quality_score: float = 100.0
@@ -136,6 +167,20 @@ class ProjectMetrics:
136167
"function_comment": re.compile(r"//\s*Function:\s*(\w+)"),
137168
"source_file": re.compile(r"framework/source/[\w/]+\.cpp"),
138169
"assert_fail": re.compile(r'__assert_fail\s*\([^)]*"([^"]+)"'),
170+
# Debug info patterns
171+
"debug_info_comment": re.compile(r"/\*\s*Debug Information:\s*DWARF\s*\*/"),
172+
"preserved_var_comment": re.compile(r"/\*\s*Variable names preserved\s*\*/"),
173+
# Auto-generated variable names (Ghidra default patterns)
174+
"auto_var_local": re.compile(r"\blocal_[0-9a-fA-F]+\b"),
175+
"auto_var_param": re.compile(r"\bparam_\d+\b"),
176+
"auto_var_uvar": re.compile(r"\b[iu]Var\d+\b"),
177+
"auto_var_pvar": re.compile(r"\bpVar\d+\b"),
178+
"auto_var_in": re.compile(r"\bin_[A-Z]+\b"),
179+
# Meaningful variable names (likely from debug info)
180+
# Match variable declarations with meaningful names (not auto-generated)
181+
"meaningful_var": re.compile(
182+
r"\b(int|float|double|char|void|bool|uint\d+_t|int\d+_t|size_t)\s+\*?\s*([a-z][a-zA-Z0-9_]{1,20})\s*[;=,\)]"
183+
),
139184
}
140185

141186

@@ -181,6 +226,48 @@ def analyze_file(filepath: str) -> FileMetrics:
181226
metrics.classes = len(PATTERNS["class_comment"].findall(content))
182227
metrics.functions = len(PATTERNS["function_comment"].findall(content))
183228

229+
# Debug info analysis
230+
metrics.has_debug_info_comment = bool(
231+
PATTERNS["debug_info_comment"].search(content)
232+
)
233+
234+
# Count auto-generated variable names
235+
auto_vars = set()
236+
for pattern_name in [
237+
"auto_var_local",
238+
"auto_var_param",
239+
"auto_var_uvar",
240+
"auto_var_pvar",
241+
"auto_var_in",
242+
]:
243+
for match in PATTERNS[pattern_name].finditer(content):
244+
auto_vars.add(match.group(0))
245+
metrics.auto_generated_vars = len(auto_vars)
246+
247+
# Count meaningful variable names (likely from debug info)
248+
meaningful_vars = set()
249+
for match in PATTERNS["meaningful_var"].finditer(content):
250+
var_name = match.group(2)
251+
# Filter out common false positives
252+
if var_name not in [
253+
"this",
254+
"void",
255+
"int",
256+
"char",
257+
"bool",
258+
"true",
259+
"false",
260+
"NULL",
261+
"nullptr",
262+
]:
263+
# Check it's not an auto-generated name
264+
if not any(
265+
var_name.startswith(prefix)
266+
for prefix in ["local_", "param_", "uVar", "iVar", "pVar", "in_"]
267+
):
268+
meaningful_vars.add(var_name)
269+
metrics.preserved_var_names = len(meaningful_vars)
270+
184271
# Record issues
185272
if metrics.halt_baddata > 0:
186273
metrics.issues.append(f"Contains {metrics.halt_baddata} halt_baddata calls")
@@ -189,6 +276,12 @@ def analyze_file(filepath: str) -> FileMetrics:
189276
if metrics.inline_assembly > 0:
190277
metrics.issues.append(f"Contains inline assembly: {metrics.inline_assembly}")
191278

279+
# Debug info quality note
280+
if metrics.preserved_var_names > 0 and metrics.debug_info_ratio > 0.5:
281+
metrics.issues.append(
282+
f"Good debug info: {metrics.preserved_var_names} preserved variable names ({metrics.debug_info_ratio:.0%})"
283+
)
284+
192285
return metrics
193286

194287

@@ -230,6 +323,12 @@ def analyze_directory(directory: str, file_pattern: str = "*.c*") -> ProjectMetr
230323
project.total_undefined_types += metrics.undefined_types
231324
project.total_excessive_casts += metrics.excessive_casts
232325

326+
# Debug info aggregation
327+
project.total_preserved_vars += metrics.preserved_var_names
328+
project.total_auto_generated_vars += metrics.auto_generated_vars
329+
if metrics.preserved_var_names > 0 or metrics.has_debug_info_comment:
330+
project.files_with_debug_info += 1
331+
233332
if metrics.halt_baddata > 0:
234333
project.files_with_halt_baddata += 1
235334

@@ -242,6 +341,11 @@ def analyze_directory(directory: str, file_pattern: str = "*.c*") -> ProjectMetr
242341
if quality_scores:
243342
project.avg_quality_score = sum(quality_scores) / len(quality_scores)
244343

344+
# Calculate debug info ratio
345+
total_vars = project.total_preserved_vars + project.total_auto_generated_vars
346+
if total_vars > 0:
347+
project.avg_debug_info_ratio = project.total_preserved_vars / total_vars
348+
245349
# Find worst files
246350
scored_files = [(m.filename, m.quality_score) for m in project.file_metrics]
247351
project.worst_files = sorted(scored_files, key=lambda x: x[1])[:10]
@@ -295,6 +399,28 @@ def print_report(project: ProjectMetrics, verbose: bool = False):
295399
print(f" excessive casts: {project.total_excessive_casts:,}")
296400
print()
297401

402+
# Debug Info Summary
403+
print(f"{Colors.BLUE}Debug Information:{Colors.NC}")
404+
if project.files_with_debug_info > 0:
405+
print(
406+
f" Files with debug info: {Colors.GREEN}{project.files_with_debug_info}/{project.total_files}{Colors.NC}"
407+
)
408+
print(
409+
f" Preserved variable names: {Colors.GREEN}{project.total_preserved_vars:,}{Colors.NC}"
410+
)
411+
print(f" Auto-generated names: {project.total_auto_generated_vars:,}")
412+
ratio_color = (
413+
Colors.GREEN
414+
if project.avg_debug_info_ratio > 0.5
415+
else (Colors.YELLOW if project.avg_debug_info_ratio > 0.2 else Colors.RED)
416+
)
417+
print(
418+
f" Debug info ratio: {ratio_color}{project.avg_debug_info_ratio:.1%}{Colors.NC}"
419+
)
420+
else:
421+
print(f" {Colors.YELLOW}No debug information detected{Colors.NC}")
422+
print()
423+
298424
# Worst Files
299425
if project.worst_files:
300426
print(f"{Colors.BLUE}Lowest Quality Files:{Colors.NC}")
@@ -365,6 +491,13 @@ def export_json(project: ProjectMetrics, output_path: str):
365491
"avg_quality_score": project.avg_quality_score,
366492
"files_with_halt_baddata": project.files_with_halt_baddata,
367493
"total_halt_baddata": project.total_halt_baddata,
494+
# Debug info metrics
495+
"debug_info": {
496+
"files_with_debug_info": project.files_with_debug_info,
497+
"total_preserved_vars": project.total_preserved_vars,
498+
"total_auto_generated_vars": project.total_auto_generated_vars,
499+
"avg_debug_info_ratio": project.avg_debug_info_ratio,
500+
},
368501
"files": [
369502
{
370503
"filename": m.filename,
@@ -375,6 +508,11 @@ def export_json(project: ProjectMetrics, output_path: str):
375508
"namespaces": m.namespaces_found,
376509
"source_refs": m.source_file_refs,
377510
"issues": m.issues,
511+
# Debug info per file
512+
"preserved_var_names": m.preserved_var_names,
513+
"auto_generated_vars": m.auto_generated_vars,
514+
"debug_info_ratio": m.debug_info_ratio,
515+
"has_debug_info_comment": m.has_debug_info_comment,
378516
}
379517
for m in project.file_metrics
380518
],

ghidra_common.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,8 @@ def get_decompiled_function_basic(decomp_ifc, func, monitor):
366366
Decompile a single function and return C code.
367367
368368
Basic version without type normalization or enhancements.
369+
Ghidra automatically uses DWARF debug info when available,
370+
preserving original variable names in the decompiled output.
369371
370372
Args:
371373
decomp_ifc: Ghidra DecompInterface
@@ -385,6 +387,81 @@ def get_decompiled_function_basic(decomp_ifc, func, monitor):
385387
return None
386388

387389

390+
def get_decompiled_function_with_debug_info(
    decomp_ifc, func, monitor, include_var_comments=True
):
    """
    Decompile a function and annotate it with its preserved variable names.

    The function is decompiled and cleaned with clean_decompiled_code().
    When include_var_comments is True, the local symbols of the decompiled
    function are scanned and every symbol whose name does not start with
    one of the auto-generated prefixes (local_, param_, in_, uVar, iVar,
    pVar) is listed in a single C comment placed right after the first
    opening brace of the code.

    Args:
        decomp_ifc: Ghidra DecompInterface
        func: Ghidra Function object
        monitor: Task monitor
        include_var_comments: If True, add a comment listing preserved
            variable names (at most 10 are spelled out, the rest are
            summarized as "+ N more")

    Returns:
        Decompiled C code string (with the optional annotation comment),
        or None if decompilation did not complete or raised an exception
    """
    # Name prefixes treated as auto-generated (i.e. not original names).
    auto_prefixes = ("local_", "param_", "in_", "uVar", "iVar", "pVar")
    # Maximum number of variables spelled out in the annotation comment.
    max_listed = 10

    try:
        # Second argument is the decompiler timeout (seconds per Ghidra API).
        results = decomp_ifc.decompileFunction(func, 60, monitor)
        if results and results.decompileCompleted():
            code = results.getDecompiledFunction().getC()
            code = clean_decompiled_code(code)

            if include_var_comments:
                # High-level function representation carries the variable info
                high_func = results.getHighFunction()
                local_symbols = high_func.getLocalSymbolMap() if high_func else None
                if local_symbols:
                    preserved_vars = []
                    for sym in local_symbols.getSymbols():
                        name = sym.getName()
                        if name.startswith(auto_prefixes):
                            continue
                        data_type = sym.getDataType()
                        var_type = data_type.getName() if data_type else "?"
                        preserved_vars.append("{} ({})".format(name, var_type))

                    if preserved_vars:
                        summary = ", ".join(preserved_vars[:max_listed])
                        hidden = len(preserved_vars) - max_listed
                        if hidden > 0:
                            # Append the overflow note BEFORE closing the
                            # comment: C comments do not nest, so a second
                            # "*/" would leave stray text in the output.
                            summary += " + {} more".format(hidden)
                        var_comment = "/* Original variables: {} */\n".format(summary)

                        # Insert after function signature (first opening brace)
                        brace_pos = code.find("{")
                        if brace_pos > 0:
                            code = (
                                code[: brace_pos + 1]
                                + "\n"
                                + var_comment
                                + code[brace_pos + 1 :]
                            )

            return code
    except Exception as e:
        # Best-effort: report the failure and fall through to return None.
        print("  [Error] Failed to decompile {}: {}".format(func.getName(), str(e)))
    return None
463+
464+
388465
def get_decompiled_function(
389466
decomp_ifc, func, monitor, class_info=None, struct_info=None, enhance=True
390467
):

ghidra_decompile_elf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ def extract_prefix(func_name, min_prefix_len=2, max_prefix_len=30):
362362
xxBmpInit -> xxBmp
363363
xxFntGetMetrics -> xxFnt
364364
GfxCreateSurface -> Gfx
365-
vg_lite_init -> vg_lite
365+
aa_bb_init -> aa_bb
366366
ApplicationApplication_goHome -> ApplicationApplication
367367
CoreView__ReInit -> CoreView
368368
"""
@@ -408,7 +408,7 @@ def extract_prefix(func_name, min_prefix_len=2, max_prefix_len=30):
408408
if match and len(match.group(1)) >= min_prefix_len:
409409
return match.group(1)
410410

411-
# Lowercase prefix (c-style: vg_lite_init)
411+
# Lowercase prefix (c-style: xx_init)
412412
match = re.match(r"^([a-z][a-z0-9]*_[a-z0-9]+)", func_name)
413413
if match:
414414
return match.group(1)

0 commit comments

Comments
 (0)