Skip to content

Commit 9ec0177

Browse files
authored
[TOOLS] add function export preprocessor to simplify common Ghidra quirks
1 parent 4b6b0f2 commit 9ec0177

2 files changed

Lines changed: 242 additions & 1 deletion

File tree

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
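# Preprocess decompiled function source exported from Ghidra: rewrite __thiscall DAT_ references to this->, fix signatures, and prepend includes.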
2+
#@author
3+
#@category _OPENSHC.TOOLS
4+
#@keybinding
5+
#@menupath
6+
#@toolbar
7+
#@runtime PyGhidra
8+
9+
10+
#TODO Add User Code Here
11+
12+
# preprocess_thiscall_dat.py
13+
#
14+
# Ghidra Python script to preprocess decompiled __thiscall functions,
15+
# replacing DAT_<StructName>.<field> with this-><field>.
16+
#
17+
# The rule: Ghidra emits DAT_FooObject.field for __thiscall methods because
18+
# it sees the global-like address that ECX points to. In the reimplementation,
19+
# these become this->field since ECX *is* the this pointer.
20+
#
21+
# Usage:
22+
# Run from Ghidra Script Manager, or via analyzeHeadless:
23+
# analyzeHeadless <project> <program> -postScript preprocess_thiscall_dat.py
24+
#
25+
# Output:
26+
# Prints preprocessed source to the Ghidra console, and optionally writes
27+
# it to a file (set OUTPUT_DIR below).
28+
#
29+
30+
31+
import re
32+
import os
33+
34+
# ----------------------------------------------------------------------------
35+
# Configuration
36+
# ----------------------------------------------------------------------------
37+
38+
# Set to a directory path to write output .cpp files, or None to print only.
39+
OUTPUT_DIR = None # e.g. "C:/Users/Gynt/Projects/sourcehold/openshc/src_preprocessed"
40+
41+
# If True, annotate DAT_<X>::ptr-> (cross-object struct global) references
42+
# with a marker comment for the implementer. The references themselves are
43+
# NOT rewritten — those are __cdecl and must keep DAT_X::ptr->.
44+
ANNOTATE_CDECL_GLOBALS = True
45+
46+
47+
# ----------------------------------------------------------------------------
48+
# Core transformation
49+
# ----------------------------------------------------------------------------
50+
51+
import java.math.BigInteger
52+
53+
def get_thiscall_dat_prefix(func, toAddr):
    """
    Return the label found at the address ECX holds on entry to ``func``.

    The value of the ECX register is read from the code unit at the
    function's entry point, converted to an address with ``toAddr`` and the
    label of the code unit at that address is returned.

    Args:
        func: Ghidra function object; must provide ``getProgram()`` and
            ``getEntryPoint()``.
        toAddr: callable turning an integer into a Ghidra address (the
            script-level ``toAddr`` helper is passed in by the caller).

    Returns:
        The label of the code unit ECX points at, or ``None`` when any step
        of the lookup yields nothing (no ECX register, no code unit at the
        entry point, no known ECX value, no code unit at the target address).
    """
    program = func.getProgram()
    listing = program.getListing()

    # Use next() with a default so a language without ECX gives None instead
    # of an IndexError.
    ecx = next(
        (reg for reg in program.getLanguage().getRegisters() if reg.getName() == "ECX"),
        None,
    )
    if ecx is None:
        return None

    entry_unit = listing.getCodeUnitAt(func.getEntryPoint())
    if entry_unit is None:
        return None

    register_value = entry_unit.getRegisterValue(ecx)
    if not register_value:
        return None

    # getUnsignedValue() yields nothing when the register value is not fully
    # known; passing that on to toAddr() would raise.
    address_value = register_value.getUnsignedValue()
    if address_value is None:
        return None
    if isinstance(address_value, java.math.BigInteger):
        # Convert via the decimal string so toAddr() receives a Python int.
        address_value = int(address_value.toString(), 10)

    target_unit = listing.getCodeUnitAt(toAddr(address_value))
    if target_unit is None:
        return None
    return target_unit.getLabel()
73+
74+
75+
def replace_this_references(source, dat_prefix):
    """
    Rewrite object-global member accesses and ADJ() wrappers in ``source``.

    Two rewrites are applied, in this order:
      <dat_prefix>.field      -> this->field   (only when dat_prefix is truthy)
      ADJ(expr)->field        -> expr->field   (always)

    Only the first member after the prefix is touched, so a chained access
    such as ``DAT_Foo.a.b`` becomes ``this->a.b``.

    Args:
        source: decompiled C/C++ text.
        dat_prefix: name of the global standing in for ``this`` (for example
            ``DAT_Foo``), or a falsy value to skip the first rewrite.

    Returns:
        The rewritten text.
    """
    text = source

    if dat_prefix:
        # The prefix is escaped so it is matched literally; the leading \b
        # keeps longer identifiers that merely end in the prefix untouched.
        member_access = r'\b' + re.escape(dat_prefix) + r'\.(\w+)'
        text = re.sub(member_access, r'this->\1', text)

    # ADJ(x)->field is only a different type view of the same pointer, so the
    # wrapper is dropped and the inner expression is dereferenced directly.
    return re.sub(r'\bADJ\(([^)]+)\)->(\w+)', r'\1->\2', text)
97+
98+
99+
def annotate_cdecl_globals(source):
    """
    Append a marker comment after every ``DAT_<name>::ptr`` reference.

    The marker tells the implementer that the reference is a cross-object
    global that must keep the ``::ptr`` accessor pattern. The reference
    itself is left unchanged.

    References that are already followed by the marker are skipped, so
    running the function on its own output changes nothing. ``::ptr`` must be
    a whole token; longer names such as ``::ptrdiff`` are not annotated.

    Args:
        source: decompiled C/C++ text.

    Returns:
        The annotated text.
    """
    marker = '/* cross-object global, keep ::ptr */'
    # The negative lookahead skips references that already carry the marker.
    pattern = re.compile(r'(DAT_\w+::ptr)\b(?!\s*' + re.escape(marker) + r')')
    # A callable replacement keeps the marker text free of regex-template
    # escaping rules.
    return pattern.sub(lambda match: match.group(1) + ' ' + marker, source)
109+
110+
111+
def strip_ghidra_signature_comments(source):
    """
    Drop the first ``// =====`` banner comment block from ``source``.

    Lines are kept as-is until one whose stripped text starts with
    ``// =====`` is seen. That line and every following line whose stripped
    text starts with ``//`` are discarded. The first line that is not such a
    comment ends the block; it and everything after it are kept unchanged.

    Args:
        source: decompiled C/C++ text.

    Returns:
        The remaining lines joined with ``\\n`` (a trailing newline of the
        input is not preserved).
    """
    kept = []
    remaining = iter(source.splitlines())

    for text in remaining:
        if not text.strip().startswith('// ====='):
            kept.append(text)
            continue

        # Banner found: swallow comment lines from the same iterator until
        # the first non-comment line, then keep the rest verbatim.
        for text in remaining:
            if not text.strip().startswith('//'):
                kept.append(text)
                kept.extend(remaining)
                break

    return '\n'.join(kept)
134+
135+
136+
def fix_thiscall_param(source):
    """
    Strip ``__thiscall`` and the explicit ``this`` parameter from ``source``.

    Ghidra emits:  void __thiscall Foo(FooType *this, int x)
    Result:        void Foo(int x)

    The rewrites are applied to the whole text, in this order:
      1. remove the ``__thiscall`` keyword and the whitespace after it;
      2. ``(SomeType *this)``        -> ``()``  (sole parameter);
      3. ``(SomeType *this, ...``    -> ``(...`` (first of several).

    Args:
        source: decompiled C/C++ text.

    Returns:
        The rewritten text.
    """
    rewrites = (
        (r'\b__thiscall\b\s*', ''),
        (r'\(\s*\w[\w\s\*]*\*\s*this\s*\)', '()'),
        (r'\(\s*\w[\w\s\*]*\*\s*this\s*,\s*', '('),
    )
    for pattern, replacement in rewrites:
        source = re.sub(pattern, replacement, source)
    return source
157+
158+
def is_thiscall_function(func):
    """
    Heuristically decide whether ``func`` is a __thiscall function.

    Args:
        func: Ghidra function object, or ``None``.

    Returns:
        ``False`` for ``None``. ``True`` when the calling convention name
        contains ``__thiscall`` or, failing that, when any parameter is named
        ``this``. ``False`` otherwise.
    """
    if func is None:
        return False

    convention = func.getCallingConventionName()
    if convention is not None and '__thiscall' in convention:
        return True

    # Fallback for functions whose convention is not set: look for a
    # parameter named 'this'.
    return any(param.getName() == 'this' for param in func.getParameters())
171+
172+
def process_dat_prefixes(decompiled_source):
    """
    Rewrite ``DAT_<name>.<field>`` member accesses to ``DAT_<name>::ptr-><field>``.

    Only the first member after the ``DAT_`` token is rewritten, so
    ``DAT_Foo.a.b`` becomes ``DAT_Foo::ptr->a.b``. The ``DAT_`` token must
    start at a word boundary; identifiers that merely contain ``DAT_`` (for
    example ``_DAT_Foo``) are left alone.

    Args:
        decompiled_source: decompiled C/C++ text.

    Returns:
        The rewritten text.
    """
    # A single regex pass honours the word boundary at every occurrence and
    # avoids rescanning the whole text once per match.
    return re.sub(r'\b(DAT_\w+)\.(\w+)', r'\1::ptr->\2', decompiled_source)
191+
192+
def find_dat_includes(decompiled_source):
    """
    Yield one ``#include "OpenSHC/<NAME>.hpp"`` line per referenced global.

    A global is any token made of three or more capital letters, an
    underscore and further word characters (``DAT_Foo``, ``ARR_Bar`` ...)
    that is immediately followed by ``.`` or ``[``. Each name is reported
    once, in order of first appearance.

    Args:
        decompiled_source: decompiled C/C++ text.

    Yields:
        The include directive for each distinct name.
    """
    names = re.findall(r'\b([A-Z]{3,}_\w+)[.\[]', decompiled_source)
    # dict.fromkeys drops repeats while keeping first-seen order, so a global
    # referenced many times still produces a single include line.
    for name in dict.fromkeys(names):
        yield '#include "OpenSHC/' + name + '.hpp"'
197+
198+
199+
def preprocess(func, source, toAddr):
    """
    Run the full preprocessing pipeline on one decompiled function.

    Steps:
      1. Strip the Ghidra banner comment block.
      2. For __thiscall functions, rewrite references to the object ECX
         points at into ``this->`` and remove ``__thiscall`` and the explicit
         ``this`` parameter from the signature.
      3. Collect an include line for every global still referenced.
      4. Rewrite the remaining ``DAT_X.field`` accesses to
         ``DAT_X::ptr->field``.
      5. Prepend the include lines (globals first, then the header derived
         from the function's parent namespace), one per line, followed by a
         blank line.
      6. Replace ``_HoldStrong`` with ``OpenSHC`` in the whole result.

    Args:
        func: Ghidra function object the source was decompiled from.
        source: decompiled C/C++ text of that function.
        toAddr: callable turning an integer into a Ghidra address; passed on
            to ``get_thiscall_dat_prefix``.

    Returns:
        The preprocessed source text.
    """
    body = strip_ghidra_signature_comments(source)

    if is_thiscall_function(func):
        dat_prefix = get_thiscall_dat_prefix(func, toAddr)
        if dat_prefix:
            body = replace_this_references(body, dat_prefix)
        body = fix_thiscall_param(body)

    # NOTE: annotate_cdecl_globals() is intentionally not applied here; the
    # original pipeline had that step disabled.

    # Collect the includes while the DAT_X.field form is still present, and
    # rewrite the accesses before the include lines are prepended. Otherwise
    # "DAT_X.hpp" in an include path would itself be rewritten to
    # "DAT_X::ptr->hpp".
    include_lines = list(dict.fromkeys(find_dat_includes(body)))
    body = process_dat_prefixes(body)

    namespace_path = str(func.getParentNamespace().getName(True))
    include_lines.append('#include "' + namespace_path.replace("::", "/") + '.hpp"')

    # Every include goes on its own line; a blank line separates them from
    # the function body.
    result = "\n".join(include_lines) + "\n\n" + body
    return result.replace("_HoldStrong", "OpenSHC")
230+
231+

tools/mcp/ghidra_scripts/functionexporter.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from ghidra.util.task import ConsoleTaskMonitor
2929
from ghidra.program.model.symbol import SymbolType
3030

31+
from functionexport_preprocessor import preprocess
32+
3133

3234

3335
TARGET_NAMESPACE = "_HoldStrong" # Root namespace to search (case-sensitive)
@@ -103,6 +105,14 @@ def init_decompiler():
103105
"""Initialise and open a DecompInterface for the current program."""
104106
iface = DecompInterface()
105107
opts = DecompileOptions()
108+
opts.setNoCastPrint(False)
109+
opts.setEliminateUnreachable(False)
110+
opts.setPLATECommentIncluded(True)
111+
opts.setPOSTCommentIncluded(True)
112+
opts.setPRECommentIncluded(True)
113+
opts.setEOLCommentIncluded(True)
114+
opts.setHeadCommentIncluded(True)
115+
106116
iface.setOptions(opts)
107117
iface.openProgram(currentProgram)
108118
return iface
@@ -222,7 +232,7 @@ def run():
222232
else:
223233
if not code:
224234
raise Exception("impossible situation")
225-
fh.write(code)
235+
fh.write(preprocess(func, code, toAddr))
226236
fh.write("\n")
227237

228238
written.append(out_path)

0 commit comments

Comments
 (0)