|
| 1 | +#TODO write a description for this script |
| 2 | +#@author |
| 3 | +#@category _OPENSHC.TOOLS |
| 4 | +#@keybinding |
| 5 | +#@menupath |
| 6 | +#@toolbar |
| 7 | +#@runtime PyGhidra |
| 8 | + |
| 9 | + |
| 10 | +#TODO Add User Code Here |
| 11 | + |
| 12 | +# preprocess_thiscall_dat.py |
| 13 | +# |
| 14 | +# Ghidra Python script to preprocess decompiled __thiscall functions, |
| 15 | +# replacing DAT_<StructName>.<field> with this-><field>. |
| 16 | +# |
| 17 | +# The rule: Ghidra emits DAT_FooObject.field for __thiscall methods because |
| 18 | +# it sees the global-like address that ECX points to. In the reimplementation, |
| 19 | +# these become this->field since ECX *is* the this pointer. |
| 20 | +# |
| 21 | +# Usage: |
| 22 | +# Run from Ghidra Script Manager, or via analyzeHeadless: |
| 23 | +# analyzeHeadless <project> <program> -postScript preprocess_thiscall_dat.py |
| 24 | +# |
| 25 | +# Output: |
| 26 | +# Prints preprocessed source to the Ghidra console, and optionally writes |
| 27 | +# it to a file (set OUTPUT_DIR below). |
| 28 | +# |
| 29 | + |
| 30 | + |
| 31 | +import re |
| 32 | +import os |
| 33 | + |
| 34 | +# ---------------------------------------------------------------------------- |
| 35 | +# Configuration |
| 36 | +# ---------------------------------------------------------------------------- |
| 37 | + |
# Directory that receives the generated .cpp files, or None to print only.
# NOTE(review): nothing in the visible part of this script reads OUTPUT_DIR —
# confirm it is consumed elsewhere.
OUTPUT_DIR = None # e.g. "C:/Users/Gynt/Projects/sourcehold/openshc/src_preprocessed"

# If True, DAT_<X>::ptr references (cross-object struct globals) are meant to
# be tagged with an informational comment by annotate_cdecl_globals(). The
# references themselves are never rewritten: they are __cdecl-style globals
# and must keep the DAT_X::ptr-> form.
# NOTE(review): the only use of this flag in preprocess() is commented out,
# so it currently has no effect.
ANNOTATE_CDECL_GLOBALS = True
| 45 | + |
| 46 | + |
| 47 | +# ---------------------------------------------------------------------------- |
| 48 | +# Core transformation |
| 49 | +# ---------------------------------------------------------------------------- |
| 50 | + |
| 51 | +import java.math.BigInteger |
| 52 | + |
def get_thiscall_dat_prefix(func, toAddr):
    """
    Return the label of the global that ECX points to on entry to ``func``.

    The ECX register value recorded on the code unit at the function's entry
    point is converted to an address with ``toAddr``; the label of the code
    unit at that address (e.g. ``DAT_TextManagerObject``) is the prefix the
    decompiler uses for 'this' member accesses.

    Parameters:
        func:   Ghidra Function to inspect.
        toAddr: callable that turns an integer offset into an Address.

    Returns:
        The label string, or None when the language has no ECX register, no
        code unit exists at the entry point, ECX has no (fully) known value
        there, or no code unit starts at the address ECX holds.
    """
    program = func.getProgram()
    listing = program.getListing()

    # Direct lookup instead of scanning every register; yields None (rather
    # than an IndexError) on processors without an ECX register.
    ecx = program.getLanguage().getRegister("ECX")
    if ecx is None:
        return None

    entry_unit = listing.getCodeUnitAt(func.getEntryPoint())
    if entry_unit is None:
        return None

    ecx_value = entry_unit.getRegisterValue(ecx)
    if ecx_value is None:
        return None

    offset = ecx_value.getUnsignedValue()
    if offset is None:
        # Register value is not fully defined at the entry point.
        return None
    if isinstance(offset, java.math.BigInteger):
        # Go through the decimal string to obtain a plain Python int.
        offset = int(offset.toString(), 10)

    target_unit = listing.getCodeUnitAt(toAddr(offset))
    if target_unit is None:
        return None
    return target_unit.getLabel()
| 73 | + |
| 74 | + |
def replace_this_references(source, dat_prefix):
    """
    Rewrite accesses through the this-object global into this-> accesses.

    Transformations applied to ``source``:
        <dat_prefix>.field           ->  this->field
        <dat_prefix>.field.subfield  ->  this->field.subfield  (only the first
                                         accessor is rewritten)
        ADJ(expr)->field             ->  expr->field

    Parameters:
        source:     decompiled C source text.
        dat_prefix: name of the global standing in for 'this'
                    (e.g. "DAT_FooObject"); a falsy value skips the
                    member-access rewrite.

    Returns:
        The rewritten source text.
    """
    text = source

    if dat_prefix:
        # The prefix must start on a word boundary and be followed directly
        # by a dot accessor, so longer names sharing the prefix are untouched.
        member_access = re.compile(r'\b' + re.escape(dat_prefix) + r'\.(\w+)')
        text = member_access.sub(r'this->\1', text)

    # ADJ(expr)->field is a decompiler artefact: same address viewed through
    # another type, so the wrapper is dropped and the inner expression kept.
    adj_access = re.compile(r'\bADJ\(([^)]+)\)->(\w+)')
    return adj_access.sub(r'\1->\2', text)
| 97 | + |
| 98 | + |
def annotate_cdecl_globals(source):
    """
    Tag every DAT_<name>::ptr reference with an informational comment.

    The comment tells the implementer that the reference is a cross-object
    global which must keep the ::ptr accessor. The reference itself is left
    unchanged; only the comment is inserted after it.

    References that already carry the marker are skipped, so running the
    function again on its own output does not stack duplicate comments.

    Parameters:
        source: decompiled C source text.

    Returns:
        The source text with the markers inserted.
    """
    marker = ' /* cross-object global, keep ::ptr */'
    # Word boundaries keep the match to whole DAT_<name>::ptr tokens; the
    # negative lookahead skips references that are already annotated.
    pattern = re.compile(
        r'\b(DAT_\w+::ptr)\b(?!\s*' + re.escape(marker.strip()) + r')'
    )
    return pattern.sub(lambda match: match.group(1) + marker, source)
| 109 | + |
| 110 | + |
def strip_ghidra_signature_comments(source):
    """
    Drop the '// =====' delimited comment header from decompiled source.

    Lines are scanned in order. Until the header has ended, any line that
    starts with '// =====' (after stripping whitespace) is dropped and marks
    the header as open; while the header is open every '//' comment line is
    dropped too. The first non-comment line after that closes the header for
    good, and everything from there on is kept verbatim.

    Parameters:
        source: decompiled C source text.

    Returns:
        The remaining lines joined with '\\n' (no trailing newline).
    """
    # Scanner states: "before" the header, "inside" it, "after" it.
    state = "before"
    kept = []
    for raw_line in source.splitlines():
        if state != "after":
            text = raw_line.strip()
            if text.startswith('// ====='):
                state = "inside"
                continue
            if state == "inside":
                if text.startswith('//'):
                    continue  # ordinary comment line within the header
                state = "after"
        kept.append(raw_line)
    return '\n'.join(kept)
| 134 | + |
| 135 | + |
def fix_thiscall_param(source):
    """
    Strip the __thiscall keyword and the explicit 'this' pointer parameter.

    Ghidra emits:   void __thiscall Foo(FooType *this, int x)
    Result:         void Foo(int x)

    Three substitutions are applied, in order, across the whole text:
        1. the __thiscall keyword and the whitespace after it are removed;
        2. a parameter list consisting solely of '<type> *this' becomes '()';
        3. a leading '<type> *this,' is removed from a longer parameter list.

    Parameters:
        source: decompiled C source text.

    Returns:
        The rewritten source text.
    """
    substitutions = (
        # __thiscall is implicit for non-static member functions.
        (r'\b__thiscall\b\s*', ''),
        # (SomeType *this) as the only parameter.
        (r'\(\s*\w[\w\s\*]*\*\s*this\s*\)', '()'),
        # (SomeType *this, int x, ...) -> (int x, ...)
        (r'\(\s*\w[\w\s\*]*\*\s*this\s*,\s*', '('),
    )
    for regex, replacement in substitutions:
        source = re.sub(regex, replacement, source)
    return source
| 157 | + |
def is_thiscall_function(func):
    """
    Heuristically decide whether ``func`` is a __thiscall method.

    Returns True when the calling convention name contains '__thiscall' or,
    failing that, when any parameter is named 'this'. Returns False for None.
    """
    if func is None:
        return False

    convention = func.getCallingConventionName()
    if convention is not None and '__thiscall' in convention:
        return True

    # Fallback: an explicit 'this' parameter also marks a member function.
    return any(param.getName() == 'this' for param in func.getParameters())
| 171 | + |
def process_dat_prefixes(decompiled_source):
    """
    Rewrite DAT_<name>.<field> member accesses to DAT_<name>::ptr-><field>.

    Only the first accessor after the global is rewritten, so
    DAT_Foo.a.b becomes DAT_Foo::ptr->a.b. The DAT_ token must start on a
    word boundary; identifiers that merely contain 'DAT_' (for example
    MY_DAT_Foo.a) are left untouched.

    Parameters:
        decompiled_source: decompiled C source text.

    Returns:
        The rewritten source text.
    """
    # A single regex pass keeps the word-boundary guarantee for every
    # replacement and avoids rescanning the whole text once per match.
    pattern = re.compile(r'\b(DAT_\w+)\.(\w+)')
    return pattern.sub(r'\1::ptr->\2', decompiled_source)
| 191 | + |
def find_dat_includes(decompiled_source):
    """
    Yield one '#include "OpenSHC/<name>.hpp"' line per referenced global.

    A global is any identifier made of at least three uppercase letters, an
    underscore and further word characters (e.g. DAT_Foo) that is directly
    followed by '.' or '['. Each name is yielded once, in order of first
    appearance, so the generated header block has no duplicate includes.

    Parameters:
        decompiled_source: decompiled C source text.

    Yields:
        The include directive for each distinct global name.
    """
    pattern = re.compile(r'\b([A-Z]{3,}_\w+)[.\[]')
    emitted = set()
    for name in pattern.findall(decompiled_source):
        if name in emitted:
            continue
        emitted.add(name)
        yield '#include "OpenSHC/' + name + '.hpp"'
| 197 | + |
| 198 | + |
def preprocess(func, source, toAddr):
    """
    Full preprocessing pipeline for a single decompiled function.

    Steps:
        1. Strip the Ghidra '// =====' header comment block.
        2. For __thiscall functions: rewrite accesses through the global
           that ECX points to into this->, then drop the __thiscall keyword
           and the explicit 'this' parameter.
        3. Collect '#include "OpenSHC/<global>.hpp"' lines for the globals
           still referenced by the body.
        4. Rewrite the remaining DAT_<name>.<field> accesses to
           DAT_<name>::ptr-><field>.
        5. Prepend the includes (globals first, then the header derived from
           the function's parent namespace) and rename '_HoldStrong' to
           'OpenSHC' throughout.

    Parameters:
        func:   Ghidra Function the source was decompiled from.
        source: decompiled C source text of that function.
        toAddr: callable that turns an integer offset into an Address.

    Returns:
        The preprocessed source text, include lines first.
    """
    body = strip_ghidra_signature_comments(source)

    if is_thiscall_function(func):
        dat_prefix = get_thiscall_dat_prefix(func, toAddr)
        if dat_prefix:
            body = replace_this_references(body, dat_prefix)
        body = fix_thiscall_param(body)

    # NOTE: annotating cross-object globals (annotate_cdecl_globals, gated by
    # ANNOTATE_CDECL_GLOBALS) is currently disabled in this pipeline.

    # Collect the includes while globals still use dot access, keeping the
    # first occurrence of each so the header block has no duplicates.
    dat_includes = list(dict.fromkeys(find_dat_includes(body)))

    # Rewrite member accesses on the body only: running this over the
    # include lines would turn "DAT_Foo.hpp" into "DAT_Foo::ptr->hpp".
    body = process_dat_prefixes(body)

    namespace_path = func.getParentNamespace().getName(True).replace("::", "/") + ".hpp"
    include_lines = dat_includes + ['#include "' + namespace_path + '"']

    # One include per line, then a blank line before the function body.
    result = "\n".join(include_lines) + "\n\n" + body
    return result.replace("_HoldStrong", "OpenSHC")
| 230 | + |
| 231 | + |
0 commit comments