|
| 1 | +import sys |
| 2 | +from elftools.elf.elffile import ELFFile |
| 3 | +import re |
| 4 | +import json |
| 5 | + |
# Number of instructions inspected before each ECALL when searching for the
# instruction that loads the syscall number into a7.
SYSCALL_INSTRUCTIONS_LOOKBACK = 5
| 8 | + |
def extract_text_section_instructions(elf_path):
    """
    Extract the instruction words from the .text section of a RISC-V ELF binary.

    The section is split into consecutive little-endian 32-bit words; trailing
    bytes that do not fill a whole word are ignored. On any failure (missing
    file, unreadable ELF, non-RISC-V machine, no .text section) an error is
    printed and the process exits with status 1.

    Args:
    - elf_path (str): Path to the ELF binary file.

    Returns:
    - List of the 32-bit words of the .text section, each as an unsigned int.
    """
    try:
        with open(elf_path, 'rb') as f:
            elffile = ELFFile(f)

            # Check if the ELF is for RISC-V architecture (EM_RISCV = 243)
            if elffile['e_machine'] != 'EM_RISCV':
                print(f"Error: ELF is not for RISC-V (detected: {elffile['e_machine']})")
                sys.exit(1)

            # Get the .text section
            text_section = elffile.get_section_by_name('.text')
            if text_section is None:
                print(f"Error: Could not find the .text section in {elf_path}")
                sys.exit(1)

            # Read the raw bytes while the file is still open
            text_data = text_section.data()

        # NOTE(review): every word is treated as one 32-bit instruction;
        # 16-bit compressed encodings are not split out here.
        word_count = len(text_data) // 4
        return [
            int.from_bytes(text_data[4 * i:4 * i + 4], byteorder='little')
            for i in range(word_count)
        ]

    except FileNotFoundError:
        print(f"Error: File '{elf_path}' not found.")
        sys.exit(1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the exits above pass through.
        print(f"Error: Unable to read the ELF file. Reason: {e}")
        sys.exit(1)
| 54 | + |
def parse_rd(instr):
    """Return the destination register index (bits 11:7) of an instruction word."""
    rd_mask = 0x1F << 7
    return (instr & rd_mask) >> 7
| 57 | + |
def parse_imm_i(instr):
    """Return the I-type immediate (bits 31:20) as an unsigned 12-bit value."""
    imm_mask = 0xFFF << 20
    return (instr & imm_mask) >> 20
| 60 | + |
def parse_imm_u(instr):
    """Return the U-type immediate (bits 31:12) left in place, low 12 bits cleared."""
    upper_20 = (instr >> 12) & 0xFFFFF
    return upper_20 << 12
| 63 | + |
def parse_rs1(instr):
    """Return the first source register index (bits 19:15) of an instruction word."""
    rs1_mask = 0x1F << 15
    return (instr & rs1_mask) >> 15
| 66 | + |
def parse_funct3(instr):
    """Return the funct3 field (bits 14:12) of an instruction word."""
    funct3_mask = 0x7 << 12
    return (instr & funct3_mask) >> 12
| 69 | + |
def parse_funct7(instr):
    """Return the funct7 field (bits 31:25) of an instruction word."""
    # Mask to 7 bits like the other field extractors, so bits above bit 31
    # can never leak into the result.
    return (instr >> 25) & 0x7F
| 72 | + |
def parse_funct12(instr):
    """Return the funct12 field (bits 31:20) of an instruction word."""
    twelve_bits = (1 << 12) - 1
    shifted = instr >> 20
    return shifted & twelve_bits
| 75 | + |
def parse_opcode(instr):
    """Return the opcode field (bits 6:0) of an instruction word."""
    seven_bits = (1 << 7) - 1
    return instr & seven_bits
| 78 | + |
def instruction_name(instruction, supported):
    """
    Look up the name of an instruction word in the supported-instruction table.

    The opcode, funct3, funct7 and funct12 fields are formatted as upper-case
    hex strings (2, 2, 2 and 4 digits) and used as keys into the nested
    structure under supported['opcodes'].

    Args:
    - instruction (int): The instruction word to identify.
    - supported (dict): Parsed JSON description; supported['opcodes'] is a list
      of single-opcode dicts whose values are either a name or a nested table
      keyed by 'funct3' / 'funct7' / 'funct12'.

    Returns:
    - The instruction name (str), or "UNKNOWN" when no entry matches.
    """
    opcode_hex = f"{parse_opcode(instruction):02X}"
    funct3_hex = f"{parse_funct3(instruction):02X}"
    funct7_hex = f"{parse_funct7(instruction):02X}"
    funct12_hex = f"{parse_funct12(instruction):04X}"

    for opcode_entry in supported['opcodes']:
        if opcode_hex not in opcode_entry:
            continue
        opcode_data = opcode_entry[opcode_hex]

        # Direct instruction like LUI, JAL, etc.
        if isinstance(opcode_data, str):
            return opcode_data

        # funct3-based instructions
        if 'funct3' in opcode_data:
            for funct3_entry in opcode_data['funct3']:
                if funct3_hex not in funct3_entry:
                    continue
                funct3_data = funct3_entry[funct3_hex]

                if isinstance(funct3_data, str):
                    return funct3_data

                # Nested funct12 table (for ECALL, EBREAK, etc.)
                if 'funct12' in funct3_data:
                    for funct12_entry in funct3_data['funct12']:
                        if funct12_hex in funct12_entry:
                            return funct12_entry[funct12_hex]

                # The funct3 matched but nothing below it resolved to a name.
                # Report it as unknown rather than handing the nested table
                # back to the caller, which expects a string.
                return "UNKNOWN"

        # funct7-based instructions; an entry keyed 'default' applies when
        # the entry has no key for this funct7 value.
        if 'funct7' in opcode_data:
            for funct7_entry in opcode_data['funct7']:
                if funct7_hex in funct7_entry:
                    funct7_data = funct7_entry[funct7_hex]
                elif 'default' in funct7_entry:
                    funct7_data = funct7_entry['default']
                else:
                    continue

                if 'funct3' in funct7_data:
                    for funct3_entry in funct7_data['funct3']:
                        if funct3_hex in funct3_entry:
                            return funct3_entry[funct3_hex]

    return "UNKNOWN"
| 130 | + |
def parse_instructions(instructions, json_path):
    """
    Count how often each instruction name occurs in a list of instruction words.

    ECALL instructions are refined through parse_syscall, so they are counted
    under the name it returns. Prints an error and exits with status 1 if a
    value is not below 2**32 - 1.

    Args:
    - instructions (list of int): Instruction words, in program order.
    - json_path (str): Path to the JSON file describing supported
      instructions and syscalls.

    Returns:
    - Tuple (counts, unknown_instructions, unknown_syscalls):
      counts maps each resolved name to its number of occurrences,
      unknown_instructions maps each unrecognised instruction word to its
      number of occurrences, and unknown_syscalls maps each unknown-syscall
      label to its number of occurrences.
    """
    counts = {}
    unknown_syscalls = {}
    unknown_instructions = {}
    supported, syscall_map = dict_from_json(json_path)

    u32max = (2**32) - 1
    for index, instruction in enumerate(instructions):
        # NOTE(review): the bound is strict, so the all-ones word 0xFFFFFFFF
        # is rejected as well - confirm this is intended.
        if instruction >= u32max:
            print(f"Error: Unexpected instruction: {instruction}.")
            sys.exit(1)

        ins_name = instruction_name(instruction, supported)
        if ins_name == "ECALL":
            ins_name = parse_syscall(instructions, index, syscall_map)
            if "UNKNOWN" in ins_name:
                unknown_syscalls[ins_name] = unknown_syscalls.get(ins_name, 0) + 1
        if ins_name == "UNKNOWN":
            unknown_instructions[instruction] = unknown_instructions.get(instruction, 0) + 1
        counts[ins_name] = counts.get(ins_name, 0) + 1

    return counts, unknown_instructions, unknown_syscalls
| 152 | + |
def find_a7_value(instructions, index):
    """
    Find the immediate most recently loaded into a7 (x17) before an ECALL.

    Only the SYSCALL_INSTRUCTIONS_LOOKBACK instructions preceding `index` are
    inspected, nearest first, so that the write closest to the ECALL wins.

    Args:
    - instructions (list of int): Instruction words, in program order.
    - index (int): Position of the ECALL in `instructions`.

    Returns:
    - The immediate written to a7 (the 12-bit immediate of an opcode-0x13
      instruction, or the upper 20 bits of a LUI), or None if no such
      instruction is found in the window.
    """
    window_start = max(0, index - SYSCALL_INSTRUCTIONS_LOOKBACK)

    # Walk backwards: an older write to a7 inside the window may belong to a
    # previous syscall and must not shadow the one nearest the ECALL.
    for i in range(index - 1, window_start - 1, -1):
        instr = instructions[i]
        if parse_rd(instr) != 17:  # a7 = x17
            continue

        opcode = parse_opcode(instr)
        if opcode == 0x13:  # ADDI, including LI (ADDI x17, x0, imm)
            # NOTE(review): rs1 and funct3 are not checked, so any opcode-0x13
            # instruction targeting a7 is treated as a plain immediate load.
            return parse_imm_i(instr)
        if opcode == 0x37:  # LUI
            return parse_imm_u(instr) >> 12
    return None
| 169 | + return None |
| 170 | + |
def parse_syscall(instructions, index, syscall_map):
    """
    Build the label for the ECALL located at `index`.

    Args:
    - instructions (list of int): Instruction words, in program order.
    - index (int): Position of the ECALL in `instructions`.
    - syscall_map (dict): Maps the a7 value, formatted as upper-case hex with
      at least two digits, to a syscall name.

    Returns:
    - "ECALL.<name>" when the a7 value is found and present in syscall_map,
      otherwise a string starting with "UNKNOWN_SYSCALL" that reports the
      a7 value (or UNKNOWN when it could not be determined).
    """
    a7 = find_a7_value(instructions, index)
    if a7 is None:
        return "UNKNOWN_SYSCALL (a7 = UNKNOWN)"

    syscall_name = syscall_map.get(f"{a7:02X}")
    if syscall_name is None:
        return f"UNKNOWN_SYSCALL (a7 = 0x{a7:X})"
    return f"ECALL.{syscall_name}"
| 179 | + |
| 180 | + |
def dict_from_json(json_path):
    """
    Load the JSON description file and build the syscall lookup table.

    Prints an error and exits with status 1 if the file cannot be read or
    does not have the expected structure.

    Args:
    - json_path (str): Path to the JSON file.

    Returns:
    - Tuple (data, syscalls): the parsed JSON document, and a dict built from
      the first key/value pair of every entry in its optional 'syscalls' list.
    """
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
        # Each 'syscalls' entry contributes its first key/value pair.
        syscalls = dict(
            list(entry.items())[0] for entry in data.get('syscalls', [])
        )
        return data, syscalls
    except Exception as e:
        print(f"Error: Unable to read the JSON file. Reason: {e}")
        sys.exit(1)
| 190 | + |
# Command-line entry point: report unknown syscalls and instructions found in
# the ELF file, exiting with status 1 if any instruction is unknown.
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python3 parse_riscv_elf.py <path_to_elf_file> <path_to_json_file>")
        sys.exit(1)

    elf_path = sys.argv[1]
    json_path = sys.argv[2]
    instructions = extract_text_section_instructions(elf_path)

    instruction_counts, unknown_instr, unknown_syscalls = parse_instructions(instructions, json_path)

    # SYSCALL results
    for label, count in unknown_syscalls.items():
        print(f"There were {count} {label}.")

    nb_unknown = instruction_counts.get("UNKNOWN", 0)
    if nb_unknown != 0:
        print(f"There were {nb_unknown} unknown instructions.\n")
        for instru, count in sorted(unknown_instr.items()):
            print(f"Unknown instruction: {instru:08X}: {count} times")
        sys.exit(1)
    else:
        print("All instructions known.")
0 commit comments