From 512eb786ab8249faefd79198a16bdfeac2ea6a29 Mon Sep 17 00:00:00 2001 From: MS Date: Mon, 8 Apr 2024 03:40:05 -0400 Subject: [PATCH] Detect calls using absolute indirect addressing (#784) * Detect calls using absolute indirect addressing * Ignore imports we can't match --- .../isledecomp/compare/asm/parse.py | 66 +++++++++++++++---- tools/isledecomp/isledecomp/compare/core.py | 56 +++++++++++----- tools/isledecomp/tests/test_sanitize.py | 52 ++++++++++++--- 3 files changed, 138 insertions(+), 36 deletions(-) diff --git a/tools/isledecomp/isledecomp/compare/asm/parse.py b/tools/isledecomp/isledecomp/compare/asm/parse.py index d69fe76b..93c28f03 100644 --- a/tools/isledecomp/isledecomp/compare/asm/parse.py +++ b/tools/isledecomp/isledecomp/compare/asm/parse.py @@ -7,10 +7,10 @@ placeholder string.""" import re +import struct from functools import cache from typing import Callable, List, Optional, Tuple from collections import namedtuple -from isledecomp.bin import InvalidVirtualAddressError from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS from .instgen import InstructGen, SectionType @@ -35,16 +35,33 @@ def from_hex(string: str) -> Optional[int]: return None +def bytes_to_float(b: bytes) -> Optional[float]: + if len(b) == 4: + return struct.unpack(" Optional[int]: + if len(b) == 4: + return struct.unpack(" None: self.relocate_lookup = relocate_lookup self.name_lookup = name_lookup - self.float_lookup = float_lookup + self.bin_lookup = bin_lookup self.replacements = {} self.number_placeholders = True @@ -58,14 +75,14 @@ def is_relocated(self, addr: int) -> bool: return False def float_replace(self, addr: int, data_size: int) -> Optional[str]: - if callable(self.float_lookup): - try: - float_str = self.float_lookup(addr, data_size) - except InvalidVirtualAddressError: - # probably caused by reading an invalid instruction + if callable(self.bin_lookup): + float_bytes = self.bin_lookup(addr, data_size) + if float_bytes is None: return None - if float_str is not None: - return f"{float_str} (FLOAT)" + + float_value = bytes_to_float(float_bytes) + if float_value is not None: + return f"{float_value} (FLOAT)" return None @@ -122,6 +139,30 @@ def hex_replace_annotated(self, match: re.Match) -> str: return match.group(0) + def hex_replace_indirect(self, match: re.Match) -> str: + """Edge case for hex_replace_always. The context of the instruction + tells us that the pointer value is an absolute indirect. + So we go to that location in the binary to get the address. + If we cannot identify the indirect address, fall back to a lookup + on the original pointer value so we might display something useful.""" + value = int(match.group(1), 16) + indirect_value = None + + if callable(self.bin_lookup): + indirect_value = self.bin_lookup(value, 4) + + if indirect_value is not None: + indirect_addr = bytes_to_dword(indirect_value) + if ( + indirect_addr is not None + and self.lookup(indirect_addr, use_cache=False) is not None + ): + return match.group(0).replace( + match.group(1), "->" + self.replace(indirect_addr) + ) + + return match.group(0).replace(match.group(1), self.replace(value)) + def hex_replace_float(self, match: re.Match) -> str: """Special case for replacements on float instructions. If the pointer is a float constant, read it from the binary.""" @@ -178,7 +219,10 @@ def sanitize(self, inst: DisasmLiteInst) -> Tuple[str, str]: jump_displacement = op_str_address - (inst.address + inst.size) return (inst.mnemonic, hex(jump_displacement)) - if inst.mnemonic.startswith("f"): + if inst.mnemonic == "call": + # Special handling for absolute indirect CALL. + op_str = ptr_replace_regex.sub(self.hex_replace_indirect, inst.op_str) + elif inst.mnemonic.startswith("f"): # If floating point instruction op_str = ptr_replace_regex.sub(self.hex_replace_float, inst.op_str) else: diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index bbd77307..d602af69 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -4,7 +4,7 @@ import struct from dataclasses import dataclass from typing import Callable, Iterable, List, Optional -from isledecomp.bin import Bin as IsleBin +from isledecomp.bin import Bin as IsleBin, InvalidVirtualAddressError from isledecomp.cvdump.demangler import demangle_string_const from isledecomp.cvdump import Cvdump, CvdumpAnalysis from isledecomp.parser import DecompCodebase @@ -50,20 +50,13 @@ def lookup(addr: int) -> bool: return lookup -def create_float_lookup(bin_file: IsleBin) -> Callable[[int, int], Optional[str]]: - """Function generator for floating point lookup""" +def create_bin_lookup(bin_file: IsleBin) -> Callable[[int, int], Optional[str]]: + """Function generator for reading from the bin file""" - def lookup(addr: int, size: int) -> Optional[str]: - data = bin_file.read(addr, size) - # If this is a float constant, it should be initialized data. - if data is None: - return None - - struct_str = " Optional[bytes]: try: - (float_value,) = struct.unpack(struct_str, data) - return str(float_value) - except struct.error: + return bin_file.read(addr, size) + except InvalidVirtualAddressError: return None return lookup @@ -273,8 +266,37 @@ def _match_imports(self): # the connection between the thunk functions. # We already have the symbol name we need from the PDB. for orig, recomp in orig_to_recomp.items(): + if orig is None or recomp is None: + continue + + # Match the __imp__ symbol self._db.set_pair(orig, recomp, SymbolType.POINTER) + # Read the relative address from .idata + try: + (recomp_rva,) = struct.unpack(" Optional[str]: orig_should_replace = create_reloc_lookup(self.orig_bin) recomp_should_replace = create_reloc_lookup(self.recomp_bin) - orig_float = create_float_lookup(self.orig_bin) - recomp_float = create_float_lookup(self.recomp_bin) + orig_bin_lookup = create_bin_lookup(self.orig_bin) + recomp_bin_lookup = create_bin_lookup(self.recomp_bin) orig_parse = ParseAsm( relocate_lookup=orig_should_replace, name_lookup=orig_lookup, - float_lookup=orig_float, + bin_lookup=orig_bin_lookup, ) recomp_parse = ParseAsm( relocate_lookup=recomp_should_replace, name_lookup=recomp_lookup, - float_lookup=recomp_float, + bin_lookup=recomp_bin_lookup, ) orig_combined = orig_parse.parse_asm(orig_raw, match.orig_addr) diff --git a/tools/isledecomp/tests/test_sanitize.py b/tools/isledecomp/tests/test_sanitize.py index d13fae56..57097bf9 100644 --- a/tools/isledecomp/tests/test_sanitize.py +++ b/tools/isledecomp/tests/test_sanitize.py @@ -198,13 +198,14 @@ def test_float_replacement(): The float constants don't appear to be deduplicated (like strings are) because there is another 0.5 at 0x100d40b0.""" - def substitute_float(addr: int, _: int) -> str: - return "zero-point-five" if addr == 0x1234 else None + def bin_lookup(addr: int, _: int) -> Optional[bytes]: + return b"\xdb\x0f\x49\x40" if addr == 0x1234 else None - p = ParseAsm(float_lookup=substitute_float) + p = ParseAsm(bin_lookup=bin_lookup) inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]") (_, op_str) = p.sanitize(inst) - assert op_str == "dword ptr [zero-point-five (FLOAT)]" + # Single-precision float. struct.unpack(" Optional[str]: return "g_myFloatVariable" if addr == 0x1234 else None - def substitute_float(_: int, __: int) -> str: - return "" - - p = ParseAsm(name_lookup=name_lookup, float_lookup=substitute_float) + p = ParseAsm(name_lookup=name_lookup) inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]") (_, op_str) = p.sanitize(inst) assert op_str == "dword ptr [g_myFloatVariable]" @@ -256,3 +254,41 @@ def name_lookup(addr: int) -> Optional[str]: # Should replace here (_, op_str) = p.sanitize(mock_inst("cmp", "eax, 0x5555")) assert op_str == "eax, hello" + + +def test_absolute_indirect(): + """The instruction `call dword ptr [0x1234]` means we call the function + whose address is at 0x1234. (i.e. absolute indirect addressing mode) + It is probably more useful to show the name of the function itself if + we have it, but there are some circumstances where we want to replace + with the pointer's name (i.e. an import function).""" + + def name_lookup(addr: int) -> Optional[str]: + return { + 0x1234: "Hello", + 0x4321: "xyz", + 0x5555: "Test", + }.get(addr) + + def bin_lookup(addr: int, _: int) -> Optional[bytes]: + return ( + { + 0x1234: b"\x55\x55\x00\x00", + 0x4321: b"\x99\x99\x00\x00", + } + ).get(addr) + + p = ParseAsm(name_lookup=name_lookup, bin_lookup=bin_lookup) + + # If we know the indirect address (0x5555) + # Arrow to indicate this is an indirect replacement + (_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x1234]")) + assert op_str == "dword ptr [->Test]" + + # If we do not know the indirect address (0x9999) + (_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x4321]")) + assert op_str == "dword ptr [xyz]" + + # If we can't read the indirect address + (_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x5555]")) + assert op_str == "dword ptr [Test]"