Detect calls using absolute indirect addressing (#784)

* Detect calls using absolute indirect addressing

* Ignore imports we can't match
This commit is contained in:
MS 2024-04-08 03:40:05 -04:00 committed by GitHub
parent 70912d16c6
commit 512eb786ab
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 138 additions and 36 deletions

View file

@ -7,10 +7,10 @@
placeholder string."""
import re
import struct
from functools import cache
from typing import Callable, List, Optional, Tuple
from collections import namedtuple
from isledecomp.bin import InvalidVirtualAddressError
from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS
from .instgen import InstructGen, SectionType
@ -35,16 +35,33 @@ def from_hex(string: str) -> Optional[int]:
return None
def bytes_to_float(b: bytes) -> Optional[float]:
if len(b) == 4:
return struct.unpack("<f", b)[0]
if len(b) == 8:
return struct.unpack("<d", b)[0]
return None
def bytes_to_dword(b: bytes) -> Optional[int]:
if len(b) == 4:
return struct.unpack("<L", b)[0]
return None
class ParseAsm:
def __init__(
self,
relocate_lookup: Optional[Callable[[int], bool]] = None,
name_lookup: Optional[Callable[[int], str]] = None,
float_lookup: Optional[Callable[[int, int], Optional[str]]] = None,
bin_lookup: Optional[Callable[[int, int], Optional[bytes]]] = None,
) -> None:
self.relocate_lookup = relocate_lookup
self.name_lookup = name_lookup
self.float_lookup = float_lookup
self.bin_lookup = bin_lookup
self.replacements = {}
self.number_placeholders = True
@ -58,14 +75,14 @@ def is_relocated(self, addr: int) -> bool:
return False
def float_replace(self, addr: int, data_size: int) -> Optional[str]:
if callable(self.float_lookup):
try:
float_str = self.float_lookup(addr, data_size)
except InvalidVirtualAddressError:
# probably caused by reading an invalid instruction
if callable(self.bin_lookup):
float_bytes = self.bin_lookup(addr, data_size)
if float_bytes is None:
return None
if float_str is not None:
return f"{float_str} (FLOAT)"
float_value = bytes_to_float(float_bytes)
if float_value is not None:
return f"{float_value} (FLOAT)"
return None
@ -122,6 +139,30 @@ def hex_replace_annotated(self, match: re.Match) -> str:
return match.group(0)
def hex_replace_indirect(self, match: re.Match) -> str:
"""Edge case for hex_replace_always. The context of the instruction
tells us that the pointer value is an absolute indirect.
So we go to that location in the binary to get the address.
If we cannot identify the indirect address, fall back to a lookup
on the original pointer value so we might display something useful."""
value = int(match.group(1), 16)
indirect_value = None
if callable(self.bin_lookup):
indirect_value = self.bin_lookup(value, 4)
if indirect_value is not None:
indirect_addr = bytes_to_dword(indirect_value)
if (
indirect_addr is not None
and self.lookup(indirect_addr, use_cache=False) is not None
):
return match.group(0).replace(
match.group(1), "->" + self.replace(indirect_addr)
)
return match.group(0).replace(match.group(1), self.replace(value))
def hex_replace_float(self, match: re.Match) -> str:
"""Special case for replacements on float instructions.
If the pointer is a float constant, read it from the binary."""
@ -178,7 +219,10 @@ def sanitize(self, inst: DisasmLiteInst) -> Tuple[str, str]:
jump_displacement = op_str_address - (inst.address + inst.size)
return (inst.mnemonic, hex(jump_displacement))
if inst.mnemonic.startswith("f"):
if inst.mnemonic == "call":
# Special handling for absolute indirect CALL.
op_str = ptr_replace_regex.sub(self.hex_replace_indirect, inst.op_str)
elif inst.mnemonic.startswith("f"):
# If floating point instruction
op_str = ptr_replace_regex.sub(self.hex_replace_float, inst.op_str)
else:

View file

@ -4,7 +4,7 @@
import struct
from dataclasses import dataclass
from typing import Callable, Iterable, List, Optional
from isledecomp.bin import Bin as IsleBin
from isledecomp.bin import Bin as IsleBin, InvalidVirtualAddressError
from isledecomp.cvdump.demangler import demangle_string_const
from isledecomp.cvdump import Cvdump, CvdumpAnalysis
from isledecomp.parser import DecompCodebase
@ -50,20 +50,13 @@ def lookup(addr: int) -> bool:
return lookup
def create_float_lookup(bin_file: IsleBin) -> Callable[[int, int], Optional[str]]:
"""Function generator for floating point lookup"""
def create_bin_lookup(bin_file: IsleBin) -> Callable[[int, int], Optional[str]]:
"""Function generator for reading from the bin file"""
def lookup(addr: int, size: int) -> Optional[str]:
data = bin_file.read(addr, size)
# If this is a float constant, it should be initialized data.
if data is None:
return None
struct_str = "<f" if size == 4 else "<d"
def lookup(addr: int, size: int) -> Optional[bytes]:
try:
(float_value,) = struct.unpack(struct_str, data)
return str(float_value)
except struct.error:
return bin_file.read(addr, size)
except InvalidVirtualAddressError:
return None
return lookup
@ -273,8 +266,37 @@ def _match_imports(self):
# the connection between the thunk functions.
# We already have the symbol name we need from the PDB.
for orig, recomp in orig_to_recomp.items():
if orig is None or recomp is None:
continue
# Match the __imp__ symbol
self._db.set_pair(orig, recomp, SymbolType.POINTER)
# Read the relative address from .idata
try:
(recomp_rva,) = struct.unpack("<L", self.recomp_bin.read(recomp, 4))
(orig_rva,) = struct.unpack("<L", self.orig_bin.read(orig, 4))
except ValueError:
# Bail out if there's a problem with struct.unpack
continue
# Strictly speaking, this is a hack to support asm sanitize.
# When calling an import, we will recognize that the address for the
# CALL instruction is a pointer to the actual address, but this is
# not only not the address of a function, it is not an address at all.
# To make the asm display work correctly (i.e. to match what you see
# in ghidra) create a function match on the RVA. This is not a valid
# virtual address because it is before the imagebase, but it will
# do what we need it to do in the sanitize function.
(dll_name, func_name) = orig_byaddr[orig]
fullname = dll_name + ":" + func_name
self._db.set_recomp_symbol(
recomp_rva, SymbolType.FUNCTION, fullname, None, 4
)
self._db.set_pair(orig_rva, recomp_rva, SymbolType.FUNCTION)
self._db.skip_compare(orig_rva)
def _match_thunks(self):
"""Thunks are (by nature) matched by indirection. If a thunk from orig
points at a function we have already matched, we can find the matching
@ -444,18 +466,18 @@ def recomp_lookup(addr: int) -> Optional[str]:
orig_should_replace = create_reloc_lookup(self.orig_bin)
recomp_should_replace = create_reloc_lookup(self.recomp_bin)
orig_float = create_float_lookup(self.orig_bin)
recomp_float = create_float_lookup(self.recomp_bin)
orig_bin_lookup = create_bin_lookup(self.orig_bin)
recomp_bin_lookup = create_bin_lookup(self.recomp_bin)
orig_parse = ParseAsm(
relocate_lookup=orig_should_replace,
name_lookup=orig_lookup,
float_lookup=orig_float,
bin_lookup=orig_bin_lookup,
)
recomp_parse = ParseAsm(
relocate_lookup=recomp_should_replace,
name_lookup=recomp_lookup,
float_lookup=recomp_float,
bin_lookup=recomp_bin_lookup,
)
orig_combined = orig_parse.parse_asm(orig_raw, match.orig_addr)

View file

@ -198,13 +198,14 @@ def test_float_replacement():
The float constants don't appear to be deduplicated (like strings are)
because there is another 0.5 at 0x100d40b0."""
def substitute_float(addr: int, _: int) -> str:
return "zero-point-five" if addr == 0x1234 else None
def bin_lookup(addr: int, _: int) -> Optional[bytes]:
return b"\xdb\x0f\x49\x40" if addr == 0x1234 else None
p = ParseAsm(float_lookup=substitute_float)
p = ParseAsm(bin_lookup=bin_lookup)
inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]")
(_, op_str) = p.sanitize(inst)
assert op_str == "dword ptr [zero-point-five (FLOAT)]"
# Single-precision float. struct.unpack("<f", struct.pack("<f", math.pi))
assert op_str == "dword ptr [3.1415927410125732 (FLOAT)]"
def test_float_variable():
@ -214,10 +215,7 @@ def test_float_variable():
def name_lookup(addr: int) -> Optional[str]:
return "g_myFloatVariable" if addr == 0x1234 else None
def substitute_float(_: int, __: int) -> str:
return ""
p = ParseAsm(name_lookup=name_lookup, float_lookup=substitute_float)
p = ParseAsm(name_lookup=name_lookup)
inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]")
(_, op_str) = p.sanitize(inst)
assert op_str == "dword ptr [g_myFloatVariable]"
@ -256,3 +254,41 @@ def name_lookup(addr: int) -> Optional[str]:
# Should replace here
(_, op_str) = p.sanitize(mock_inst("cmp", "eax, 0x5555"))
assert op_str == "eax, hello"
def test_absolute_indirect():
"""The instruction `call dword ptr [0x1234]` means we call the function
whose address is at 0x1234. (i.e. absolute indirect addressing mode)
It is probably more useful to show the name of the function itself if
we have it, but there are some circumstances where we want to replace
with the pointer's name (i.e. an import function)."""
def name_lookup(addr: int) -> Optional[str]:
return {
0x1234: "Hello",
0x4321: "xyz",
0x5555: "Test",
}.get(addr)
def bin_lookup(addr: int, _: int) -> Optional[bytes]:
return (
{
0x1234: b"\x55\x55\x00\x00",
0x4321: b"\x99\x99\x00\x00",
}
).get(addr)
p = ParseAsm(name_lookup=name_lookup, bin_lookup=bin_lookup)
# If we know the indirect address (0x5555)
# Arrow to indicate this is an indirect replacement
(_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x1234]"))
assert op_str == "dword ptr [->Test]"
# If we do not know the indirect address (0x9999)
(_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x4321]"))
assert op_str == "dword ptr [xyz]"
# If we can't read the indirect address
(_, op_str) = p.sanitize(mock_inst("call", "dword ptr [0x5555]"))
assert op_str == "dword ptr [Test]"