mirror of
https://github.com/isledecomp/isle-portable.git
synced 2024-11-26 17:36:12 -05:00
Identify and handle jump tables (#732)
This commit is contained in:
parent
1e3ca11886
commit
ededdf31c3
4 changed files with 502 additions and 27 deletions
235
tools/isledecomp/isledecomp/compare/asm/instgen.py
Normal file
235
tools/isledecomp/isledecomp/compare/asm/instgen.py
Normal file
|
@ -0,0 +1,235 @@
|
||||||
|
"""Pre-parser for x86 instructions. Will identify data/jump tables used with
|
||||||
|
switch statements and local jump/call destinations."""
|
||||||
|
import re
|
||||||
|
import bisect
|
||||||
|
import struct
|
||||||
|
from enum import Enum, auto
|
||||||
|
from collections import namedtuple
|
||||||
|
from typing import List, NamedTuple, Optional, Tuple, Union
|
||||||
|
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
|
||||||
|
from .const import JUMP_MNEMONICS
|
||||||
|
|
||||||
|
disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
|
||||||
|
|
||||||
|
# Named view over the 4-tuples produced by capstone's disasm_lite().
DisasmLiteInst = namedtuple(
    typename="DisasmLiteInst",
    field_names="address, size, mnemonic, op_str",
)

# Captures the hex displacement of an operand that ends in "+ 0x...]",
# e.g. "dword ptr [eax*4 + 0x100014d4]" yields "0x100014d4".
displacement_regex = re.compile(pattern=r".*\+ (0x[0-9a-f]+)\]")
|
||||||
|
|
||||||
|
|
||||||
|
class SectionType(Enum):
    """Kinds of content that can make up the byte range of a function."""

    # Disassembled instructions.
    CODE = auto()
    # Table of single bytes, read via a `mov` with a pointer displacement.
    DATA_TAB = auto()
    # Table of dword addresses, reached via a jump with a pointer displacement.
    ADDR_TAB = auto()
|
||||||
|
|
||||||
|
|
||||||
|
class FuncSection(NamedTuple):
    """One contiguous section of a function, tagged with its type."""

    # What the section holds (code or one of the table kinds).
    type: SectionType
    # Instructions for a CODE section; (address, value) pairs for the tables.
    contents: List[Union[DisasmLiteInst, Tuple[str, int]]]
|
||||||
|
|
||||||
|
|
||||||
|
class InstructGen:
    """Pre-parse the raw bytes of one function into typed sections.

    The blob is walked from `start`. Instructions are disassembled until an
    address that was seen as a jump table (ADDR_TAB) or switch data
    (DATA_TAB) operand is reached; the table is then read as plain data and
    the walk continues at the address where the table ends, provided that
    address was confirmed by an earlier jump, `mov` or table entry.
    The finished sections are appended to `sections` in the order they
    were read.
    """

    # pylint: disable=too-many-instance-attributes
    def __init__(self, blob: bytes, start: int) -> None:
        """Analyze `blob`, whose first byte is at address `start`."""
        self.blob = blob
        self.start = start
        self.end = len(blob) + start
        self.section_end: int = self.end
        # Every disassembly run made so far, kept so that a later CODE
        # section can reuse instructions that were already decoded.
        self.code_tracks: List[List[DisasmLiteInst]] = []

        # Todo: Could be refactored later
        self.cur_addr: int = 0
        self.cur_section_type: SectionType = SectionType.CODE
        self.section_start = start

        self.sections: List[FuncSection] = []

        # Maps an address inside the function to the SectionType expected
        # to begin there.
        self.confirmed_addrs = {}
        self.analysis()

    def _finish_section(self, type_: SectionType, stuff):
        """Append a completed section of the given type to `self.sections`."""
        self.sections.append(FuncSection(type_, stuff))

    def _insert_confirmed_addr(self, addr: int, type_: SectionType):
        """Remember that a section of `type_` begins at `addr`.

        Addresses outside of [start, end) are ignored. If the address lies
        ahead of the current position and its type differs from the section
        being read, the current section is shortened to end there.
        """
        # Ignore address outside the bounds of the function
        if not self.start <= addr < self.end:
            return

        self.confirmed_addrs[addr] = type_

        # This newly inserted address might signal the end of this section.
        # For example, a jump table at the end of the function means we should
        # stop reading instructions once we hit that address.
        # However, if there is a jump table in between code sections, we might
        # read a jump to an address back to the beginning of the function
        # (e.g. a loop that spans the entire function)
        # so ignore this address because we have already passed it.
        if type_ != self.cur_section_type and addr > self.cur_addr:
            self.section_end = min(self.section_end, addr)

    def _next_section(self, addr: int) -> Optional[SectionType]:
        """We have reached the start of a new section. Tell what kind of
        data we are looking at (code or other) and how much we should read.

        Updates `cur_section_type` and `section_end`. Returns None if `addr`
        is not a confirmed address, which ends the analysis."""

        # Assume the start of every function is code.
        if addr == self.start:
            self.section_end = self.end
            return SectionType.CODE

        # The start of a new section must be an address that we've seen.
        new_type = self.confirmed_addrs.get(addr)
        if new_type is None:
            return None

        self.cur_section_type = new_type

        # The confirmed addrs dict is sorted by insertion order
        # i.e. the order in which we read the addresses
        # So we have to sort and then find the next item
        # to see where this section should end.

        # If we are in a CODE section, ignore contiguous CODE addresses.
        # These are not the start of a new section.
        # However: if we are not in CODE, any upcoming address is a new section.
        # Do this so we can detect contiguous non-CODE sections.
        confirmed = [
            conf_addr
            for (conf_addr, conf_type) in sorted(self.confirmed_addrs.items())
            if self.cur_section_type != SectionType.CODE
            or conf_type != self.cur_section_type
        ]

        index = bisect.bisect_right(confirmed, addr)
        if index < len(confirmed):
            self.section_end = confirmed[index]
        else:
            self.section_end = self.end

        return new_type

    def _get_code_for(self, addr: int) -> List[DisasmLiteInst]:
        """Start disassembling at the given address."""
        # If we are reading a code block beyond the first, see if we already
        # have disassembled instructions beginning at the specified address.
        # For a CODE/ADDR/CODE function, we might get lucky and produce the
        # correct instruction after the jump table's junk instructions.
        for track in self.code_tracks:
            for i, inst in enumerate(track):
                if inst.address == addr:
                    return track[i:]

        # If we are here, we don't have the instructions.
        # Todo: Could try to be clever here and disassemble only
        # as much as we probably need (i.e. if a jump table is between CODE
        # blocks, there are probably only a few bad instructions after the
        # jump table is finished. We could disassemble up to the next verified
        # code address and stitch it together)

        blob_cropped = self.blob[addr - self.start :]
        instructions = [
            DisasmLiteInst(*inst)
            for inst in disassembler.disasm_lite(blob_cropped, addr)
        ]
        self.code_tracks.append(instructions)
        return instructions

    def _handle_jump(self, inst: DisasmLiteInst):
        """Record the destination of a jump instruction where possible."""
        # If this is a regular jump and its destination is within the
        # bounds of the binary data (i.e. presumed function size)
        # add it to our list of confirmed addresses.
        # startswith() instead of op_str[0] so that an empty operand string
        # is skipped rather than raising IndexError.
        if inst.op_str.startswith("0"):
            value = int(inst.op_str, 16)
            self._insert_confirmed_addr(value, SectionType.CODE)

        # If this is jumping into a table of addresses, save the destination
        elif (match := displacement_regex.match(inst.op_str)) is not None:
            value = int(match.group(1), 16)
            self._insert_confirmed_addr(value, SectionType.ADDR_TAB)

    def _read_code_section(self) -> bool:
        """Read instructions from `cur_addr` up to the end of the section.

        Returns False if nothing could be disassembled at `cur_addr`."""
        instructions = self._get_code_for(self.cur_addr)

        # If we didn't get any instructions back, something is wrong.
        # i.e. We can only read part of the full instruction that is up next.
        if not instructions:
            # Step past the byte we could not decode. The caller stops the
            # analysis when we return False.
            # Todo: Maybe we could just call it quits here
            self.cur_addr += 1
            return False

        consumed = 0
        for inst in instructions:
            # section_end is updated as we read instructions.
            # If we are into a jump/data table and would read
            # a junk instruction, stop here.
            if self.cur_addr >= self.section_end:
                break

            if inst.mnemonic in JUMP_MNEMONICS:
                self._handle_jump(inst)
                # Todo: log calls too (unwind section)
            elif inst.mnemonic == "mov":
                # Todo: maintain pairing of data/jump tables
                if (match := displacement_regex.match(inst.op_str)) is not None:
                    value = int(match.group(1), 16)
                    self._insert_confirmed_addr(value, SectionType.DATA_TAB)

            # Do this instead of copying instruction address.
            # If there is only one instruction, we would get stuck here.
            self.cur_addr += inst.size
            consumed += 1

        # The instructions are contiguous and in address order, and
        # section_end can only move to an address beyond the instruction
        # being read. The instructions below section_end are therefore
        # exactly the ones consumed above; no need to scan the whole track.
        self._finish_section(SectionType.CODE, instructions[:consumed])
        return True

    def _read_addr_table(self) -> None:
        """Read a table of little-endian dword addresses at `cur_addr`."""
        table_ofs = self.cur_addr - self.start
        # Clamp to multiple of 4 (dwords)
        read_size = ((self.section_end - self.cur_addr) // 4) * 4
        dwords = self.blob[table_ofs : table_ofs + read_size]

        jump_table: List[Tuple[int, int]] = []
        for (dest,) in struct.iter_unpack("<L", dwords):
            # section_end is updated as we read entries, same as for code.
            # A destination that lands inside the span we set out to read
            # means the table ends there; what follows is not table data.
            if self.cur_addr + 4 > self.section_end:
                break

            # Todo: the fact that these are jump table destinations
            # should factor into the label name.
            self._insert_confirmed_addr(dest, SectionType.CODE)
            jump_table.append((self.cur_addr, dest))

            # Advance as we go so that a destination pointing back into the
            # entries already read cannot shorten this section.
            self.cur_addr += 4

        self._finish_section(SectionType.ADDR_TAB, jump_table)
        self.cur_addr = self.section_end

    def _read_data_table(self) -> None:
        """Read every byte from `cur_addr` to `section_end` as table data."""
        # Todo: variable data size?
        read_size = self.section_end - self.cur_addr
        table_ofs = self.cur_addr - self.start
        bytes_ = self.blob[table_ofs : table_ofs + read_size]
        offsets = range(self.section_start, self.section_start + read_size)

        # Iterating over bytes yields the integer value of each byte.
        self._finish_section(SectionType.DATA_TAB, list(zip(offsets, bytes_)))
        self.cur_addr = self.section_end

    def analysis(self):
        """Walk the blob section by section and fill in `self.sections`."""
        self.cur_addr = self.start

        while (sect_type := self._next_section(self.cur_addr)) is not None:
            self.section_start = self.cur_addr

            if sect_type == SectionType.CODE:
                if not self._read_code_section():
                    break
            elif sect_type == SectionType.ADDR_TAB:
                self._read_addr_table()
            else:
                self._read_data_table()
|
|
@ -11,13 +11,13 @@
|
||||||
from typing import Callable, List, Optional, Tuple
|
from typing import Callable, List, Optional, Tuple
|
||||||
from collections import namedtuple
|
from collections import namedtuple
|
||||||
from isledecomp.bin import InvalidVirtualAddressError
|
from isledecomp.bin import InvalidVirtualAddressError
|
||||||
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
|
|
||||||
from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS
|
from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS
|
||||||
|
from .instgen import InstructGen, SectionType
|
||||||
disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
|
|
||||||
|
|
||||||
ptr_replace_regex = re.compile(r"\[(0x[0-9a-f]+)\]")
|
ptr_replace_regex = re.compile(r"\[(0x[0-9a-f]+)\]")
|
||||||
|
|
||||||
|
displace_replace_regex = re.compile(r"\+ (0x[0-9a-f]+)\]")
|
||||||
|
|
||||||
# For matching an immediate value on its own.
|
# For matching an immediate value on its own.
|
||||||
# Preceded by start-of-string (first operand) or comma-space (second operand)
|
# Preceded by start-of-string (first operand) or comma-space (second operand)
|
||||||
immediate_replace_regex = re.compile(r"(?:^|, )(0x[0-9a-f]+)")
|
immediate_replace_regex = re.compile(r"(?:^|, )(0x[0-9a-f]+)")
|
||||||
|
@ -172,34 +172,52 @@ def sanitize(self, inst: DisasmLiteInst) -> Tuple[str, str]:
|
||||||
else:
|
else:
|
||||||
op_str = ptr_replace_regex.sub(self.hex_replace_always, inst.op_str)
|
op_str = ptr_replace_regex.sub(self.hex_replace_always, inst.op_str)
|
||||||
|
|
||||||
|
# We only want relocated addresses for pointer displacement.
|
||||||
|
# i.e. ptr [register + something]
|
||||||
|
# Otherwise we would use a placeholder for every stack variable,
|
||||||
|
# vtable call, or this->member access.
|
||||||
|
op_str = displace_replace_regex.sub(self.hex_replace_relocated, op_str)
|
||||||
|
|
||||||
op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str)
|
op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str)
|
||||||
return (inst.mnemonic, op_str)
|
return (inst.mnemonic, op_str)
|
||||||
|
|
||||||
def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]:
|
def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]:
|
||||||
asm = []
|
asm = []
|
||||||
|
|
||||||
for raw_inst in disassembler.disasm_lite(data, start_addr):
|
ig = InstructGen(data, start_addr)
|
||||||
# Use heuristics to disregard some differences that aren't representative
|
|
||||||
# of the accuracy of a function (e.g. global offsets)
|
|
||||||
inst = DisasmLiteInst(*raw_inst)
|
|
||||||
|
|
||||||
# If there is no pointer or immediate value in the op_str,
|
for sect_type, sect_contents in ig.sections:
|
||||||
# there is nothing to sanitize.
|
if sect_type == SectionType.CODE:
|
||||||
# This leaves us with cases where a small immediate value or
|
for inst in sect_contents:
|
||||||
# small displacement (this.member or vtable calls) appears.
|
# Use heuristics to disregard some differences that aren't representative
|
||||||
# If we assume that instructions we want to sanitize need to be 5
|
# of the accuracy of a function (e.g. global offsets)
|
||||||
# bytes -- 1 for the opcode and 4 for the address -- exclude cases
|
|
||||||
# where the hex value could not be an address.
|
|
||||||
# The exception is jumps which are as small as 2 bytes
|
|
||||||
# but are still useful to sanitize.
|
|
||||||
if "0x" in inst.op_str and (
|
|
||||||
inst.mnemonic in JUMP_MNEMONICS or inst.size > 4
|
|
||||||
):
|
|
||||||
result = self.sanitize(inst)
|
|
||||||
else:
|
|
||||||
result = (inst.mnemonic, inst.op_str)
|
|
||||||
|
|
||||||
# mnemonic + " " + op_str
|
# If there is no pointer or immediate value in the op_str,
|
||||||
asm.append((hex(inst.address), " ".join(result)))
|
# there is nothing to sanitize.
|
||||||
|
# This leaves us with cases where a small immediate value or
|
||||||
|
# small displacement (this.member or vtable calls) appears.
|
||||||
|
# If we assume that instructions we want to sanitize need to be 5
|
||||||
|
# bytes -- 1 for the opcode and 4 for the address -- exclude cases
|
||||||
|
# where the hex value could not be an address.
|
||||||
|
# The exception is jumps which are as small as 2 bytes
|
||||||
|
# but are still useful to sanitize.
|
||||||
|
if "0x" in inst.op_str and (
|
||||||
|
inst.mnemonic in JUMP_MNEMONICS or inst.size > 4
|
||||||
|
):
|
||||||
|
result = self.sanitize(inst)
|
||||||
|
else:
|
||||||
|
result = (inst.mnemonic, inst.op_str)
|
||||||
|
|
||||||
|
# mnemonic + " " + op_str
|
||||||
|
asm.append((hex(inst.address), " ".join(result)))
|
||||||
|
elif sect_type == SectionType.ADDR_TAB:
|
||||||
|
asm.append(("", "Jump table:"))
|
||||||
|
for i, (ofs, _) in enumerate(sect_contents):
|
||||||
|
asm.append((hex(ofs), f"Jump_dest_{i}"))
|
||||||
|
|
||||||
|
elif sect_type == SectionType.DATA_TAB:
|
||||||
|
asm.append(("", "Data table:"))
|
||||||
|
for ofs, b in sect_contents:
|
||||||
|
asm.append((hex(ofs), hex(b)))
|
||||||
|
|
||||||
return asm
|
return asm
|
||||||
|
|
212
tools/isledecomp/tests/test_instgen.py
Normal file
212
tools/isledecomp/tests/test_instgen.py
Normal file
|
@ -0,0 +1,212 @@
|
||||||
|
from isledecomp.compare.asm.instgen import InstructGen, SectionType
|
||||||
|
|
||||||
|
|
||||||
|
def test_ret():
    """A function made of a single `ret` yields exactly one section."""
    sections = InstructGen(b"\xc3", 0).sections
    assert len(sections) == 1
|
||||||
|
|
||||||
|
|
||||||
|
SCORE_NOTIFY = (
|
||||||
|
b"\x53\x56\x57\x8b\xd9\x33\xff\x8b\x74\x24\x10\x56\xe8\xbf\xe1\x01"
|
||||||
|
b"\x00\x80\xbb\xf6\x00\x00\x00\x00\x0f\x84\x9c\x00\x00\x00\x8b\x4e"
|
||||||
|
b"\x04\x49\x83\xf9\x17\x0f\x87\x8f\x00\x00\x00\x33\xc0\x8a\x81\xec"
|
||||||
|
b"\x14\x00\x10\xff\x24\x85\xd4\x14\x00\x10\x8b\xcb\xbf\x01\x00\x00"
|
||||||
|
b"\x00\xe8\x7a\x05\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x56\x8b"
|
||||||
|
b"\xcb\xe8\xaa\x00\x00\x00\x8b\xf8\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00"
|
||||||
|
b"\x80\x7e\x18\x20\x75\x07\x8b\xcb\xe8\xc3\xfe\xff\xff\xbf\x01\x00"
|
||||||
|
b"\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x56\x8b\xcb\xe8\x3e\x02"
|
||||||
|
b"\x00\x00\x8b\xf8\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x6a\x09\xa1\x4c"
|
||||||
|
b"\x45\x0f\x10\x6a\x07\x50\xe8\x35\x45\x01\x00\x83\xc4\x0c\x8b\x83"
|
||||||
|
b"\xf8\x00\x00\x00\x85\xc0\x74\x0d\x50\xe8\xa2\x42\x01\x00\x8b\xc8"
|
||||||
|
b"\xe8\x9b\x9b\x03\x00\xbf\x01\x00\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2"
|
||||||
|
b"\x04\x00\x8b\xff\x4a\x14\x00\x10\x5e\x14\x00\x10\x70\x14\x00\x10"
|
||||||
|
b"\x8a\x14\x00\x10\x9c\x14\x00\x10\xca\x14\x00\x10\x00\x01\x05\x05"
|
||||||
|
b"\x05\x05\x02\x05\x05\x05\x05\x05\x05\x05\x05\x05\x03\x05\x05\x05"
|
||||||
|
b"\x05\x05\x05\x04\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_score_notify():
    """Score::Notify function from 0x10001410 in LEGO1.
    Good representative function for jump table (at 0x100014d4)
    and switch data (at 0x100014ec)."""
    sections = InstructGen(SCORE_NOTIFY, 0x10001410).sections

    # Did we get everything?
    assert len(sections) == 3
    assert [sect.type for sect in sections] == [
        SectionType.CODE,
        SectionType.ADDR_TAB,
        SectionType.DATA_TAB,
    ]
    code, jump_table, _ = sections

    # CODE section stopped at correct place?
    # n.b. 0x100014d2 is the dummy instruction `mov edi, edi`
    # Ghidra does more thorough analysis and ignores this.
    # The last real instruction should be at 0x100014cf. Not a big deal
    # to include this because it is not junk data.
    last_inst = code.contents[-1]
    assert last_inst.address == 0x100014D2

    # 6 switch addresses
    assert len(jump_table.contents) == 6

    # TODO: The data table at the end includes all of the 0xCC padding bytes.
|
||||||
|
|
||||||
|
|
||||||
|
SMACK_CASE = (
|
||||||
|
# LEGO1: 0x100cdc43 (modified so jump table points at +0x1016)
|
||||||
|
b"\x2e\xff\x24\x8d\x16\x10\x00\x00"
|
||||||
|
# LEGO1: 0x100cdb62 (instructions before and after jump table)
|
||||||
|
b"\x8b\xf8\xeb\x1a\x87\xdb\x87\xc9\x87\xdb\x87\xc9\x87\xdb\x50\xdc"
|
||||||
|
b"\x0c\x10\xd0\xe2\x0c\x10\xb0\xe8\x0c\x10\x50\xe9\x0c\x10\xa0\x10"
|
||||||
|
b"\x27\x10\x10\x3c\x11\x77\x17\x8a\xc8"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_smack_case():
    """Case where we have code / jump table / code.
    Need to properly separate code sections, eliminate junk instructions
    and continue disassembling at the proper address following the data."""
    sections = InstructGen(SMACK_CASE, 0x1000).sections
    assert len(sections) == 3

    before, _, after = sections
    assert before.type == SectionType.CODE
    assert after.type == SectionType.CODE

    # Make sure we captured the instruction immediately after
    first_inst = after.contents[0]
    assert first_inst.mnemonic == "mov"
|
||||||
|
|
||||||
|
|
||||||
|
# BETA10 0x1004c9cc
|
||||||
|
BETA_FUNC = (
|
||||||
|
b"\x55\x8b\xec\x83\xec\x08\x53\x56\x57\x89\x4d\xfc\x8b\x45\xfc\x33"
|
||||||
|
b"\xc9\x8a\x88\x19\x02\x00\x00\x89\x4d\xf8\xe9\x1e\x00\x00\x00\xe9"
|
||||||
|
b"\x41\x00\x00\x00\xe9\x3c\x00\x00\x00\xe9\x37\x00\x00\x00\xe9\x32"
|
||||||
|
b"\x00\x00\x00\xe9\x2d\x00\x00\x00\xe9\x28\x00\x00\x00\x83\x7d\xf8"
|
||||||
|
b"\x04\x0f\x87\x1e\x00\x00\x00\x8b\x45\xf8\xff\x24\x85\x1d\xca\x04"
|
||||||
|
b"\x10\xeb\xc9\x04\x10\xf0\xc9\x04\x10\xf5\xc9\x04\x10\xfa\xc9\x04"
|
||||||
|
b"\x10\xff\xc9\x04\x10\xb0\x01\xe9\x00\x00\x00\x00\x5f\x5e\x5b\xc9"
|
||||||
|
b"\xc2\x04\x00"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_beta_case():
    """Complete (and short) function with CODE / ADDR / CODE"""
    sections = InstructGen(BETA_FUNC, 0x1004C9CC).sections

    # The JMP into the jump table immediately precedes the jump table.
    # We have to detect this and switch sections correctly or we will only
    # get 1 section.
    assert len(sections) == 3

    before, _, after = sections
    assert before.type == SectionType.CODE
    assert after.type == SectionType.CODE

    # Make sure we captured the instruction immediately after
    first_inst = after.contents[0]
    assert first_inst.mnemonic == "mov"
|
||||||
|
|
||||||
|
|
||||||
|
# LEGO1 0x1000fb50
|
||||||
|
# TODO: The test data here is longer than it needs to be.
|
||||||
|
THUNK_TEST = (
|
||||||
|
b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
b"\x56\x8b\xf1\xe8\xd8\xc5\x00\x00\x8b\xce\xe8\xb1\xdc\x01\x00\xf6"
|
||||||
|
b"\x44\x24\x08\x01\x74\x0c\x8d\x46\xe0\x50\xe8\xe1\x66\x07\x00\x83"
|
||||||
|
b"\xc4\x04\x8d\x46\xe0\x5e\xc2\x04\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
b"\xb8\x7c\x05\x0f\x10\xc3\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
b"\x8b\x54"
|
||||||
|
# The problem is here: the last two bytes are the start of the next
|
||||||
|
# function 0x1000fbc0. This is not enough data to read an instruction.
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_thunk_case():
    """Adjuster thunk incorrectly annotated.
    We are reading way more bytes than we should for this function."""
    sections = InstructGen(THUNK_TEST, 0x1000FB50).sections

    # No switch cases here, so the only section is code.
    # This caused an infinite loop during testing so the goal is just to finish.
    assert len(sections) == 1

    # TODO: We might detect the 0xCC padding bytes and cut off the function.
    # If we did that, we would correctly read only 2 instructions.
    # assert len(ig.sections[0].contents) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# LEGO1 0x1006f080, Infocenter::HandleEndAction
|
||||||
|
HANDLE_END_ACTION = (
|
||||||
|
b"\x53\x56\x57\x8b\xf1\x8b\x5c\x24\x10\x8b\x0d\x84\x45\x0f\x10\x8b"
|
||||||
|
b"\x7b\x0c\x8b\x47\x20\x39\x01\x75\x29\x81\x7f\x1c\xf3\x01\x00\x00"
|
||||||
|
b"\x75\x20\xe8\x59\x66\xfa\xff\x6a\x00\x8b\x40\x18\x6a\x00\x6a\x10"
|
||||||
|
b"\x50\xff\x15\x38\xb5\x10\x10\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2"
|
||||||
|
b"\x04\x00\x39\x46\x0c\x0f\x85\xa2\x00\x00\x00\x8b\x47\x1c\x83\xf8"
|
||||||
|
b"\x28\x74\x18\x83\xf8\x29\x74\x13\x83\xf8\x2a\x74\x0e\x83\xf8\x2b"
|
||||||
|
b"\x74\x09\x83\xf8\x2c\x0f\x85\x82\x00\x00\x00\x66\x8b\x86\xd4\x01"
|
||||||
|
b"\x00\x00\x66\x85\xc0\x74\x09\x66\x48\x66\x89\x86\xd4\x01\x00\x00"
|
||||||
|
b"\x66\x83\xbe\xd4\x01\x00\x00\x00\x75\x63\x6a\x0b\xe8\xff\x67\xfa"
|
||||||
|
b"\xff\x66\x8b\x86\xfc\x00\x00\x00\x83\xc4\x04\x50\xe8\x3f\x66\xfa"
|
||||||
|
b"\xff\x8b\xc8\xe8\x58\xa6\xfc\xff\x0f\xbf\x86\xfc\x00\x00\x00\x48"
|
||||||
|
b"\x83\xf8\x04\x77\x2f\xff\x24\x85\x78\xf4\x06\x10\x68\x1d\x02\x00"
|
||||||
|
b"\x00\xeb\x1a\x68\x1e\x02\x00\x00\xeb\x13\x68\x1f\x02\x00\x00\xeb"
|
||||||
|
b"\x0c\x68\x20\x02\x00\x00\xeb\x05\x68\x21\x02\x00\x00\x8b\xce\xe8"
|
||||||
|
b"\x9c\x21\x00\x00\x6a\x01\x8b\xce\xe8\x53\x1c\x00\x00\x8d\x8e\x0c"
|
||||||
|
b"\x01\x00\x00\x53\x8b\x01\xff\x50\x04\x85\xc0\x0f\x85\xef\x02\x00"
|
||||||
|
b"\x00\x8b\x56\x0c\x8b\x4f\x20\x3b\xd1\x74\x0e\x8b\x1d\x74\x45\x0f"
|
||||||
|
b"\x10\x39\x0b\x0f\x85\xd7\x02\x00\x00\x81\x7f\x1c\x02\x02\x00\x00"
|
||||||
|
b"\x75\x1a\x6a\x00\x52\x6a\x10\xe8\xa4\x65\xfa\xff\x8b\xc8\xe8\x0d"
|
||||||
|
b"\xa2\xfb\xff\x66\xc7\x86\xd6\x01\x00\x00\x00\x00\x8b\x96\x00\x01"
|
||||||
|
b"\x00\x00\x8d\x42\x74\x8b\x18\x83\xfb\x0c\x0f\x87\x9b\x02\x00\x00"
|
||||||
|
b"\x33\xc9\x8a\x8b\xac\xf4\x06\x10\xff\x24\x8d\x8c\xf4\x06\x10\x8b"
|
||||||
|
b"\x86\x08\x01\x00\x00\x83\xf8\x05\x77\x07\xff\x24\x85\xbc\xf4\x06"
|
||||||
|
b"\x10\x8b\xce\xe8\xb8\x1a\x00\x00\x8b\x86\x00\x01\x00\x00\x68\xf4"
|
||||||
|
b"\x01\x00\x00\x8b\xce\xc7\x40\x74\x0b\x00\x00\x00\xe8\xef\x20\x00"
|
||||||
|
b"\x00\x8b\x86\x00\x01\x00\x00\xc7\x86\x08\x01\x00\x00\xff\xff\xff"
|
||||||
|
b"\xff\x83\x78\x78\x00\x0f\x85\x40\x02\x00\x00\xb8\x01\x00\x00\x00"
|
||||||
|
b"\x5f\x66\xc7\x86\xd2\x01\x00\x00\x01\x00\x5e\x5b\xc2\x04\x00\x6a"
|
||||||
|
b"\x00\x8b\xce\x6a\x01\xe8\xd6\x19\x00\x00\xb8\x01\x00\x00\x00\x5f"
|
||||||
|
b"\x5e\x5b\xc2\x04\x00\x6a\x01\x8b\xce\x6a\x02\xe8\xc0\x19\x00\x00"
|
||||||
|
b"\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04\x00\x8b\xce\xe8\x3e\x1a"
|
||||||
|
b"\x00\x00\x8b\x86\x00\x01\x00\x00\x68\x1c\x02\x00\x00\x8b\xce\xc7"
|
||||||
|
b"\x40\x74\x0b\x00\x00\x00\xe8\x75\x20\x00\x00\xb8\x01\x00\x00\x00"
|
||||||
|
b"\x5f\xc7\x86\x08\x01\x00\x00\xff\xff\xff\xff\x5e\x5b\xc2\x04\x00"
|
||||||
|
b"\x8b\xce\xe8\x09\x1a\x00\x00\x8b\x86\x00\x01\x00\x00\x68\x1b\x02"
|
||||||
|
b"\x00\x00\x8b\xce\xc7\x40\x74\x0b\x00\x00\x00\xe8\x40\x20\x00\x00"
|
||||||
|
b"\xb8\x01\x00\x00\x00\x5f\xc7\x86\x08\x01\x00\x00\xff\xff\xff\xff"
|
||||||
|
b"\x5e\x5b\xc2\x04\x00\xc7\x00\x0b\x00\x00\x00\x8b\x86\x08\x01\x00"
|
||||||
|
b"\x00\x83\xf8\x04\x74\x0c\x83\xf8\x05\x74\x0e\x68\xf4\x01\x00\x00"
|
||||||
|
b"\xeb\x0c\x68\x1c\x02\x00\x00\xeb\x05\x68\x1b\x02\x00\x00\x8b\xce"
|
||||||
|
b"\xe8\xfb\x1f\x00\x00\xb8\x01\x00\x00\x00\x5f\xc7\x86\x08\x01\x00"
|
||||||
|
b"\x00\xff\xff\xff\xff\x5e\x5b\xc2\x04\x00\x6a\x00\xa1\xa0\x76\x0f"
|
||||||
|
b"\x10\x50\xe8\x39\x65\xfa\xff\x83\xc4\x08\xa1\xa4\x76\x0f\x10\x6a"
|
||||||
|
b"\x00\x50\xe8\x29\x65\xfa\xff\x83\xc4\x08\xe8\xf1\x63\xfa\xff\x8b"
|
||||||
|
b"\xc8\xe8\x6a\x02\x01\x00\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04"
|
||||||
|
b"\x00\x8b\x47\x1c\x83\xf8\x46\x74\x09\x83\xf8\x47\x0f\x85\x09\x01"
|
||||||
|
b"\x00\x00\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8\x91\x65\xfa\xff\x8b"
|
||||||
|
b"\xc8\xe8\xfa\xc7\xfd\xff\x8b\x86\x00\x01\x00\x00\x5f\x5e\x5b\xc7"
|
||||||
|
b"\x40\x74\x0e\x00\x00\x00\xb8\x01\x00\x00\x00\xc2\x04\x00\x8b\x47"
|
||||||
|
b"\x1c\x39\x86\xf8\x00\x00\x00\x0f\x85\xce\x00\x00\x00\xe8\xbe\x63"
|
||||||
|
b"\xfa\xff\x83\x78\x10\x02\x74\x19\x66\x8b\x86\xfc\x00\x00\x00\x66"
|
||||||
|
b"\x85\xc0\x74\x0d\x50\xe8\xa6\x63\xfa\xff\x8b\xc8\xe8\xbf\xa3\xfc"
|
||||||
|
b"\xff\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8\x32\x65\xfa\xff\x8b\xc8"
|
||||||
|
b"\xe8\x9b\xc7\xfd\xff\x8b\x86\x00\x01\x00\x00\x5f\x5e\x5b\xc7\x40"
|
||||||
|
b"\x74\x0e\x00\x00\x00\xb8\x01\x00\x00\x00\xc2\x04\x00\x83\x7a\x78"
|
||||||
|
b"\x00\x75\x32\x8b\x86\xf8\x00\x00\x00\x83\xf8\x28\x74\x27\x83\xf8"
|
||||||
|
b"\x29\x74\x22\x83\xf8\x2a\x74\x1d\x83\xf8\x2b\x74\x18\x83\xf8\x2c"
|
||||||
|
b"\x74\x13\x66\xc7\x86\xd0\x01\x00\x00\x01\x00\x6a\x0b\xe8\xee\x64"
|
||||||
|
b"\xfa\xff\x83\xc4\x04\x8b\x86\x00\x01\x00\x00\x6a\x01\x68\xdc\x44"
|
||||||
|
b"\x0f\x10\xc7\x40\x74\x02\x00\x00\x00\xe8\x22\x64\xfa\xff\x83\xc4"
|
||||||
|
b"\x08\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04\x00\x8b\x47\x1c\x39"
|
||||||
|
b"\x86\xf8\x00\x00\x00\x75\x14\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8"
|
||||||
|
b"\x9c\x64\xfa\xff\x8b\xc8\xe8\x05\xc7\xfd\xff\xb8\x01\x00\x00\x00"
|
||||||
|
b"\x5f\x5e\x5b\xc2\x04\x00\x8b\xff\x3c\xf1\x06\x10\x43\xf1\x06\x10"
|
||||||
|
b"\x4a\xf1\x06\x10\x51\xf1\x06\x10\x58\xf1\x06\x10\xdf\xf1\x06\x10"
|
||||||
|
b"\xd5\xf2\x06\x10\x1a\xf3\x06\x10\x51\xf3\x06\x10\x8e\xf3\x06\x10"
|
||||||
|
b"\xed\xf3\x06\x10\x4c\xf4\x06\x10\x6b\xf4\x06\x10\x00\x01\x02\x07"
|
||||||
|
b"\x03\x04\x07\x07\x07\x07\x07\x05\x06\x8d\x49\x00\x3f\xf2\x06\x10"
|
||||||
|
b"\x55\xf2\x06\x10\xf1\xf1\x06\x10\xf1\xf1\x06\x10\x6b\xf2\x06\x10"
|
||||||
|
b"\xa0\xf2\x06\x10\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_action_case():
    """3 switches: 3 jump tables, 1 data table"""
    sections = InstructGen(HANDLE_END_ACTION, 0x1006F080).sections

    # Two of the jump tables (0x1006f478 with 5, 0x1006f48c with 8)
    # are contiguous.
    assert len(sections) == 5
|
|
@ -81,13 +81,23 @@ def test_jump_displacement():
|
||||||
assert op_str == "-0x2"
|
assert op_str == "-0x2"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="Not implemented yet")
|
|
||||||
def test_jmp_table():
|
def test_jmp_table():
|
||||||
"""Should detect the characteristic jump table instruction
|
"""To ignore cases where it would be inappropriate to replace pointer
|
||||||
(for a switch statement) and use placeholder."""
|
displacement (i.e. the vast majority of them) we require the address
|
||||||
|
to be relocated. This excludes any address less than the imagebase."""
|
||||||
p = ParseAsm()
|
p = ParseAsm()
|
||||||
inst = mock_inst("jmp", "dword ptr [eax*4 + 0x5555]")
|
inst = mock_inst("jmp", "dword ptr [eax*4 + 0x5555]")
|
||||||
(_, op_str) = p.sanitize(inst)
|
(_, op_str) = p.sanitize(inst)
|
||||||
|
# i.e. no change
|
||||||
|
assert op_str == "dword ptr [eax*4 + 0x5555]"
|
||||||
|
|
||||||
|
def relocate_lookup(addr: int) -> bool:
|
||||||
|
return addr == 0x5555
|
||||||
|
|
||||||
|
# Now add the relocation lookup
|
||||||
|
p = ParseAsm(relocate_lookup=relocate_lookup)
|
||||||
|
(_, op_str) = p.sanitize(inst)
|
||||||
|
# Should replace it now
|
||||||
assert op_str == "dword ptr [eax*4 + <OFFSET1>]"
|
assert op_str == "dword ptr [eax*4 + <OFFSET1>]"
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue