Add new script to compare the stack layout (#1112)

* Add new script to debug the stack layout

* fix small error in script

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
This commit is contained in:
jonschz 2024-10-13 23:31:15 +02:00 committed by GitHub
parent 974cd7ce7c
commit 8446a7ffa1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 377 additions and 4 deletions

View file

@ -175,6 +175,8 @@ The example usages below assume that the current working directory is this repos
* Generate an HTML report: `py -m tools.reccmp.reccmp --html output.html legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
* Create a base file for diffs: `py -m tools.reccmp.reccmp --json base.json --silent legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
* Diff against a base file: `py -m tools.reccmp.reccmp --diff base.json legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .`
* [`stackcmp`](/tools/stackcmp): Compares the stack layout for a given function that almost matches.
* e.g. `py -m tools.stackcmp.stackcmp legobin/BETA10.DLL build_debug/LEGO1.DLL build_debug/LEGO1.pdb . 0x1007165d`
* [`roadmap`](/tools/roadmap): Compares symbol locations in an original binary with the same symbol locations of a recompiled binary
* [`verexp`](/tools/verexp): Verifies exports by comparing the exports of the original DLL and the recompiled DLL
* [`vtable`](/tools/vtable): Asserts virtual table correctness by comparing a recompiled binary with the original

View file

@ -96,7 +96,8 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]:
stack_symbols: list[CppStackOrRegisterSymbol] = []
# for some unexplained reason, the reported stack is offset by 4 when this flag is set
# for some unexplained reason, the reported stack is offset by 4 when this flag is set.
# Note that this affects the arguments (ebp + ...) but not the function stack (ebp - ...)
stack_offset_delta = -4 if fn.frame_pointer_present else 0
for symbol in fn.stack_symbols:

View file

@ -15,7 +15,7 @@
from isledecomp.compare.asm import ParseAsm
from isledecomp.compare.asm.fixes import assert_fixup, find_effective_match
from .db import CompareDb, MatchInfo
from .diff import combined_diff
from .diff import combined_diff, CombinedDiffOutput
from .lines import LinesDb
@ -29,7 +29,7 @@ class DiffReport:
orig_addr: int
recomp_addr: int
name: str
udiff: Optional[List[str]] = None
udiff: Optional[CombinedDiffOutput] = None
ratio: float = 0.0
is_effective_match: bool = False
is_stub: bool = False

View file

@ -2,7 +2,13 @@
from typing import Dict, List, Tuple
CombinedDiffInput = List[Tuple[str, str]]
CombinedDiffOutput = List[Tuple[str, List[Dict[str, Tuple[str, str]]]]]
# from inner to outer:
# Tuple[str, ...]: either (orig_addr, instruction, recomp_addr) or (addr, instruction)
# List[...]: a contiguous block of instructions, all matching or all mismatching
# Dict[...]: either {"both": List[...]} or {"orig": [...], "recomp": [...]}
# Tuple[str, List[...]]: One contiguous part of the diff (without skipping matching code)
# List[...]: The list of all the contiguous diffs of a given function
CombinedDiffOutput = List[Tuple[str, List[Dict[str, List[Tuple[str, ...]]]]]]
def combined_diff(

364
tools/stackcmp/stackcmp.py Normal file
View file

@ -0,0 +1,364 @@
from dataclasses import dataclass
import re
import logging
import os
import argparse
import struct
from typing import Dict, List, NamedTuple, Optional, Set, Tuple
from isledecomp import Bin
from isledecomp.compare import Compare as IsleCompare
from isledecomp.compare.diff import CombinedDiffOutput
from isledecomp.cvdump.symbols import SymbolsEntry
import colorama
# pylint: disable=duplicate-code # misdetects a code duplication with reccmp
colorama.just_fix_windows_console()
CHECK_ICON = f"{colorama.Fore.GREEN}{colorama.Style.RESET_ALL}"
SWAP_ICON = f"{colorama.Fore.YELLOW}{colorama.Style.RESET_ALL}"
ERROR_ICON = f"{colorama.Fore.RED}{colorama.Style.RESET_ALL}"
UNCLEAR_ICON = f"{colorama.Fore.BLUE}?{colorama.Style.RESET_ALL}"
STACK_ENTRY_REGEX = re.compile(
r"(?P<register>e[sb]p)\s(?P<sign>[+-])\s(?P<offset>(0x)?[0-9a-f]+)(?![0-9a-f])"
)
@dataclass
class StackSymbol:
name: str
data_type: str
@dataclass
class StackRegisterOffset:
register: str
offset: int
symbol: Optional[StackSymbol] = None
def __str__(self) -> str:
first_part = (
f"{self.register} + {self.offset:#04x}"
if self.offset > 0
else f"{self.register} - {-self.offset:#04x}"
)
second_part = f" {self.symbol.name}" if self.symbol else ""
return first_part + second_part
def __hash__(self) -> int:
return hash(self.register) + self.offset
def copy(self) -> "StackRegisterOffset":
return StackRegisterOffset(self.register, self.offset, self.symbol)
def __eq__(self, other: "StackRegisterOffset"):
return self.register == other.register and self.offset == other.offset
class StackPair(NamedTuple):
orig: StackRegisterOffset
recomp: StackRegisterOffset
StackPairs = Set[StackPair]
@dataclass
class Warnings:
structural_mismatches_present: bool = False
error_map_not_bijective: bool = False
def extract_stack_offset_from_instruction(
instruction: str,
) -> StackRegisterOffset | None:
match = STACK_ENTRY_REGEX.search(instruction)
if not match:
return None
offset = int(match.group("sign") + match.group("offset"), 16)
return StackRegisterOffset(match.group("register"), offset)
def analyze_diff(
diff: Dict[str, List[Tuple[str, ...]]], warnings: Warnings
) -> StackPairs:
stack_pairs: StackPairs = set()
if "both" in diff:
# get the matching stack entries
for line in diff["both"]:
# 0 = orig addr, 1 = instruction, 2 = reccmp addr
instruction = line[1]
if match := extract_stack_offset_from_instruction(instruction):
logging.debug("stack match: %s", match)
# need a copy for recomp because we might add a debug symbol to it
stack_pairs.add(StackPair(match, match.copy()))
elif any(x in instruction for x in ["ebp", "esp"]):
logging.debug("not a stack offset: %s", instruction)
else:
orig = diff["orig"]
recomp = diff["recomp"]
if len(orig) != len(recomp):
if orig:
mismatch_location = f"orig={orig[0][0]}"
else:
mismatch_location = f"recomp={recomp[0][0]}"
logging.error(
"Structural mismatch at %s:\n%s",
mismatch_location,
print_structural_mismatch(orig, recomp),
)
warnings.structural_mismatches_present = True
return set()
for orig_line, recomp_line in zip(orig, recomp):
if orig_match := extract_stack_offset_from_instruction(orig_line[1]):
recomp_match = extract_stack_offset_from_instruction(recomp_line[1])
if not recomp_match:
logging.error(
"Mismatching line structure at orig=%s:\n%s",
orig_line[0],
print_structural_mismatch(orig, recomp),
)
# not recoverable, whole block has a structural mismatch
warnings.structural_mismatches_present = True
return set()
stack_pair = StackPair(orig_match, recomp_match)
logging.debug(
"stack match, wrong order: %s vs %s", stack_pair[0], stack_pair[1]
)
stack_pairs.add(stack_pair)
elif any(x in orig_line[1] for x in ["ebp", "esp"]):
logging.debug("not a stack offset: %s", orig_line[1])
return stack_pairs
def print_bijective_match(left: str, right: str, exact: bool):
icon = CHECK_ICON if exact else SWAP_ICON
print(f"{icon}{colorama.Style.RESET_ALL} {left}: {right}")
def print_non_bijective_match(left: str, right: str):
print(f"{ERROR_ICON} {left}: {right}")
def print_structural_mismatch(
orig: List[Tuple[str, ...]], recomp: List[Tuple[str, ...]]
) -> str:
orig_str = "\n".join(f"-{x[1]}" for x in orig) if orig else "-"
recomp_str = "\n".join(f"+{x[1]}" for x in recomp) if recomp else "+"
return f"{colorama.Fore.RED}{orig_str}\n{colorama.Fore.GREEN}{recomp_str}\n{colorama.Style.RESET_ALL}"
def format_list_of_offsets(offsets: List[StackRegisterOffset]) -> str:
return str([str(x) for x in offsets])
def compare_function_stacks(udiff: CombinedDiffOutput, fn_symbol: SymbolsEntry):
warnings = Warnings()
# consists of pairs (orig, recomp)
# don't use a dict because we can have m:n relations
stack_pairs: StackPairs = set()
for block in udiff:
# block[0] is e.g. "@@ -0x10071662,60 +0x10031368,60 @@"
for diff in block[1]:
stack_pairs = stack_pairs.union(analyze_diff(diff, warnings))
# Note that the 'Frame Ptr Present' property is not relevant to the stack below `ebp`,
# but only to entries above (i.e. the function arguments on the stack).
# See also pdb_extraction.py.
stack_symbols: Dict[int, StackSymbol] = {}
for symbol in fn_symbol.stack_symbols:
if symbol.symbol_type == "S_BPREL32":
# convert hex to signed 32 bit integer
hex_bytes = bytes.fromhex(symbol.location[1:-1])
stack_offset = struct.unpack(">l", hex_bytes)[0]
stack_symbols[stack_offset] = StackSymbol(
symbol.name,
symbol.data_type,
)
for _, recomp in stack_pairs:
if recomp.register == "ebp":
recomp.symbol = stack_symbols.get(recomp.offset)
elif recomp.register == "esp":
logging.debug(
"Matching esp offsets to debug symbols is not implemented right now"
)
print("\nOrdered by original stack (left=orig, right=recomp):")
all_orig_offsets = set(x.orig.offset for x in stack_pairs)
for orig_offset in sorted(all_orig_offsets):
orig = next(x.orig for x in stack_pairs if x.orig.offset == orig_offset)
recomps = [x.recomp for x in stack_pairs if x.orig == orig]
if len(recomps) == 1:
recomp = recomps[0]
print_bijective_match(str(orig), str(recomp), exact=orig == recomp)
else:
print_non_bijective_match(str(orig), format_list_of_offsets(recomps))
warnings.error_map_not_bijective = True
# Show offsets from the debug symbols that we have not encountered in the diff
all_recomp_offsets = set(x.recomp.offset for x in stack_pairs).union(
stack_symbols.keys()
)
print("\nOrdered by recomp stack (left=orig, right=recomp):")
for recomp_offset in sorted(all_recomp_offsets):
recomp = next(
(x.recomp for x in stack_pairs if x.recomp.offset == recomp_offset), None
)
if recomp is None:
# The offset only appears in the debug symbols.
# The legend below explains why this can happen.
stack_offset = StackRegisterOffset(
"ebp", recomp_offset, stack_symbols[recomp_offset]
)
print(f"{UNCLEAR_ICON} not seen: {stack_offset}")
continue
origs = [x.orig for x in stack_pairs if x.recomp == recomp]
if len(origs) == 1:
# 1:1 clean match
print_bijective_match(str(origs[0]), str(recomp), origs[0] == recomp)
else:
print_non_bijective_match(format_list_of_offsets(origs), str(recomp))
warnings.error_map_not_bijective = True
print(
"\nLegend:\n"
+ f"{SWAP_ICON} : This stack variable matches 1:1, but the order of variables is not correct.\n"
+ f"{ERROR_ICON} : This stack variable matches multiple variables in the other binary.\n"
+ f"{UNCLEAR_ICON} : This stack variable did not appear in the diff. It either matches or only appears in structural mismatches.\n"
)
if warnings.error_map_not_bijective:
print(
"ERROR: The stack variables of original and recomp are not in a 1:1 correspondence, "
+ "suggesting that the logic in the recomp is incorrect."
)
elif warnings.structural_mismatches_present:
print(
"WARNING: Original and recomp have at least one structural discrepancy, "
+ "so the comparison of stack variables might be incomplete. "
+ "The structural mismatches above need to be checked manually."
)
def parse_args() -> argparse.Namespace:
def virtual_address(value) -> int:
"""Helper method for argparse, verbose parameter"""
return int(value, 16)
parser = argparse.ArgumentParser(
allow_abbrev=False,
description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.",
)
parser.add_argument(
"original", metavar="original-binary", help="The original binary"
)
parser.add_argument(
"recompiled", metavar="recompiled-binary", help="The recompiled binary"
)
parser.add_argument(
"pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
)
parser.add_argument(
"decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
)
parser.add_argument(
"address",
metavar="<offset>",
type=virtual_address,
help="The original file's offset of the function to be analyzed",
)
parser.set_defaults(loglevel=logging.INFO)
parser.add_argument(
"--debug",
action="store_const",
const=logging.DEBUG,
dest="loglevel",
help="Print script debug information",
)
args = parser.parse_args()
if not os.path.isfile(args.original):
parser.error(f"Original binary {args.original} does not exist")
if not os.path.isfile(args.recompiled):
parser.error(f"Recompiled binary {args.recompiled} does not exist")
if not os.path.isfile(args.pdb):
parser.error(f"Symbols PDB {args.pdb} does not exist")
if not os.path.isdir(args.decomp_dir):
parser.error(f"Source directory {args.decomp_dir} does not exist")
return args
def main():
args = parse_args()
logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s")
with Bin(args.original, find_str=True) as origfile, Bin(
args.recompiled
) as recompfile:
if args.loglevel != logging.DEBUG:
# Mute logger events from compare engine
logging.getLogger("isledecomp.compare.core").setLevel(logging.CRITICAL)
logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL)
logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL)
isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)
if args.loglevel == logging.DEBUG:
isle_compare.debug = True
print()
match = isle_compare.compare_address(args.address)
if match is None:
print(f"Failed to find a match at address 0x{args.address:x}")
return
assert match.udiff is not None
function_data = next(
(
y
for y in isle_compare.cvdump_analysis.nodes
if y.addr == match.recomp_addr
),
None,
)
assert function_data is not None
assert function_data.symbol_entry is not None
compare_function_stacks(match.udiff, function_data.symbol_entry)
if __name__ == "__main__":
raise SystemExit(main())