From 8446a7ffa1983e053507876a9c61706a1abbe190 Mon Sep 17 00:00:00 2001 From: jonschz <17198703+jonschz@users.noreply.github.com> Date: Sun, 13 Oct 2024 23:31:15 +0200 Subject: [PATCH] Add new script to compare the stack layout (#1112) * Add new script to debug the stack layout * fix small error in script --------- Co-authored-by: jonschz --- tools/README.md | 2 + .../lego_util/pdb_extraction.py | 3 +- tools/isledecomp/isledecomp/compare/core.py | 4 +- tools/isledecomp/isledecomp/compare/diff.py | 8 +- tools/stackcmp/stackcmp.py | 364 ++++++++++++++++++ 5 files changed, 377 insertions(+), 4 deletions(-) create mode 100644 tools/stackcmp/stackcmp.py diff --git a/tools/README.md b/tools/README.md index 98eb1f8d..d2ff3dc9 100644 --- a/tools/README.md +++ b/tools/README.md @@ -175,6 +175,8 @@ The example usages below assume that the current working directory is this repos * Generate an HTML report: `py -m tools.reccmp.reccmp --html output.html legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .` * Create a base file for diffs: `py -m tools.reccmp.reccmp --json base.json --silent legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .` * Diff against a base file: `py -m tools.reccmp.reccmp --diff base.json legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB .` +* [`stackcmp`](/tools/stackcmp): Compares the stack layout for a given function that almost matches. + * e.g. `py -m tools.stackcmp.stackcmp legobin/BETA10.DLL build_debug/LEGO1.DLL build_debug/LEGO1.pdb . 0x1007165d` * [`roadmap`](/tools/roadmap): Compares symbol locations in an original binary with the same symbol locations of a recompiled binary * [`verexp`](/tools/verexp): Verifies exports by comparing the exports of the original DLL and the recompiled DLL * [`vtable`](/tools/vtable): Asserts virtual table correctness by comparing a recompiled binary with the original diff --git a/tools/ghidra_scripts/lego_util/pdb_extraction.py b/tools/ghidra_scripts/lego_util/pdb_extraction.py index 6a0db6f8..3325c159 100644 --- a/tools/ghidra_scripts/lego_util/pdb_extraction.py +++ b/tools/ghidra_scripts/lego_util/pdb_extraction.py @@ -96,7 +96,8 @@ def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: stack_symbols: list[CppStackOrRegisterSymbol] = [] - # for some unexplained reason, the reported stack is offset by 4 when this flag is set + # for some unexplained reason, the reported stack is offset by 4 when this flag is set. + # Note that this affects the arguments (ebp + ...) but not the function stack (ebp - ...) stack_offset_delta = -4 if fn.frame_pointer_present else 0 for symbol in fn.stack_symbols: diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index c44f3987..f2aebdcf 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -15,7 +15,7 @@ from isledecomp.compare.asm import ParseAsm from isledecomp.compare.asm.fixes import assert_fixup, find_effective_match from .db import CompareDb, MatchInfo -from .diff import combined_diff +from .diff import combined_diff, CombinedDiffOutput from .lines import LinesDb @@ -29,7 +29,7 @@ class DiffReport: orig_addr: int recomp_addr: int name: str - udiff: Optional[List[str]] = None + udiff: Optional[CombinedDiffOutput] = None ratio: float = 0.0 is_effective_match: bool = False is_stub: bool = False diff --git a/tools/isledecomp/isledecomp/compare/diff.py b/tools/isledecomp/isledecomp/compare/diff.py index ad453191..a328c997 100644 --- a/tools/isledecomp/isledecomp/compare/diff.py +++ b/tools/isledecomp/isledecomp/compare/diff.py @@ -2,7 +2,13 @@ from typing import Dict, List, Tuple CombinedDiffInput = List[Tuple[str, str]] -CombinedDiffOutput = List[Tuple[str, List[Dict[str, Tuple[str, str]]]]] +# from inner to outer: +# Tuple[str, ...]: either (orig_addr, instruction, recomp_addr) or (addr, instruction) +# List[...]: a contiguous block of instructions, all matching or all mismatching +# Dict[...]: either {"both": List[...]} or {"orig": [...], "recomp": [...]} +# Tuple[str, List[...]]: One contiguous part of the diff (without skipping matching code) +# List[...]: The list of all the contiguous diffs of a given function +CombinedDiffOutput = List[Tuple[str, List[Dict[str, List[Tuple[str, ...]]]]]] def combined_diff( diff --git a/tools/stackcmp/stackcmp.py b/tools/stackcmp/stackcmp.py new file mode 100644 index 00000000..9aa02f2a --- /dev/null +++ b/tools/stackcmp/stackcmp.py @@ -0,0 +1,364 @@ +from dataclasses import dataclass +import re +import logging +import os +import argparse +import struct +from typing import Dict, List, NamedTuple, Optional, Set, Tuple + +from isledecomp import Bin +from isledecomp.compare import Compare as IsleCompare +from isledecomp.compare.diff import CombinedDiffOutput +from isledecomp.cvdump.symbols import SymbolsEntry +import colorama + +# pylint: disable=duplicate-code # misdetects a code duplication with reccmp + +colorama.just_fix_windows_console() + +CHECK_ICON = f"{colorama.Fore.GREEN}✓{colorama.Style.RESET_ALL}" +SWAP_ICON = f"{colorama.Fore.YELLOW}⇄{colorama.Style.RESET_ALL}" +ERROR_ICON = f"{colorama.Fore.RED}✗{colorama.Style.RESET_ALL}" +UNCLEAR_ICON = f"{colorama.Fore.BLUE}?{colorama.Style.RESET_ALL}" + + +STACK_ENTRY_REGEX = re.compile( + r"(?Pe[sb]p)\s(?P[+-])\s(?P(0x)?[0-9a-f]+)(?![0-9a-f])" +) + + +@dataclass +class StackSymbol: + name: str + data_type: str + + +@dataclass +class StackRegisterOffset: + register: str + offset: int + symbol: Optional[StackSymbol] = None + + def __str__(self) -> str: + first_part = ( + f"{self.register} + {self.offset:#04x}" + if self.offset > 0 + else f"{self.register} - {-self.offset:#04x}" + ) + second_part = f" {self.symbol.name}" if self.symbol else "" + return first_part + second_part + + def __hash__(self) -> int: + return hash(self.register) + self.offset + + def copy(self) -> "StackRegisterOffset": + return StackRegisterOffset(self.register, self.offset, self.symbol) + + def __eq__(self, other: "StackRegisterOffset"): + return self.register == other.register and self.offset == other.offset + + +class StackPair(NamedTuple): + orig: StackRegisterOffset + recomp: StackRegisterOffset + + +StackPairs = Set[StackPair] + + +@dataclass +class Warnings: + structural_mismatches_present: bool = False + error_map_not_bijective: bool = False + + +def extract_stack_offset_from_instruction( + instruction: str, +) -> StackRegisterOffset | None: + match = STACK_ENTRY_REGEX.search(instruction) + if not match: + return None + offset = int(match.group("sign") + match.group("offset"), 16) + return StackRegisterOffset(match.group("register"), offset) + + +def analyze_diff( + diff: Dict[str, List[Tuple[str, ...]]], warnings: Warnings +) -> StackPairs: + stack_pairs: StackPairs = set() + if "both" in diff: + # get the matching stack entries + for line in diff["both"]: + # 0 = orig addr, 1 = instruction, 2 = reccmp addr + instruction = line[1] + + if match := extract_stack_offset_from_instruction(instruction): + logging.debug("stack match: %s", match) + # need a copy for recomp because we might add a debug symbol to it + stack_pairs.add(StackPair(match, match.copy())) + elif any(x in instruction for x in ["ebp", "esp"]): + logging.debug("not a stack offset: %s", instruction) + + else: + orig = diff["orig"] + recomp = diff["recomp"] + if len(orig) != len(recomp): + if orig: + mismatch_location = f"orig={orig[0][0]}" + else: + mismatch_location = f"recomp={recomp[0][0]}" + logging.error( + "Structural mismatch at %s:\n%s", + mismatch_location, + print_structural_mismatch(orig, recomp), + ) + warnings.structural_mismatches_present = True + return set() + + for orig_line, recomp_line in zip(orig, recomp): + if orig_match := extract_stack_offset_from_instruction(orig_line[1]): + recomp_match = extract_stack_offset_from_instruction(recomp_line[1]) + + if not recomp_match: + logging.error( + "Mismatching line structure at orig=%s:\n%s", + orig_line[0], + print_structural_mismatch(orig, recomp), + ) + # not recoverable, whole block has a structural mismatch + warnings.structural_mismatches_present = True + return set() + + stack_pair = StackPair(orig_match, recomp_match) + + logging.debug( + "stack match, wrong order: %s vs %s", stack_pair[0], stack_pair[1] + ) + stack_pairs.add(stack_pair) + + elif any(x in orig_line[1] for x in ["ebp", "esp"]): + logging.debug("not a stack offset: %s", orig_line[1]) + + return stack_pairs + + +def print_bijective_match(left: str, right: str, exact: bool): + icon = CHECK_ICON if exact else SWAP_ICON + print(f"{icon}{colorama.Style.RESET_ALL} {left}: {right}") + + +def print_non_bijective_match(left: str, right: str): + print(f"{ERROR_ICON} {left}: {right}") + + +def print_structural_mismatch( + orig: List[Tuple[str, ...]], recomp: List[Tuple[str, ...]] +) -> str: + orig_str = "\n".join(f"-{x[1]}" for x in orig) if orig else "-" + recomp_str = "\n".join(f"+{x[1]}" for x in recomp) if recomp else "+" + return f"{colorama.Fore.RED}{orig_str}\n{colorama.Fore.GREEN}{recomp_str}\n{colorama.Style.RESET_ALL}" + + +def format_list_of_offsets(offsets: List[StackRegisterOffset]) -> str: + return str([str(x) for x in offsets]) + + +def compare_function_stacks(udiff: CombinedDiffOutput, fn_symbol: SymbolsEntry): + warnings = Warnings() + + # consists of pairs (orig, recomp) + # don't use a dict because we can have m:n relations + stack_pairs: StackPairs = set() + + for block in udiff: + # block[0] is e.g. "@@ -0x10071662,60 +0x10031368,60 @@" + for diff in block[1]: + stack_pairs = stack_pairs.union(analyze_diff(diff, warnings)) + + # Note that the 'Frame Ptr Present' property is not relevant to the stack below `ebp`, + # but only to entries above (i.e. the function arguments on the stack). + # See also pdb_extraction.py. + + stack_symbols: Dict[int, StackSymbol] = {} + + for symbol in fn_symbol.stack_symbols: + if symbol.symbol_type == "S_BPREL32": + # convert hex to signed 32 bit integer + hex_bytes = bytes.fromhex(symbol.location[1:-1]) + stack_offset = struct.unpack(">l", hex_bytes)[0] + + stack_symbols[stack_offset] = StackSymbol( + symbol.name, + symbol.data_type, + ) + + for _, recomp in stack_pairs: + if recomp.register == "ebp": + recomp.symbol = stack_symbols.get(recomp.offset) + elif recomp.register == "esp": + logging.debug( + "Matching esp offsets to debug symbols is not implemented right now" + ) + + print("\nOrdered by original stack (left=orig, right=recomp):") + + all_orig_offsets = set(x.orig.offset for x in stack_pairs) + + for orig_offset in sorted(all_orig_offsets): + orig = next(x.orig for x in stack_pairs if x.orig.offset == orig_offset) + recomps = [x.recomp for x in stack_pairs if x.orig == orig] + + if len(recomps) == 1: + recomp = recomps[0] + print_bijective_match(str(orig), str(recomp), exact=orig == recomp) + else: + print_non_bijective_match(str(orig), format_list_of_offsets(recomps)) + warnings.error_map_not_bijective = True + + # Show offsets from the debug symbols that we have not encountered in the diff + all_recomp_offsets = set(x.recomp.offset for x in stack_pairs).union( + stack_symbols.keys() + ) + + print("\nOrdered by recomp stack (left=orig, right=recomp):") + for recomp_offset in sorted(all_recomp_offsets): + recomp = next( + (x.recomp for x in stack_pairs if x.recomp.offset == recomp_offset), None + ) + + if recomp is None: + # The offset only appears in the debug symbols. + # The legend below explains why this can happen. + stack_offset = StackRegisterOffset( + "ebp", recomp_offset, stack_symbols[recomp_offset] + ) + print(f"{UNCLEAR_ICON} not seen: {stack_offset}") + continue + + origs = [x.orig for x in stack_pairs if x.recomp == recomp] + + if len(origs) == 1: + # 1:1 clean match + print_bijective_match(str(origs[0]), str(recomp), origs[0] == recomp) + else: + print_non_bijective_match(format_list_of_offsets(origs), str(recomp)) + warnings.error_map_not_bijective = True + + print( + "\nLegend:\n" + + f"{SWAP_ICON} : This stack variable matches 1:1, but the order of variables is not correct.\n" + + f"{ERROR_ICON} : This stack variable matches multiple variables in the other binary.\n" + + f"{UNCLEAR_ICON} : This stack variable did not appear in the diff. It either matches or only appears in structural mismatches.\n" + ) + + if warnings.error_map_not_bijective: + print( + "ERROR: The stack variables of original and recomp are not in a 1:1 correspondence, " + + "suggesting that the logic in the recomp is incorrect." + ) + elif warnings.structural_mismatches_present: + print( + "WARNING: Original and recomp have at least one structural discrepancy, " + + "so the comparison of stack variables might be incomplete. " + + "The structural mismatches above need to be checked manually." + ) + + +def parse_args() -> argparse.Namespace: + def virtual_address(value) -> int: + """Helper method for argparse, verbose parameter""" + return int(value, 16) + + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.", + ) + parser.add_argument( + "original", metavar="original-binary", help="The original binary" + ) + parser.add_argument( + "recompiled", metavar="recompiled-binary", help="The recompiled binary" + ) + parser.add_argument( + "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary" + ) + parser.add_argument( + "decomp_dir", metavar="decomp-dir", help="The decompiled source tree" + ) + + parser.add_argument( + "address", + metavar="", + type=virtual_address, + help="The original file's offset of the function to be analyzed", + ) + + parser.set_defaults(loglevel=logging.INFO) + parser.add_argument( + "--debug", + action="store_const", + const=logging.DEBUG, + dest="loglevel", + help="Print script debug information", + ) + + args = parser.parse_args() + + if not os.path.isfile(args.original): + parser.error(f"Original binary {args.original} does not exist") + + if not os.path.isfile(args.recompiled): + parser.error(f"Recompiled binary {args.recompiled} does not exist") + + if not os.path.isfile(args.pdb): + parser.error(f"Symbols PDB {args.pdb} does not exist") + + if not os.path.isdir(args.decomp_dir): + parser.error(f"Source directory {args.decomp_dir} does not exist") + + return args + + +def main(): + args = parse_args() + logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s") + + with Bin(args.original, find_str=True) as origfile, Bin( + args.recompiled + ) as recompfile: + if args.loglevel != logging.DEBUG: + # Mute logger events from compare engine + logging.getLogger("isledecomp.compare.core").setLevel(logging.CRITICAL) + logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL) + logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL) + + isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir) + + if args.loglevel == logging.DEBUG: + isle_compare.debug = True + + print() + + match = isle_compare.compare_address(args.address) + if match is None: + print(f"Failed to find a match at address 0x{args.address:x}") + return + + assert match.udiff is not None + + function_data = next( + ( + y + for y in isle_compare.cvdump_analysis.nodes + if y.addr == match.recomp_addr + ), + None, + ) + assert function_data is not None + assert function_data.symbol_entry is not None + + compare_function_stacks(match.udiff, function_data.symbol_entry) + + +if __name__ == "__main__": + raise SystemExit(main())