From a65eb9a4e056e99082a869b34c790997af65b86d Mon Sep 17 00:00:00 2001
From: MS
Date: Mon, 22 Jan 2024 10:15:12 -0500
Subject: [PATCH] Roadmap tool to compare binary structure (#479)

---
 CMakeLists.txt                               |   2 +-
 tools/isledecomp/isledecomp/bin.py           |  10 +-
 tools/isledecomp/isledecomp/compare/core.py  |   3 +
 tools/isledecomp/isledecomp/compare/db.py    |  11 +
 tools/isledecomp/isledecomp/cvdump/parser.py |  26 +-
 tools/isledecomp/isledecomp/cvdump/runner.py |   6 +
 tools/roadmap/roadmap.py                     | 265 +++++++++++++++++++
 7 files changed, 320 insertions(+), 3 deletions(-)
 create mode 100644 tools/roadmap/roadmap.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index de976bde..b7c93768 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -299,8 +299,8 @@ set_property(TARGET lego1 PROPERTY SUFFIX ".DLL")
 if (ISLE_BUILD_APP)
   add_executable(isle WIN32
     ISLE/res/isle.rc
-    ISLE/isleapp.cpp
     ISLE/define.cpp
+    ISLE/isleapp.cpp
   )
 
   target_compile_definitions(isle PRIVATE ISLE_APP)
diff --git a/tools/isledecomp/isledecomp/bin.py b/tools/isledecomp/isledecomp/bin.py
index 3b600af6..dc04bb53 100644
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@@ -1,6 +1,6 @@
 import logging
 import struct
-from typing import List, Optional
+from typing import List, Optional, Tuple
 from dataclasses import dataclass
 from collections import namedtuple
 
@@ -365,6 +365,14 @@ def get_abs_addr(self, section: int, offset: int) -> int:
         into an absolute vaddr."""
         return self.get_section_offset_by_index(section) + offset
 
+    def get_relative_addr(self, addr: int) -> Tuple[int, int]:
+        """Convert an absolute address back into a (section, offset) pair."""
+        for i, section in enumerate(self.sections):
+            if section.contains_vaddr(addr):
+                return (i + 1, addr - section.virtual_address)
+
+        return (0, 0)
+
     def get_raw_addr(self, vaddr: int) -> int:
         """Returns the raw offset in the PE binary for the given virtual address."""
         self._set_section_for_vaddr(vaddr)
diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py
index 84f9131f..4e1a1f70 100644
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@@ -409,6 +409,9 @@ def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
 
     ## Public API
 
+    def get_all(self) -> List[MatchInfo]:
+        return self._db.get_all()
+
     def get_functions(self) -> List[MatchInfo]:
         return self._db.get_matches_by_type(SymbolType.FUNCTION)
 
diff --git a/tools/isledecomp/isledecomp/compare/db.py b/tools/isledecomp/isledecomp/compare/db.py
index 43d6811b..96ab3e10 100644
--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@@ -82,6 +82,17 @@ def get_unmatched_strings(self) -> List[str]:
 
         return [string for (string,) in cur.fetchall()]
 
+    def get_all(self) -> List[MatchInfo]:
+        cur = self._db.execute(
+            """SELECT compare_type, orig_addr, recomp_addr, name, size
+            FROM `symbols`
+            ORDER BY orig_addr NULLS LAST
+            """,
+        )
+        cur.row_factory = matchinfo_factory
+
+        return cur.fetchall()
+
     def get_matches(self) -> Optional[MatchInfo]:
         cur = self._db.execute(
             """SELECT compare_type, orig_addr, recomp_addr, name, size
diff --git a/tools/isledecomp/isledecomp/cvdump/parser.py b/tools/isledecomp/isledecomp/cvdump/parser.py
index d14abe88..8d1c71bb 100644
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@@ -39,6 +39,9 @@
     r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
 )
 
+# e.g. 0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"
+# e.g. 0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"
+_module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")
 
 # User functions only
 LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
@@ -52,13 +55,16 @@
 SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
 
 # (Estimated) size of any symbol
-SizeRefEntry = namedtuple("SizeRefEntry", "section offset size")
+SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")
 
 # global variables
 GdataEntry = namedtuple("GdataEntry", "section offset type name")
 
+ModuleEntry = namedtuple("ModuleEntry", "id lib obj")
+
 
 class CvdumpParser:
+    # pylint: disable=too-many-instance-attributes
     def __init__(self) -> None:
         self._section: str = ""
         self._lines_function: Tuple[str, int] = ("", 0)
@@ -68,6 +74,7 @@ def __init__(self) -> None:
         self.symbols = []
         self.sizerefs = []
         self.globals = []
+        self.modules = []
 
     def _lines_section(self, line: str):
         """Parsing entries from the LINES section. We only care about the pairs of
@@ -144,12 +151,26 @@ def _section_contributions(self, line: str):
         if (match := _section_contrib_regex.match(line)) is not None:
             self.sizerefs.append(
                 SizeRefEntry(
+                    module=int(match.group("module"), 16),
                     section=int(match.group("section"), 16),
                     offset=int(match.group("offset"), 16),
                     size=int(match.group("size"), 16),
                 )
             )
 
+    def _modules_section(self, line: str):
+        """Record the object file (and lib file, if used) linked into the binary.
+        The auto-incrementing id is cross-referenced in SECTION CONTRIBUTIONS
+        (and perhaps other locations)"""
+        if (match := _module_regex.match(line)) is not None:
+            self.modules.append(
+                ModuleEntry(
+                    id=int(match.group("id"), 16),
+                    lib=match.group("lib"),
+                    obj=match.group("obj"),
+                )
+            )
+
     def read_line(self, line: str):
         # Blank lines are there to help the reader; they have no context significance
         if line.strip() == "":
@@ -174,6 +195,9 @@ def read_line(self, line: str):
         elif self._section == "GLOBALS":
             self._globals_section(line)
 
+        elif self._section == "MODULES":
+            self._modules_section(line)
+
     def read_lines(self, lines: Iterable[str]):
         for line in lines:
             self.read_line(line)
diff --git a/tools/isledecomp/isledecomp/cvdump/runner.py b/tools/isledecomp/isledecomp/cvdump/runner.py
index f1c163b5..6b2c2ff4 100644
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@@ -13,6 +13,7 @@ class DumpOpt(Enum):
     GLOBALS = 2
     PUBLICS = 3
     SECTION_CONTRIB = 4
+    MODULES = 5
 
 
 cvdump_opt_map = {
@@ -21,6 +22,7 @@ class DumpOpt(Enum):
     DumpOpt.GLOBALS: "-g",
     DumpOpt.PUBLICS: "-p",
     DumpOpt.SECTION_CONTRIB: "-seccontrib",
+    DumpOpt.MODULES: "-m",
 }
 
 
@@ -49,6 +51,10 @@ def section_contributions(self):
         self._options.add(DumpOpt.SECTION_CONTRIB)
         return self
 
+    def modules(self):
+        self._options.add(DumpOpt.MODULES)
+        return self
+
     def cmd_line(self) -> List[str]:
         cvdump_exe = lib_path_join("cvdump.exe")
         flags = [cvdump_opt_map[opt] for opt in self._options]
diff --git a/tools/roadmap/roadmap.py b/tools/roadmap/roadmap.py
new file mode 100644
index 00000000..e8023c1d
--- /dev/null
+++ b/tools/roadmap/roadmap.py
@@ -0,0 +1,265 @@
+"""For all addresses matched by code annotations or recomp pdb,
+report how "far off" the recomp symbol is from its proper place
+in the original binary."""
+
+import os
+import argparse
+import logging
+from typing import List, Optional
+from collections import namedtuple
+from isledecomp import Bin as IsleBin
+from isledecomp.cvdump import Cvdump
+from isledecomp.compare import Compare as IsleCompare
+from isledecomp.types import SymbolType
+
+# Ignore all compare-db messages.
+logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
+
+
+def or_blank(value) -> str:
+    """Helper for dealing with potential None values in text output."""
+    return "" if value is None else str(value)
+
+
+class ModuleMap:
+    """Load a subset of sections from the pdb to allow you to look up the
+    module number based on the recomp address."""
+
+    def __init__(self, pdb, binfile) -> None:
+        cvdump = Cvdump(pdb).section_contributions().modules().run()
+        self.module_lookup = {m.id: (m.lib, m.obj) for m in cvdump.modules}
+        self.section_contrib = [
+            (
+                binfile.get_abs_addr(sizeref.section, sizeref.offset),
+                sizeref.size,
+                sizeref.module,
+            )
+            for sizeref in cvdump.sizerefs
+            if binfile.is_valid_section(sizeref.section)
+        ]
+
+    def get_module(self, addr: int) -> Optional[str]:
+        for start, size, module_id in self.section_contrib:
+            if start <= addr < start + size:
+                if (module := self.module_lookup.get(module_id)) is not None:
+                    return module
+
+        return None
+
+
+def print_sections(sections):
+    print("    name |    start |   v.size | raw size")
+    print("---------|----------|----------|----------")
+    for sect in sections:
+        name = sect.name.decode("ascii").rstrip("\x00")
+        print(
+            f"{name:>8} | {sect.virtual_address:8x} | {sect.virtual_size:8x} | {sect.size_of_raw_data:8x}"
+        )
+    print()
+
+
+def match_type_abbreviation(mtype: Optional[SymbolType]) -> str:
+    """Return abbreviation of the given SymbolType name"""
+    if mtype is None:
+        return ""
+
+    return mtype.name.lower()[:3]
+
+
+RoadmapRow = namedtuple(
+    "RoadmapRow",
+    [
+        "orig_sect_ofs",
+        "recomp_sect_ofs",
+        "orig_addr",
+        "recomp_addr",
+        "displacement",
+        "sym_type",
+        "size",
+        "name",
+        "module",
+    ],
+)
+
+
+def print_text_report(results: List[RoadmapRow]):
+    """Print the result with original and recomp addresses."""
+    for row in results:
+        print(
+            " ".join(
+                [
+                    f"{or_blank(row.orig_sect_ofs):14}",
+                    f"{or_blank(row.recomp_sect_ofs):14}",
+                    f"{or_blank(row.displacement):>8}",
+                    f"{row.sym_type:3}",
+                    f"{or_blank(row.size):6}",
+                    or_blank(row.name),
+                ]
+            )
+        )
+
+
+def print_diff_report(results: List[RoadmapRow]):
+    """Print only entries where we have the recomp address.
+    This is intended for generating a file to diff against.
+    The recomp addresses are always changing so we hide those."""
+    for row in results:
+        if row.orig_addr is None or row.recomp_addr is None:
+            continue
+
+        print(
+            " ".join(
+                [
+                    f"{or_blank(row.orig_sect_ofs):14}",
+                    f"{or_blank(row.displacement):>8}",
+                    f"{row.sym_type:3}",
+                    f"{or_blank(row.size):6}",
+                    or_blank(row.name),
+                ]
+            )
+        )
+
+
+def export_to_csv(csv_file: str, results: List[RoadmapRow]):
+    with open(csv_file, "w+", encoding="utf-8") as f:
+        f.write(
+            "orig_sect_ofs,recomp_sect_ofs,orig_addr,recomp_addr,displacement,row_type,size,name,module\n"
+        )
+        for row in results:
+            f.write(",".join(map(or_blank, row)))
+            f.write("\n")
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Show all addresses from original and recomp."
+    )
+    parser.add_argument(
+        "original", metavar="original-binary", help="The original binary"
+    )
+    parser.add_argument(
+        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
+    )
+    parser.add_argument(
+        "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
+    )
+    parser.add_argument(
+        "decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
+    )
+    parser.add_argument("--csv", metavar="", help="If set, export to CSV")
+    parser.add_argument(
+        "--verbose", "-v", action="store_true", help="Show recomp addresses in output"
+    )
+
+    (args, _) = parser.parse_known_args()
+
+    if not os.path.isfile(args.original):
+        parser.error(f"Original binary {args.original} does not exist")
+
+    if not os.path.isfile(args.recompiled):
+        parser.error(f"Recompiled binary {args.recompiled} does not exist")
+
+    if not os.path.isfile(args.pdb):
+        parser.error(f"Symbols PDB {args.pdb} does not exist")
+
+    if not os.path.isdir(args.decomp_dir):
+        parser.error(f"Source directory {args.decomp_dir} does not exist")
+
+    return args
+
+
+def main():
+    args = parse_args()
+
+    with IsleBin(args.original, find_str=True) as orig_bin, IsleBin(
+        args.recompiled
+    ) as recomp_bin:
+        engine = IsleCompare(orig_bin, recomp_bin, args.pdb, args.decomp_dir)
+
+        module_map = ModuleMap(args.pdb, recomp_bin)
+
+        def is_same_section(orig: int, recomp: int) -> bool:
+            """Compare the section name instead of the index.
+            LEGO1.dll adds extra sections for some reason.
+            (Smacker library?)"""
+
+            try:
+                orig_name = orig_bin.sections[orig - 1].name
+                recomp_name = recomp_bin.sections[recomp - 1].name
+                return orig_name == recomp_name
+            except IndexError:
+                return False
+
+        def to_roadmap_row(match):
+            orig_sect = None
+            orig_ofs = None
+            orig_sect_ofs = None
+            recomp_sect = None
+            recomp_ofs = None
+            recomp_sect_ofs = None
+            orig_addr = None
+            recomp_addr = None
+            displacement = None
+            module_name = None
+
+            if match.recomp_addr is not None:
+                if (module_ref := module_map.get_module(match.recomp_addr)) is not None:
+                    (_, module_name) = module_ref
+
+            row_type = match_type_abbreviation(match.compare_type)
+            name = (
+                repr(match.name)
+                if match.compare_type == SymbolType.STRING
+                else match.name
+            )
+
+            if match.orig_addr is not None:
+                orig_addr = match.orig_addr
+                (orig_sect, orig_ofs) = orig_bin.get_relative_addr(match.orig_addr)
+                orig_sect_ofs = f"{orig_sect:04}:{orig_ofs:08x}"
+
+            if match.recomp_addr is not None:
+                recomp_addr = match.recomp_addr
+                (recomp_sect, recomp_ofs) = recomp_bin.get_relative_addr(
+                    match.recomp_addr
+                )
+                recomp_sect_ofs = f"{recomp_sect:04}:{recomp_ofs:08x}"
+
+            if (
+                orig_sect is not None
+                and recomp_sect is not None
+                and is_same_section(orig_sect, recomp_sect)
+            ):
+                displacement = recomp_ofs - orig_ofs
+
+            return RoadmapRow(
+                orig_sect_ofs,
+                recomp_sect_ofs,
+                orig_addr,
+                recomp_addr,
+                displacement,
+                row_type,
+                match.size,
+                name,
+                module_name,
+            )
+
+        results = list(map(to_roadmap_row, engine.get_all()))
+
+        if args.csv is None:
+            if args.verbose:
+                print("ORIG sections:")
+                print_sections(orig_bin.sections)
+
+                print("RECOMP sections:")
+                print_sections(recomp_bin.sections)
+
+                print_text_report(results)
+            else:
+                print_diff_report(results)
+
+        if args.csv is not None:
+            export_to_csv(args.csv, results)
+
+
+if __name__ == "__main__":
+    main()