Roadmap tool to compare binary structure (#479)

2024-11-22 15:48:09 -05:00 · 2024-01-22 10:15:12 -05:00 · 2024-01-22 10:15:12 -05:00 · a65eb9a4e0
commit a65eb9a4e0
parent 05bc94f030
7 changed files with 320 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -299,8 +299,8 @@ set_property(TARGET lego1 PROPERTY SUFFIX ".DLL")
 if (ISLE_BUILD_APP)
  add_executable(isle WIN32
    ISLE/res/isle.rc
    ISLE/isleapp.cpp
    ISLE/define.cpp
    ISLE/isleapp.cpp
  )
  target_compile_definitions(isle PRIVATE ISLE_APP)
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@ -1,6 +1,6 @@
 import logging
 import struct
-from typing import List, Optional
+from typing import List, Optional, Tuple
 from dataclasses import dataclass
 from collections import namedtuple
@ -365,6 +365,14 @@ def get_abs_addr(self, section: int, offset: int) -> int:
        into an absolute vaddr."""
        return self.get_section_offset_by_index(section) + offset
    def get_relative_addr(self, addr: int) -> Tuple[int, int]:
        """Translate an absolute virtual address into a (section, offset) pair.

        The section number is 1-based and the offset is measured from that
        section's virtual address. (0, 0) is returned when no section reports
        that it contains `addr`."""
        candidates = (
            (number, sect)
            for number, sect in enumerate(self.sections, start=1)
            if sect.contains_vaddr(addr)
        )
        # Sections are tested in order and only the first hit is used.
        hit = next(candidates, None)
        if hit is None:
            return (0, 0)
        number, sect = hit
        return (number, addr - sect.virtual_address)
    def get_raw_addr(self, vaddr: int) -> int:
        """Returns the raw offset in the PE binary for the given virtual address."""
        self._set_section_for_vaddr(vaddr)
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@ -409,6 +409,9 @@ def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
    ## Public API
    def get_all(self) -> List[MatchInfo]:
        """Return the full entry list exactly as the compare database's own
        get_all() reports it."""
        entries = self._db.get_all()
        return entries
    def get_functions(self) -> List[MatchInfo]:
        """Return the database matches whose type is SymbolType.FUNCTION."""
        wanted = SymbolType.FUNCTION
        return self._db.get_matches_by_type(wanted)
--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@ -82,6 +82,17 @@ def get_unmatched_strings(self) -> List[str]:
        return [string for (string,) in cur.fetchall()]
    def get_all(self) -> List[MatchInfo]:
        """Fetch every row of the `symbols` table.

        Rows come back ordered by orig_addr, with rows that have no orig_addr
        placed at the end. Each row is built by matchinfo_factory."""
        query = """SELECT compare_type, orig_addr, recomp_addr, name, size
            FROM `symbols`
            ORDER BY orig_addr NULLS LAST
            """
        cursor = self._db.execute(query)
        cursor.row_factory = matchinfo_factory
        return cursor.fetchall()
    def get_matches(self) -> Optional[MatchInfo]:
        cur = self._db.execute(
            """SELECT compare_type, orig_addr, recomp_addr, name, size
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@ -39,6 +39,9 @@
    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
 )
 # Matches one entry line of the MODULES section: a four-character id followed
 # by an optional quoted lib path and then a quoted obj path. When the line has
 # a single quoted string it is captured as `obj` and `lib` is left as None.
 # e.g. 0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"
 # e.g. 0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"
 _module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")
 # User functions only
 LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
@ -52,13 +55,16 @@
 SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
 # (Estimated) size of any symbol
-SizeRefEntry = namedtuple("SizeRefEntry", "section offset size")
+SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")
 # global variables
 GdataEntry = namedtuple("GdataEntry", "section offset type name")
 ModuleEntry = namedtuple("ModuleEntry", "id lib obj")
 class CvdumpParser:
    # pylint: disable=too-many-instance-attributes
    def __init__(self) -> None:
        self._section: str = ""
        self._lines_function: Tuple[str, int] = ("", 0)
@ -68,6 +74,7 @@ def __init__(self) -> None:
        self.symbols = []
        self.sizerefs = []
        self.globals = []
        self.modules = []
    def _lines_section(self, line: str):
        """Parsing entries from the LINES section. We only care about the pairs of
@ -144,12 +151,26 @@ def _section_contributions(self, line: str):
        if (match := _section_contrib_regex.match(line)) is not None:
            self.sizerefs.append(
                SizeRefEntry(
                    module=int(match.group("module"), 16),
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    size=int(match.group("size"), 16),
                )
            )
    def _modules_section(self, line: str):
        """Parse one line of the MODULES section into a ModuleEntry.

        Each entry names the object file (and the lib file, when one was used)
        linked into the binary. The id is parsed as hexadecimal; it is the value
        cross-referenced in SECTION CONTRIBUTIONS (and perhaps other locations).
        Lines that do not match the module pattern are ignored."""
        match = _module_regex.match(line)
        if match is None:
            return

        entry = ModuleEntry(
            id=int(match.group("id"), 16),
            lib=match.group("lib"),
            obj=match.group("obj"),
        )
        self.modules.append(entry)
    def read_line(self, line: str):
        # Blank lines are there to help the reader; they have no context significance
        if line.strip() == "":
@ -174,6 +195,9 @@ def read_line(self, line: str):
        elif self._section == "GLOBALS":
            self._globals_section(line)
        elif self._section == "MODULES":
            self._modules_section(line)
    def read_lines(self, lines: Iterable[str]):
        """Feed each item of `lines`, in order, to read_line()."""
        for text in lines:
            self.read_line(text)
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@ -13,6 +13,7 @@ class DumpOpt(Enum):
    GLOBALS = 2
    PUBLICS = 3
    SECTION_CONTRIB = 4
    MODULES = 5
 cvdump_opt_map = {
@ -21,6 +22,7 @@ class DumpOpt(Enum):
    DumpOpt.GLOBALS: "-g",
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
    DumpOpt.MODULES: "-m",
 }
@ -49,6 +51,10 @@ def section_contributions(self):
        self._options.add(DumpOpt.SECTION_CONTRIB)
        return self
    def modules(self):
        """Add the MODULES dump to the requested options.

        Returns self so that option calls can be chained."""
        requested = DumpOpt.MODULES
        self._options.add(requested)
        return self
    def cmd_line(self) -> List[str]:
        cvdump_exe = lib_path_join("cvdump.exe")
        flags = [cvdump_opt_map[opt] for opt in self._options]
--- a/tools/roadmap/roadmap.py
+++ b/tools/roadmap/roadmap.py
@ -0,0 +1,265 @@
 """For all addresses matched by code annotations or recomp pdb,
 report how "far off" the recomp symbol is from its proper place
 in the original binary."""
 import argparse
 import csv
 import logging
 import os
 from collections import namedtuple
 from typing import List, Optional
 from isledecomp import Bin as IsleBin
 from isledecomp.cvdump import Cvdump
 from isledecomp.compare import Compare as IsleCompare
 from isledecomp.types import SymbolType
 # Ignore all compare-db messages.
 logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
 def or_blank(value) -> str:
    """Render `value` for text output: None becomes the empty string and
    anything else is passed through str()."""
    if value is None:
        return ""
    return str(value)
 class ModuleMap:
    """Look up which module contributed a given recomp address.

    Built from the section contributions and the module list of the recomp
    pdb. A lookup yields the module's (lib, obj) pair rather than its number."""

    def __init__(self, pdb, binfile) -> None:
        """Run cvdump on `pdb` and index its output.

        `binfile` is used to turn each contribution's (section, offset) into
        an absolute address; contributions whose section `binfile` does not
        consider valid are dropped."""
        cvdump = Cvdump(pdb).section_contributions().modules().run()
        # module id -> (lib, obj)
        self.module_lookup = {m.id: (m.lib, m.obj) for m in cvdump.modules}
        # (absolute start address, size, module id), in cvdump order
        self.section_contrib = [
            (
                binfile.get_abs_addr(sizeref.section, sizeref.offset),
                sizeref.size,
                sizeref.module,
            )
            for sizeref in cvdump.sizerefs
            if binfile.is_valid_section(sizeref.section)
        ]

    def get_module(self, addr: int) -> Optional[tuple]:
        """Return the (lib, obj) pair of the module whose contribution covers
        `addr`, or None when there is none.

        Contributions are checked in order. One that covers `addr` but refers
        to an unknown module id is skipped and the search continues."""
        for start, size, module_id in self.section_contrib:
            if start <= addr < start + size:
                if (module := self.module_lookup.get(module_id)) is not None:
                    return module
        return None
 def print_sections(sections):
    """Print a small table with one line per section: its name, virtual
    address, virtual size and raw data size (numbers in hex), followed by a
    blank line."""
    print("    name |    start |   v.size | raw size")
    print("---------|----------|----------|----------")
    for sect in sections:
        # The name is stored as NUL-padded bytes.
        label = sect.name.decode("ascii").rstrip("\x00")
        columns = (
            f"{label:>8}",
            f"{sect.virtual_address:8x}",
            f"{sect.virtual_size:8x}",
            f"{sect.size_of_raw_data:8x}",
        )
        print(" | ".join(columns))
    print()
 def match_type_abbreviation(mtype: Optional[SymbolType]) -> str:
    """Abbreviate a SymbolType to the first three letters of its lowercased
    name. A missing type yields the empty string."""
    return "" if mtype is None else mtype.name.lower()[:3]
 # One line of the report. The field order is also the column order of the
 # exported CSV file.
 RoadmapRow = namedtuple(
    "RoadmapRow",
    "orig_sect_ofs recomp_sect_ofs orig_addr recomp_addr displacement "
    "sym_type size name module",
 )
 def print_text_report(results: List[RoadmapRow]):
    """Print every row, showing both the original and the recomp
    section:offset, then displacement, type, size and name. Missing values
    are left blank."""
    for row in results:
        columns = (
            f"{or_blank(row.orig_sect_ofs):14}",
            f"{or_blank(row.recomp_sect_ofs):14}",
            f"{or_blank(row.displacement):>8}",
            f"{row.sym_type:3}",
            f"{or_blank(row.size):6}",
            or_blank(row.name),
        )
        print("  ".join(columns))
 def print_diff_report(results: List[RoadmapRow]):
    """Print only the rows that have both an original and a recomp address.

    The output is meant to be saved and diffed between runs, so the recomp
    address column is left out because it changes all the time."""
    for row in results:
        if row.orig_addr is not None and row.recomp_addr is not None:
            columns = (
                f"{or_blank(row.orig_sect_ofs):14}",
                f"{or_blank(row.displacement):>8}",
                f"{row.sym_type:3}",
                f"{or_blank(row.size):6}",
                or_blank(row.name),
            )
            print("  ".join(columns))
 def export_to_csv(csv_file: str, results: "List[RoadmapRow]"):
    """Write `results` to `csv_file`: a header line, then one record per row.

    Fields are emitted through csv.writer so that values containing commas,
    quotes or newlines (C++ template names, string literals) are quoted
    instead of silently shifting the columns. None is written as an empty
    field and every other value as its str() form."""
    header = [
        "orig_sect_ofs",
        "recomp_sect_ofs",
        "orig_addr",
        "recomp_addr",
        "displacement",
        "row_type",
        "size",
        "name",
        "module",
    ]
    with open(csv_file, "w", encoding="utf-8") as f:
        # Rows end in "\n" and the file keeps its default newline handling,
        # so line endings are the same as when the lines were written by hand.
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(results)
 def parse_args() -> argparse.Namespace:
    """Parse the command line and verify that the given paths exist.

    Arguments the parser does not recognize are discarded. A missing file or
    directory is reported through parser.error(), which ends the program."""
    parser = argparse.ArgumentParser(
        description="Show all addresses from original and recomp."
    )
    positionals = (
        ("original", "original-binary", "The original binary"),
        ("recompiled", "recompiled-binary", "The recompiled binary"),
        ("pdb", "recompiled-pdb", "The PDB of the recompiled binary"),
        ("decomp_dir", "decomp-dir", "The decompiled source tree"),
    )
    for dest, metavar, help_text in positionals:
        parser.add_argument(dest, metavar=metavar, help=help_text)
    parser.add_argument("--csv", metavar="<file>", help="If set, export to CSV")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show recomp addresses in output"
    )
    args, _unknown = parser.parse_known_args()

    # Checked in this order; the first failure ends the run.
    path_checks = (
        (
            os.path.isfile,
            args.original,
            f"Original binary {args.original} does not exist",
        ),
        (
            os.path.isfile,
            args.recompiled,
            f"Recompiled binary {args.recompiled} does not exist",
        ),
        (os.path.isfile, args.pdb, f"Symbols PDB {args.pdb} does not exist"),
        (
            os.path.isdir,
            args.decomp_dir,
            f"Source directory {args.decomp_dir} does not exist",
        ),
    )
    for exists, path, message in path_checks:
        if not exists(path):
            parser.error(message)
    return args
 def main():
    """Compare both binaries and report where each symbol ended up.

    Every entry returned by the compare engine becomes one RoadmapRow. The
    rows are written to the --csv file when one is given. Otherwise they are
    printed: the full text report (preceded by both section tables) with
    --verbose, or the diff-friendly report without it."""
    args = parse_args()
    with IsleBin(args.original, find_str=True) as orig_bin, IsleBin(
        args.recompiled
    ) as recomp_bin:
        engine = IsleCompare(orig_bin, recomp_bin, args.pdb, args.decomp_dir)
        module_map = ModuleMap(args.pdb, recomp_bin)

        def is_same_section(orig: int, recomp: int) -> bool:
            """Compare the section name instead of the index.
            LEGO1.dll adds extra sections for some reason. (Smacker library?)"""
            # Section numbers are 1-based. Anything below 1 (e.g. a 0 standing
            # for "address not in any section") must be rejected up front:
            # index -1 would otherwise silently pick the last section instead
            # of raising IndexError, and a bogus displacement would be reported.
            if orig < 1 or recomp < 1:
                return False
            try:
                orig_name = orig_bin.sections[orig - 1].name
                recomp_name = recomp_bin.sections[recomp - 1].name
            except IndexError:
                return False
            return orig_name == recomp_name

        def to_roadmap_row(match):
            """Convert one compare-db entry into a RoadmapRow. Anything that
            cannot be determined for the entry is left as None."""
            orig_sect = None
            orig_ofs = None
            orig_sect_ofs = None
            recomp_sect = None
            recomp_ofs = None
            recomp_sect_ofs = None
            orig_addr = None
            recomp_addr = None
            displacement = None
            module_name = None
            if match.recomp_addr is not None:
                if (module_ref := module_map.get_module(match.recomp_addr)) is not None:
                    # Only the obj part of the (lib, obj) pair is reported.
                    (_, module_name) = module_ref
            row_type = match_type_abbreviation(match.compare_type)
            # Strings are shown via repr() so the text stays on a single line.
            name = (
                repr(match.name)
                if match.compare_type == SymbolType.STRING
                else match.name
            )
            if match.orig_addr is not None:
                orig_addr = match.orig_addr
                (orig_sect, orig_ofs) = orig_bin.get_relative_addr(match.orig_addr)
                orig_sect_ofs = f"{orig_sect:04}:{orig_ofs:08x}"
            if match.recomp_addr is not None:
                recomp_addr = match.recomp_addr
                (recomp_sect, recomp_ofs) = recomp_bin.get_relative_addr(
                    match.recomp_addr
                )
                recomp_sect_ofs = f"{recomp_sect:04}:{recomp_ofs:08x}"
            # A displacement only makes sense when both addresses are known and
            # fall into the same (named) section of their binary.
            if (
                orig_sect is not None
                and recomp_sect is not None
                and is_same_section(orig_sect, recomp_sect)
            ):
                displacement = recomp_ofs - orig_ofs
            return RoadmapRow(
                orig_sect_ofs,
                recomp_sect_ofs,
                orig_addr,
                recomp_addr,
                displacement,
                row_type,
                match.size,
                name,
                module_name,
            )

        results = list(map(to_roadmap_row, engine.get_all()))
        if args.csv is None:
            if args.verbose:
                print("ORIG sections:")
                print_sections(orig_bin.sections)
                print("RECOMP sections:")
                print_sections(recomp_bin.sections)
                print_text_report(results)
            else:
                print_diff_report(results)
        else:
            export_to_csv(args.csv, results)
 if __name__ == "__main__":
    main()