Roadmap tool to compare binary structure (#479)

2025-04-03 10:19:45 -04:00 · 2024-01-22 10:15:12 -05:00 · 2024-01-22 10:15:12 -05:00 · a65eb9a4e0
commit a65eb9a4e0
parent 05bc94f030
7 changed files with 320 additions and 3 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -299,8 +299,8 @@ set_property(TARGET lego1 PROPERTY SUFFIX ".DLL")
 if (ISLE_BUILD_APP)
  add_executable(isle WIN32
    ISLE/res/isle.rc
-    ISLE/isleapp.cpp
    ISLE/define.cpp
+    ISLE/isleapp.cpp
  )

  target_compile_definitions(isle PRIVATE ISLE_APP)
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@ -1,6 +1,6 @@
 import logging
 import struct
-from typing import List, Optional
+from typing import List, Optional, Tuple
 from dataclasses import dataclass
 from collections import namedtuple

@ -365,6 +365,14 @@ class Bin:
        into an absolute vaddr."""
        return self.get_section_offset_by_index(section) + offset

+    def get_relative_addr(self, addr: int) -> Tuple[int, int]:
+        """Translate an absolute address into its (section, offset) pair.
+
+        Section numbers are 1-based, so the first entry of self.sections is
+        section 1, and the offset is counted from that section's
+        virtual_address. An address that no section contains yields (0, 0)."""
+        for number, sect in enumerate(self.sections, start=1):
+            if not sect.contains_vaddr(addr):
+                continue
+
+            return (number, addr - sect.virtual_address)
+
+        return (0, 0)
+
    def get_raw_addr(self, vaddr: int) -> int:
        """Returns the raw offset in the PE binary for the given virtual address."""
        self._set_section_for_vaddr(vaddr)
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@ -409,6 +409,9 @@ class Compare:

    ## Public API

+    def get_all(self) -> List[MatchInfo]:
+        """Return every entry of the compare db, as provided by its get_all()."""
+        return self._db.get_all()
+
    def get_functions(self) -> List[MatchInfo]:
+        """Return the db matches whose type is SymbolType.FUNCTION."""
        return self._db.get_matches_by_type(SymbolType.FUNCTION)

--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@ -82,6 +82,17 @@ class CompareDb:

        return [string for (string,) in cur.fetchall()]

+    def get_all(self) -> List[MatchInfo]:
+        """Fetch every row of the `symbols` table as a MatchInfo.
+
+        Rows are sorted by orig_addr, with rows that have no orig_addr placed
+        at the end. Each row is converted by matchinfo_factory."""
+        query = """SELECT compare_type, orig_addr, recomp_addr, name, size
+            FROM `symbols`
+            ORDER BY orig_addr NULLS LAST
+            """
+        cursor = self._db.execute(query)
+        cursor.row_factory = matchinfo_factory
+
+        return cursor.fetchall()
+
    def get_matches(self) -> Optional[MatchInfo]:
        cur = self._db.execute(
            """SELECT compare_type, orig_addr, recomp_addr, name, size
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@ -39,6 +39,9 @@ _gdata32_regex = re.compile(
    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
 )

+# One line of the MODULES section: a four-character id followed by one or two
+# quoted paths. With two paths the first is the lib and the second the obj;
+# with a single path it is the obj and the `lib` group is None.
+# e.g. 0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"
+# e.g. 0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"
+_module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")

 # User functions only
 LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
@ -52,13 +55,16 @@ PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
 SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")

 # (Estimated) size of any symbol
-SizeRefEntry = namedtuple("SizeRefEntry", "section offset size")
+SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")

 # global variables
 GdataEntry = namedtuple("GdataEntry", "section offset type name")

+# Object file (and the lib file it came from, if any) linked into the binary
+ModuleEntry = namedtuple("ModuleEntry", "id lib obj")
+

 class CvdumpParser:
+    # pylint: disable=too-many-instance-attributes
    def __init__(self) -> None:
        self._section: str = ""
        self._lines_function: Tuple[str, int] = ("", 0)
@ -68,6 +74,7 @@ class CvdumpParser:
        self.symbols = []
        self.sizerefs = []
        self.globals = []
+        self.modules = []

    def _lines_section(self, line: str):
        """Parsing entries from the LINES section. We only care about the pairs of
@ -144,12 +151,26 @@ class CvdumpParser:
        if (match := _section_contrib_regex.match(line)) is not None:
            self.sizerefs.append(
                SizeRefEntry(
+                    module=int(match.group("module"), 16),
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    size=int(match.group("size"), 16),
                )
            )

+    def _modules_section(self, line: str):
+        """Parse one line of the MODULES section and, if it matches, store a
+        ModuleEntry for the object file (and lib file, if used) it names.
+        The id is read as hexadecimal; it is the number that the
+        SECTION CONTRIBUTIONS entries refer to. Other lines are ignored."""
+        match = _module_regex.match(line)
+        if match is None:
+            return
+
+        entry = ModuleEntry(
+            id=int(match.group("id"), 16),
+            lib=match.group("lib"),
+            obj=match.group("obj"),
+        )
+        self.modules.append(entry)
+
    def read_line(self, line: str):
        # Blank lines are there to help the reader; they have no context significance
        if line.strip() == "":
@ -174,6 +195,9 @@ class CvdumpParser:
        elif self._section == "GLOBALS":
            self._globals_section(line)

+        elif self._section == "MODULES":
+            self._modules_section(line)
+
    def read_lines(self, lines: Iterable[str]):
        for line in lines:
            self.read_line(line)
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@ -13,6 +13,7 @@ class DumpOpt(Enum):
    GLOBALS = 2
    PUBLICS = 3
    SECTION_CONTRIB = 4
+    MODULES = 5


 cvdump_opt_map = {
@ -21,6 +22,7 @@ cvdump_opt_map = {
    DumpOpt.GLOBALS: "-g",
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
+    DumpOpt.MODULES: "-m",
 }


@ -49,6 +51,10 @@ class Cvdump:
        self._options.add(DumpOpt.SECTION_CONTRIB)
        return self

+    def modules(self):
+        """Request the MODULES dump (the "-m" flag). Returns self so that
+        option calls can be chained."""
+        self._options.add(DumpOpt.MODULES)
+        return self
+
    def cmd_line(self) -> List[str]:
        cvdump_exe = lib_path_join("cvdump.exe")
        flags = [cvdump_opt_map[opt] for opt in self._options]
--- a/tools/roadmap/roadmap.py
+++ b/tools/roadmap/roadmap.py
@ -0,0 +1,265 @@
+"""For all addresses matched by code annotations or recomp pdb,
+report how "far off" the recomp symbol is from its proper place
+in the original binary."""
+
+import os
+import argparse
+import csv
+import logging
+from typing import List, Optional, Tuple
+from collections import namedtuple
+from isledecomp import Bin as IsleBin
+from isledecomp.cvdump import Cvdump
+from isledecomp.compare import Compare as IsleCompare
+from isledecomp.types import SymbolType
+
+# Ignore all compare-db messages. With a NullHandler attached, the logging
+# module finds a handler for these records and so does not fall back to
+# printing them on stderr.
+logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
+
+
+def or_blank(value) -> str:
+    """Render value for text output: None becomes the empty string and
+    anything else is passed through str()."""
+    if value is None:
+        return ""
+
+    return str(value)
+
+
+class ModuleMap:
+    """Load a subset of sections from the pdb to allow you to look up the
+    module (lib and obj file) based on the recomp address."""
+
+    def __init__(self, pdb, binfile) -> None:
+        """Run cvdump on pdb for the SECTION CONTRIBUTIONS and MODULES
+        sections. binfile is used to turn each contribution's
+        (section, offset) into an absolute address."""
+        cvdump = Cvdump(pdb).section_contributions().modules().run()
+
+        # module id -> (lib, obj)
+        self.module_lookup = {m.id: (m.lib, m.obj) for m in cvdump.modules}
+
+        # (absolute start address, size, module id) for each contribution.
+        # Contributions whose section number is not valid for binfile are
+        # left out because they cannot be converted to an address.
+        self.section_contrib = [
+            (
+                binfile.get_abs_addr(sizeref.section, sizeref.offset),
+                sizeref.size,
+                sizeref.module,
+            )
+            for sizeref in cvdump.sizerefs
+            if binfile.is_valid_section(sizeref.section)
+        ]
+
+    def get_module(self, addr: int) -> Optional[Tuple[Optional[str], str]]:
+        """Return the (lib, obj) pair of the module whose section contribution
+        contains addr. lib is None for an object file that was not linked
+        from a library. Returns None if no contribution with a known module
+        id contains addr."""
+        for start, size, module_id in self.section_contrib:
+            if start <= addr < start + size:
+                if (module := self.module_lookup.get(module_id)) is not None:
+                    return module
+
+        return None
+
+
+def print_sections(sections):
+    """Print a table with the name, start address, virtual size and raw size
+    (all sizes and addresses in hex) of each section, then a blank line."""
+    for heading in (
+        "    name |    start |   v.size | raw size",
+        "---------|----------|----------|----------",
+    ):
+        print(heading)
+
+    for section in sections:
+        # The name is a fixed-width byte string padded with NUL characters.
+        label = section.name.decode("ascii").rstrip("\x00")
+        columns = (
+            f"{label:>8}",
+            f"{section.virtual_address:8x}",
+            f"{section.virtual_size:8x}",
+            f"{section.size_of_raw_data:8x}",
+        )
+        print(" | ".join(columns))
+
+    print()
+
+
+def match_type_abbreviation(mtype: Optional[SymbolType]) -> str:
+    """Return the first three characters of the lowercased SymbolType name,
+    or the empty string when no type is given."""
+    return "" if mtype is None else mtype.name.lower()[:3]
+
+
+# One line of the report, built in main() from a single compare-db match.
+RoadmapRow = namedtuple(
+    "RoadmapRow",
+    [
+        # "section:offset" text for the original address (None if no address)
+        "orig_sect_ofs",
+        # same, for the recomp address
+        "recomp_sect_ofs",
+        # absolute addresses as reported by the match; either may be None
+        "orig_addr",
+        "recomp_addr",
+        # recomp offset minus orig offset; None unless both addresses are in
+        # sections with the same name
+        "displacement",
+        # abbreviated SymbolType name ("" if the match has no type)
+        "sym_type",
+        "size",
+        # symbol name; for string symbols this is the repr() of the text
+        "name",
+        # obj file of the module that contains the recomp address, if found
+        "module",
+    ],
+)
+
+
+def print_text_report(results: List[RoadmapRow]):
+    """Print one line per row, with both the original and the recomp
+    section:offset, the displacement, type, size and name. Missing values
+    are printed as blanks."""
+    for row in results:
+        columns = (
+            format(or_blank(row.orig_sect_ofs), "14"),
+            format(or_blank(row.recomp_sect_ofs), "14"),
+            format(or_blank(row.displacement), ">8"),
+            format(row.sym_type, "3"),
+            format(or_blank(row.size), "6"),
+            or_blank(row.name),
+        )
+        print("  ".join(columns))
+
+
+def print_diff_report(results: List[RoadmapRow]):
+    """Print only the rows for which both the original and the recomp
+    address are known.
+    This is intended for generating a file to diff against.
+    The recomp addresses are always changing so we hide those."""
+    for row in results:
+        if row.orig_addr is not None and row.recomp_addr is not None:
+            columns = (
+                format(or_blank(row.orig_sect_ofs), "14"),
+                format(or_blank(row.displacement), ">8"),
+                format(row.sym_type, "3"),
+                format(or_blank(row.size), "6"),
+                or_blank(row.name),
+            )
+            print("  ".join(columns))
+
+
+def export_to_csv(csv_file: str, results: List[RoadmapRow]):
+    """Write a header line followed by one line per row to csv_file (UTF-8).
+
+    Fields are written by csv.writer so that a value containing a comma, a
+    double quote or a line break (template names and repr'd strings can
+    contain them) is quoted instead of spilling into extra columns.
+    None values are written as empty fields."""
+    header = [
+        "orig_sect_ofs",
+        "recomp_sect_ofs",
+        "orig_addr",
+        "recomp_addr",
+        "displacement",
+        "row_type",
+        "size",
+        "name",
+        "module",
+    ]
+
+    with open(csv_file, "w", encoding="utf-8") as f:
+        # The file is opened in text mode, so "\n" still becomes the
+        # platform's line ending, exactly as when the lines were written
+        # by hand. Rows with no special characters come out unchanged.
+        writer = csv.writer(f, lineterminator="\n")
+        writer.writerow(header)
+        writer.writerows(results)
+
+
+def parse_args() -> argparse.Namespace:
+    """Read the command line. Unrecognized arguments are ignored.
+
+    The three file arguments must name existing files and decomp_dir must be
+    an existing directory; otherwise the parser reports the first problem
+    found and exits."""
+    cli = argparse.ArgumentParser(
+        description="Show all addresses from original and recomp."
+    )
+
+    positionals = (
+        ("original", "original-binary", "The original binary"),
+        ("recompiled", "recompiled-binary", "The recompiled binary"),
+        ("pdb", "recompiled-pdb", "The PDB of the recompiled binary"),
+        ("decomp_dir", "decomp-dir", "The decompiled source tree"),
+    )
+    for dest, metavar, help_text in positionals:
+        cli.add_argument(dest, metavar=metavar, help=help_text)
+
+    cli.add_argument("--csv", metavar="<file>", help="If set, export to CSV")
+    cli.add_argument(
+        "--verbose", "-v", action="store_true", help="Show recomp addresses in output"
+    )
+
+    args, _unknown = cli.parse_known_args()
+
+    # Checked in this order; cli.error() exits at the first failure.
+    checks = (
+        (
+            os.path.isfile,
+            args.original,
+            f"Original binary {args.original} does not exist",
+        ),
+        (
+            os.path.isfile,
+            args.recompiled,
+            f"Recompiled binary {args.recompiled} does not exist",
+        ),
+        (os.path.isfile, args.pdb, f"Symbols PDB {args.pdb} does not exist"),
+        (
+            os.path.isdir,
+            args.decomp_dir,
+            f"Source directory {args.decomp_dir} does not exist",
+        ),
+    )
+    for exists, path, message in checks:
+        if not exists(path):
+            cli.error(message)
+
+    return args
+
+
+def main():
+    """Compare the original and recompiled binaries and report, for every
+    entry of the compare db, where it is located in each binary.
+    The report is written to a CSV file if --csv is given and printed
+    otherwise (with section tables and recomp addresses if --verbose)."""
+    args = parse_args()
+
+    with IsleBin(args.original, find_str=True) as orig_bin, IsleBin(
+        args.recompiled
+    ) as recomp_bin:
+        engine = IsleCompare(orig_bin, recomp_bin, args.pdb, args.decomp_dir)
+
+        module_map = ModuleMap(args.pdb, recomp_bin)
+
+        def is_same_section(orig: int, recomp: int) -> bool:
+            """Compare the section name instead of the index.
+            LEGO1.dll adds extra sections for some reason. (Smacker library?)"""
+
+            # Section numbers are 1-based and get_relative_addr() uses 0 for
+            # an address that is not in any section. Reject those here:
+            # index -1 would not raise IndexError, it would silently select
+            # the last section and produce a meaningless displacement.
+            if orig < 1 or recomp < 1:
+                return False
+
+            try:
+                orig_name = orig_bin.sections[orig - 1].name
+                recomp_name = recomp_bin.sections[recomp - 1].name
+            except IndexError:
+                return False
+
+            return orig_name == recomp_name
+
+        def to_roadmap_row(match):
+            """Convert one match from the compare db into a RoadmapRow."""
+            orig_sect = None
+            orig_ofs = None
+            orig_sect_ofs = None
+            recomp_sect = None
+            recomp_ofs = None
+            recomp_sect_ofs = None
+            orig_addr = None
+            recomp_addr = None
+            displacement = None
+            module_name = None
+
+            row_type = match_type_abbreviation(match.compare_type)
+
+            # Strings are shown as their repr() so that line breaks and other
+            # special characters stay on one line of the report.
+            name = (
+                repr(match.name)
+                if match.compare_type == SymbolType.STRING
+                else match.name
+            )
+
+            if match.orig_addr is not None:
+                orig_addr = match.orig_addr
+                (orig_sect, orig_ofs) = orig_bin.get_relative_addr(match.orig_addr)
+                orig_sect_ofs = f"{orig_sect:04}:{orig_ofs:08x}"
+
+            if match.recomp_addr is not None:
+                recomp_addr = match.recomp_addr
+                (recomp_sect, recomp_ofs) = recomp_bin.get_relative_addr(
+                    match.recomp_addr
+                )
+                recomp_sect_ofs = f"{recomp_sect:04}:{recomp_ofs:08x}"
+
+                # Only the obj name of the (lib, obj) pair is reported.
+                if (module_ref := module_map.get_module(match.recomp_addr)) is not None:
+                    (_, module_name) = module_ref
+
+            # Offsets are only comparable within the same section.
+            if (
+                orig_sect is not None
+                and recomp_sect is not None
+                and is_same_section(orig_sect, recomp_sect)
+            ):
+                displacement = recomp_ofs - orig_ofs
+
+            return RoadmapRow(
+                orig_sect_ofs,
+                recomp_sect_ofs,
+                orig_addr,
+                recomp_addr,
+                displacement,
+                row_type,
+                match.size,
+                name,
+                module_name,
+            )
+
+        results = list(map(to_roadmap_row, engine.get_all()))
+
+        if args.csv is not None:
+            export_to_csv(args.csv, results)
+        elif args.verbose:
+            print("ORIG sections:")
+            print_sections(orig_bin.sections)
+
+            print("RECOMP sections:")
+            print_sections(recomp_bin.sections)
+
+            print_text_report(results)
+        else:
+            print_diff_report(results)
+
+
+if __name__ == "__main__":
+    main()