Make reccmp more compatible with debug builds (#674)

2024-11-22 23:48:12 -05:00 · 2024-03-15 11:17:08 -04:00 · 2024-03-15 11:17:08 -04:00 · 5e0e7ab908
commit 5e0e7ab908
parent 331aac73f2
4 changed files with 95 additions and 8 deletions
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@ -112,6 +112,7 @@ def __init__(self, filename: str, find_str: bool = False) -> None:
        self._relocated_addrs = set()
        self.imports = []
        self.thunks = []
        self.exports: List[Tuple[int, str]] = []
    def __enter__(self):
        logger.debug("Bin %s Enter", self.filename)
@ -137,6 +138,11 @@ def __enter__(self):
        (entry,) = struct.unpack("<i", optional_hdr[0x10:0x14])
        self.entry = entry + self.imagebase
        (number_of_rva,) = struct.unpack("<i", optional_hdr[0x5C:0x60])
        data_dictionaries = [
            *struct.iter_unpack("<2I", optional_hdr[0x60 : 0x60 + number_of_rva * 8])
        ]
        headers_view = optional_hdr[
            pe_hdr.SizeOfOptionalHeader : pe_hdr.SizeOfOptionalHeader
            + 0x28 * pe_hdr.NumberOfSections
@ -164,6 +170,8 @@ def __enter__(self):
        self._populate_relocations()
        self._populate_imports()
        self._populate_thunks()
        # Export dir is always first
        self._populate_exports(*data_dictionaries[0])
        # This is a (semi) expensive lookup that is not necesssary in every case.
        # We can find strings in the original if we have coverage using STRING markers.
@ -344,6 +352,42 @@ def _populate_thunks(self):
                    thunk_ofs = ofs + shift + i * 6
                    self.thunks.append((thunk_ofs, jmp_ofs))
    def _populate_exports(self, export_rva: int, _: int):
        """If you are missing a lot of annotations in your file
        (e.g. debug builds) then you can at least match up the
        export symbol names."""
        # Null = no exports
        if export_rva == 0:
            return
        export_start = self.imagebase + export_rva
        # TODO: namedtuple
        export_table = struct.unpack("<2L2H7L", self.read(export_start, 40))
        # TODO: if the number of functions doesn't match the number of names,
        # are the remaining functions ordinals?
        n_functions = export_table[6]
        func_start = export_start + 40
        func_addrs = [
            self.imagebase + rva
            for rva, in struct.iter_unpack("<L", self.read(func_start, 4 * n_functions))
        ]
        name_start = func_start + 4 * n_functions
        name_addrs = [
            self.imagebase + rva
            for rva, in struct.iter_unpack("<L", self.read(name_start, 4 * n_functions))
        ]
        combined = zip(func_addrs, name_addrs)
        self.exports = [
            (func_addr, self.read_string(name_addr))
            for (func_addr, name_addr) in combined
        ]
    def get_section_by_name(self, name: str) -> Section:
        section = next(
            filter(lambda section: section.match_name(name), self.sections),
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@ -85,6 +85,7 @@ def __init__(
        self._load_markers()
        self._find_original_strings()
        self._match_thunks()
        self._match_exports()
    def _load_cvdump(self):
        logger.info("Parsing %s ...", self.pdb_file)
@ -166,12 +167,11 @@ def _load_cvdump(self):
        self._db.set_function_pair(self.orig_bin.entry, self.recomp_bin.entry)
    def _load_markers(self):
-        # Guess at module name from PDB file name
+        # Assume module name is the base filename of the original binary.
-        # reccmp checks the original binary filename; we could use this too
+        (module, _) = os.path.splitext(os.path.basename(self.orig_bin.filename))
        (module, _) = os.path.splitext(os.path.basename(self.pdb_file))
        codefiles = list(walk_source_dir(self.code_dir))
-        codebase = DecompCodebase(codefiles, module)
+        codebase = DecompCodebase(codefiles, module.upper())
        # Match lineref functions first because this is a guaranteed match.
        # If we have two functions that share the same name, and one is
@ -274,6 +274,17 @@ def _match_thunks(self):
                # function in the first place.
                self._db.skip_compare(thunk_from_orig)
    def _match_exports(self):
        # invert for name lookup
        orig_exports = {y: x for (x, y) in self.orig_bin.exports}
        for recomp_addr, export_name in self.recomp_bin.exports:
            orig_addr = orig_exports.get(export_name)
            if orig_addr is not None and self._db.set_pair_tentative(
                orig_addr, recomp_addr
            ):
                logger.debug("Matched export %s", repr(export_name))
    def _compare_function(self, match: MatchInfo) -> DiffReport:
        orig_raw = self.orig_bin.read(match.orig_addr, match.size)
        recomp_raw = self.recomp_bin.read(match.recomp_addr, match.size)
--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@ -86,7 +86,7 @@ def set_recomp_symbol(
    ):
        # Ignore collisions here. The same recomp address can have
        # multiple names (e.g. _strlwr and __strlwr)
-        if self.recomp_used(addr):
+        if self._recomp_used(addr):
            return
        compare_value = compare_type.value if compare_type is not None else None
@ -166,18 +166,18 @@ def get_matches_by_type(self, compare_type: SymbolType) -> List[MatchInfo]:
        return cur.fetchall()
-    def orig_used(self, addr: int) -> bool:
+    def _orig_used(self, addr: int) -> bool:
        cur = self._db.execute("SELECT 1 FROM symbols WHERE orig_addr = ?", (addr,))
        return cur.fetchone() is not None
-    def recomp_used(self, addr: int) -> bool:
+    def _recomp_used(self, addr: int) -> bool:
        cur = self._db.execute("SELECT 1 FROM symbols WHERE recomp_addr = ?", (addr,))
        return cur.fetchone() is not None
    def set_pair(
        self, orig: int, recomp: int, compare_type: Optional[SymbolType] = None
    ) -> bool:
-        if self.orig_used(orig):
+        if self._orig_used(orig):
            logger.error("Original address %s not unique!", hex(orig))
            return False
@ -189,6 +189,32 @@ def set_pair(
        return cur.rowcount > 0
    def set_pair_tentative(
        self, orig: int, recomp: int, compare_type: Optional[SymbolType] = None
    ) -> bool:
        """Declare a match for the original and recomp addresses given, but only if:
        1. The original address is not used elsewhere (as with set_pair)
        2. The recomp address has not already been matched
        If the compare_type is given, update this also, but only if NULL in the db.
        The purpose here is to set matches found via some automated analysis
        but to not overwrite a match provided by the human operator."""
        if self._orig_used(orig):
            # Probable and expected situation. Just ignore it.
            return False
        compare_value = compare_type.value if compare_type is not None else None
        cur = self._db.execute(
            """UPDATE `symbols`
            SET orig_addr = ?, compare_type = coalesce(compare_type, ?)
            WHERE recomp_addr = ?
            AND orig_addr IS NULL""",
            (orig, compare_value, recomp),
        )
        return cur.rowcount > 0
    def set_function_pair(self, orig: int, recomp: int) -> bool:
        """For lineref match or _entry"""
        return self.set_pair(orig, recomp, SymbolType.FUNCTION)
--- a/tools/isledecomp/tests/test_islebin.py
+++ b/tools/isledecomp/tests/test_islebin.py
@ -144,3 +144,9 @@ def test_imports(import_ref: Tuple[str, str, int], binfile: IsleBin):
@pytest.mark.parametrize("thunk_ref", THUNKS)
 def test_thunks(thunk_ref: Tuple[int, int], binfile: IsleBin):
    assert thunk_ref in binfile.thunks
 def test_exports(binfile: IsleBin):
    assert len(binfile.exports) == 130
    assert (0x1003BFB0, b"??0LegoBackgroundColor@@QAE@PBD0@Z") in binfile.exports
    assert (0x10091EE0, b"_DllMain@12") in binfile.exports