Identify thunks in debug builds (#741)

2024-11-26 01:17:55 -05:00 · 2024-03-27 16:13:23 -04:00 · 2024-03-27 16:13:23 -04:00 · 3b68a90a08
commit 3b68a90a08
parent 97ebb22c42
3 changed files with 88 additions and 22 deletions
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@ -113,6 +113,7 @@ def __init__(self, filename: str, find_str: bool = False) -> None:
        self.imports = []
        self.thunks = []
        self.exports: List[Tuple[int, str]] = []
        self.is_debug: bool = False
    def __enter__(self):
        logger.debug("Bin %s Enter", self.filename)
@ -143,6 +144,13 @@ def __enter__(self):
            *struct.iter_unpack("<2I", optional_hdr[0x60 : 0x60 + number_of_rva * 8])
        ]
        # Check for presence of .debug subsection in .rdata
        try:
            if data_dictionaries[6][0] != 0:
                self.is_debug = True
        except IndexError:
            pass
        headers_view = optional_hdr[
            pe_hdr.SizeOfOptionalHeader : pe_hdr.SizeOfOptionalHeader
            + 0x28 * pe_hdr.NumberOfSections
@ -337,9 +345,27 @@ def _populate_thunks(self):
        Search .text to find these functions."""
        text_sect = self.get_section_by_name(".text")
        text_start = text_sect.virtual_address
        # If this is a debug build, read the thunks at the start of .text
        # Terminated by a big block of 0xcc padding bytes before the first
        # real function in the section.
        if self.is_debug:
            ofs = 0
            while True:
                (opcode, operand) = struct.unpack("<Bi", text_sect.view[ofs : ofs + 5])
                if opcode != 0xE9:
                    break
                thunk_ofs = text_start + ofs
                jmp_ofs = text_start + ofs + 5 + operand
                self.thunks.append((thunk_ofs, jmp_ofs))
                ofs += 5
        # Now check for import thunks which are present in debug and release.
        # These use an absolute JMP with the 2 byte opcode: 0xff 0x25
        idata_sect = self.get_section_by_name(".idata")
-        start = text_sect.virtual_address
+        ofs = text_start
-        ofs = start
        for shift in (0, 2, 4):
            window = text_sect.view[shift:]
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@ -84,6 +84,7 @@ def __init__(
        self._load_cvdump()
        self._load_markers()
        self._find_original_strings()
        self._match_imports()
        self._match_thunks()
        self._match_exports()
        self._find_vtordisp()
@ -250,7 +251,9 @@ def _find_original_strings(self):
            self._db.match_string(addr, string)
-    def _match_thunks(self):
+    def _match_imports(self):
        """We can match imported functions based on the DLL name and
        function symbol name."""
        orig_byaddr = {
            addr: (dll.upper(), name) for (dll, name, addr) in self.orig_bin.imports
        }
@ -268,27 +271,41 @@ def _match_thunks(self):
        # Now: we have the IAT offset in each matched up, so we need to make
        # the connection between the thunk functions.
        # We already have the symbol name we need from the PDB.
        orig_thunks = {
            iat_ofs: func_ofs for (func_ofs, iat_ofs) in self.orig_bin.thunks
        }
        recomp_thunks = {
            iat_ofs: func_ofs for (func_ofs, iat_ofs) in self.recomp_bin.thunks
        }
        for orig, recomp in orig_to_recomp.items():
            self._db.set_pair(orig, recomp, SymbolType.POINTER)
            thunk_from_orig = orig_thunks.get(orig, None)
            thunk_from_recomp = recomp_thunks.get(recomp, None)
-            if thunk_from_orig is not None and thunk_from_recomp is not None:
+    def _match_thunks(self):
-                self._db.set_function_pair(thunk_from_orig, thunk_from_recomp)
+        """Thunks are (by nature) matched by indirection. If a thunk from orig
-                # Don't compare thunk functions for now. The comparison isn't
+        points at a function we have already matched, we can find the matching
-                # "useful" in the usual sense. We are only looking at the 6
+        thunk in recomp because it points to the same place."""
-                # bytes of the jmp instruction and not the larger context of
+
-                # where this function is. Also: these will always match 100%
+        # Turn this one inside out for easy lookup
-                # because we are searching for a match to register this as a
+        recomp_thunks = {
-                # function in the first place.
+            func_addr: thunk_addr for (thunk_addr, func_addr) in self.recomp_bin.thunks
-                self._db.skip_compare(thunk_from_orig)
+        }
        for orig_thunk, orig_addr in self.orig_bin.thunks:
            orig_func = self._db.get_by_orig(orig_addr)
            if orig_func is None or orig_func.recomp_addr is None:
                continue
            # Check whether the thunk destination is a matched symbol
            recomp_thunk = recomp_thunks.get(orig_func.recomp_addr)
            if recomp_thunk is None:
                continue
            # The thunk symbol should already exist if it is the thunk of an
            # imported function. Incremental build thunks have no symbol,
            # so we need to give it a name for the asm diff output.
            self._db.register_thunk(orig_thunk, recomp_thunk, orig_func.name)
            # Don't compare thunk functions for now. The comparison isn't
            # "useful" in the usual sense. We are only looking at the
            # bytes of the jmp instruction and not the larger context of
            # where this function is. Also: these will always match 100%
            # because we are searching for a match to register this as a
            # function in the first place.
            self._db.skip_compare(orig_thunk)
    def _match_exports(self):
        # invert for name lookup
@ -560,7 +577,7 @@ def match_text(m: Optional[MatchInfo], raw_addr: Optional[int] = None) -> str:
    def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
        """Router for comparison type"""
-        if match.size == 0:
+        if match.size is None or match.size == 0:
            return None
        options = self._db.get_match_options(match.orig_addr)
--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@ -221,6 +221,29 @@ def set_function_pair(self, orig: int, recomp: int) -> bool:
        """For lineref match or _entry"""
        return self.set_pair(orig, recomp, SymbolType.FUNCTION)
    def register_thunk(self, orig: int, recomp: int, name: str) -> bool:
        """Pair up a thunk found at `orig` (original binary) and `recomp`
        (recompiled binary), creating a symbol row for it when needed.

        The thunk may or may not be tracked in the db already. We first try
        to pair the two addresses as a function. If that attempt reports
        failure, a new row is inserted instead, labelled after `name`
        (presumably the name of the function the thunk jumps to).

        Returns True if the pair was set or if the insert affected a row."""

        # Optimistic path: treat the row as already present and just pair it.
        if self.set_function_pair(orig, recomp):
            return True

        # No pairing happened, so mock up a symbol for the thunk ourselves.
        # The trailing 5 is the size: we assume the thunk is a relative
        # jump instruction (1 opcode byte + 4 byte displacement).
        new_row = (
            orig,
            recomp,
            SymbolType.FUNCTION.value,
            f"Thunk of '{name}'",
            5,
        )
        insert_stmt = """INSERT INTO `symbols`
            (orig_addr, recomp_addr, compare_type, name, size)
            VALUES (?,?,?,?,?)"""

        result = self._db.execute(insert_stmt, new_row)
        return result.rowcount > 0
    def _set_opt_bool(self, addr: int, option: str, enabled: bool = True):
        if enabled:
            self._db.execute(