Identify thunks in debug builds (#741)

This commit is contained in:
MS 2024-03-27 16:13:23 -04:00 committed by GitHub
parent 97ebb22c42
commit 3b68a90a08
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 88 additions and 22 deletions

View file

@ -113,6 +113,7 @@ def __init__(self, filename: str, find_str: bool = False) -> None:
self.imports = [] self.imports = []
self.thunks = [] self.thunks = []
self.exports: List[Tuple[int, str]] = [] self.exports: List[Tuple[int, str]] = []
self.is_debug: bool = False
def __enter__(self): def __enter__(self):
logger.debug("Bin %s Enter", self.filename) logger.debug("Bin %s Enter", self.filename)
@ -143,6 +144,13 @@ def __enter__(self):
*struct.iter_unpack("<2I", optional_hdr[0x60 : 0x60 + number_of_rva * 8]) *struct.iter_unpack("<2I", optional_hdr[0x60 : 0x60 + number_of_rva * 8])
] ]
# Check for presence of .debug subsection in .rdata
try:
if data_dictionaries[6][0] != 0:
self.is_debug = True
except IndexError:
pass
headers_view = optional_hdr[ headers_view = optional_hdr[
pe_hdr.SizeOfOptionalHeader : pe_hdr.SizeOfOptionalHeader pe_hdr.SizeOfOptionalHeader : pe_hdr.SizeOfOptionalHeader
+ 0x28 * pe_hdr.NumberOfSections + 0x28 * pe_hdr.NumberOfSections
@ -337,9 +345,27 @@ def _populate_thunks(self):
Search .text to find these functions.""" Search .text to find these functions."""
text_sect = self.get_section_by_name(".text") text_sect = self.get_section_by_name(".text")
text_start = text_sect.virtual_address
# If this is a debug build, read the thunks at the start of .text.
# The run of thunks is terminated by a big block of 0xcc padding bytes
# before the first real function in the section.
if self.is_debug:
ofs = 0
while True:
(opcode, operand) = struct.unpack("<Bi", text_sect.view[ofs : ofs + 5])
if opcode != 0xE9:
break
thunk_ofs = text_start + ofs
jmp_ofs = text_start + ofs + 5 + operand
self.thunks.append((thunk_ofs, jmp_ofs))
ofs += 5
# Now check for import thunks which are present in debug and release.
# These use an absolute JMP with the 2 byte opcode: 0xff 0x25
idata_sect = self.get_section_by_name(".idata") idata_sect = self.get_section_by_name(".idata")
start = text_sect.virtual_address ofs = text_start
ofs = start
for shift in (0, 2, 4): for shift in (0, 2, 4):
window = text_sect.view[shift:] window = text_sect.view[shift:]

View file

@ -84,6 +84,7 @@ def __init__(
self._load_cvdump() self._load_cvdump()
self._load_markers() self._load_markers()
self._find_original_strings() self._find_original_strings()
self._match_imports()
self._match_thunks() self._match_thunks()
self._match_exports() self._match_exports()
self._find_vtordisp() self._find_vtordisp()
@ -250,7 +251,9 @@ def _find_original_strings(self):
self._db.match_string(addr, string) self._db.match_string(addr, string)
def _match_thunks(self): def _match_imports(self):
"""We can match imported functions based on the DLL name and
function symbol name."""
orig_byaddr = { orig_byaddr = {
addr: (dll.upper(), name) for (dll, name, addr) in self.orig_bin.imports addr: (dll.upper(), name) for (dll, name, addr) in self.orig_bin.imports
} }
@ -268,27 +271,41 @@ def _match_thunks(self):
# Now: we have the IAT offset in each matched up, so we need to make # Now: we have the IAT offset in each matched up, so we need to make
# the connection between the thunk functions. # the connection between the thunk functions.
# We already have the symbol name we need from the PDB. # We already have the symbol name we need from the PDB.
orig_thunks = {
iat_ofs: func_ofs for (func_ofs, iat_ofs) in self.orig_bin.thunks
}
recomp_thunks = {
iat_ofs: func_ofs for (func_ofs, iat_ofs) in self.recomp_bin.thunks
}
for orig, recomp in orig_to_recomp.items(): for orig, recomp in orig_to_recomp.items():
self._db.set_pair(orig, recomp, SymbolType.POINTER) self._db.set_pair(orig, recomp, SymbolType.POINTER)
thunk_from_orig = orig_thunks.get(orig, None)
thunk_from_recomp = recomp_thunks.get(recomp, None)
if thunk_from_orig is not None and thunk_from_recomp is not None: def _match_thunks(self):
self._db.set_function_pair(thunk_from_orig, thunk_from_recomp) """Thunks are (by nature) matched by indirection. If a thunk from orig
# Don't compare thunk functions for now. The comparison isn't points at a function we have already matched, we can find the matching
# "useful" in the usual sense. We are only looking at the 6 thunk in recomp because it points to the same place."""
# bytes of the jmp instruction and not the larger context of
# where this function is. Also: these will always match 100% # Turn this one inside out for easy lookup
# because we are searching for a match to register this as a recomp_thunks = {
# function in the first place. func_addr: thunk_addr for (thunk_addr, func_addr) in self.recomp_bin.thunks
self._db.skip_compare(thunk_from_orig) }
for orig_thunk, orig_addr in self.orig_bin.thunks:
orig_func = self._db.get_by_orig(orig_addr)
if orig_func is None or orig_func.recomp_addr is None:
continue
# Check whether the thunk destination is a matched symbol
recomp_thunk = recomp_thunks.get(orig_func.recomp_addr)
if recomp_thunk is None:
continue
# The thunk symbol should already exist if it is the thunk of an
# imported function. Incremental build thunks have no symbol,
# so we need to give it a name for the asm diff output.
self._db.register_thunk(orig_thunk, recomp_thunk, orig_func.name)
# Don't compare thunk functions for now. The comparison isn't
# "useful" in the usual sense. We are only looking at the
# bytes of the jmp instruction and not the larger context of
# where this function is. Also: these will always match 100%
# because we are searching for a match to register this as a
# function in the first place.
self._db.skip_compare(orig_thunk)
def _match_exports(self): def _match_exports(self):
# invert for name lookup # invert for name lookup
@ -560,7 +577,7 @@ def match_text(m: Optional[MatchInfo], raw_addr: Optional[int] = None) -> str:
def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]: def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
"""Router for comparison type""" """Router for comparison type"""
if match.size == 0: if match.size is None or match.size == 0:
return None return None
options = self._db.get_match_options(match.orig_addr) options = self._db.get_match_options(match.orig_addr)

View file

@ -221,6 +221,29 @@ def set_function_pair(self, orig: int, recomp: int) -> bool:
"""For lineref match or _entry""" """For lineref match or _entry"""
return self.set_pair(orig, recomp, SymbolType.FUNCTION) return self.set_pair(orig, recomp, SymbolType.FUNCTION)
def register_thunk(self, orig: int, recomp: int, name: str) -> bool:
    """Record the thunk at the orig/recomp address pair as a function.

    The thunk may or may not already be tracked in the db. Pairing the
    addresses on an existing row is tried first; if that does not
    succeed, a new function row is inserted whose name is mocked up
    from `name` (presumably the name of the function being thunked).

    Returns True if the pair was set on an existing row or a new row
    was inserted."""
    # Try the existing-row path first; nothing more to do if it worked.
    paired = self.set_function_pair(orig, recomp)
    if paired:
        return True

    # Assuming relative jump instruction for thunks (5 bytes)
    thunk_size = 5
    new_row = (
        orig,
        recomp,
        SymbolType.FUNCTION.value,
        f"Thunk of '{name}'",
        thunk_size,
    )

    result = self._db.execute(
        """INSERT INTO `symbols`
(orig_addr, recomp_addr, compare_type, name, size)
VALUES (?,?,?,?,?)""",
        new_row,
    )
    return result.rowcount > 0
def _set_opt_bool(self, addr: int, option: str, enabled: bool = True): def _set_opt_bool(self, addr: int, option: str, enabled: bool = True):
if enabled: if enabled:
self._db.execute( self._db.execute(