diff --git a/tools/isledecomp/isledecomp/bin.py b/tools/isledecomp/isledecomp/bin.py index 38245ace..bee28444 100644 --- a/tools/isledecomp/isledecomp/bin.py +++ b/tools/isledecomp/isledecomp/bin.py @@ -1,47 +1,200 @@ import struct +from collections import namedtuple + + +class MZHeaderNotFoundError(Exception): + """MZ magic string not found at the start of the binary.""" + + +class PEHeaderNotFoundError(Exception): + """PE magic string not found at the offset given in 0x3c.""" + + +class SectionNotFoundError(KeyError): + """The specified section was not found in the file.""" + + +class InvalidVirtualAddressError(IndexError): + """The given virtual address is too high or low + to point to something in the binary file.""" + + +PEHeader = namedtuple( + "PEHeader", + [ + "Signature", + "Machine", + "NumberOfSections", + "TimeDateStamp", + "PointerToSymbolTable", # deprecated + "NumberOfSymbols", # deprecated + "SizeOfOptionalHeader", + "Characteristics", + ], +) + +ImageSectionHeader = namedtuple( + "ImageSectionHeader", + [ + "Name", + "Misc", + "VirtualAddress", + "SizeOfRawData", + "PointerToRawData", + "PointerToRelocations", + "PointerToLineNumbers", + "NumberOfRelocations", + "NumberOfLineNumbers", + "Characteristics", + ], +) + + +def section_name_match(section, name): + return section.Name == struct.pack("8s", name.encode("ascii")) + + +def section_contains_vaddr(section, imagebase, vaddr) -> bool: + debased = vaddr - imagebase + ofs = debased - section.VirtualAddress + return 0 <= ofs < section.SizeOfRawData -# Declare a class that can automatically convert virtual executable addresses -# to file addresses class Bin: - def __init__(self, filename, logger): + """Parses a PE format EXE and allows reading data from a virtual address. + Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format""" + + def __init__(self, filename, logger=None): self.logger = logger - self.logger.debug('Parsing headers of "%s"... ', filename) + self._debuglog(f'Parsing headers of "{filename}"... ') self.filename = filename self.file = None self.imagebase = None - self.textvirt = None - self.textraw = None + self.sections = [] + self.last_section = None def __enter__(self): - self.logger.debug(f"Bin {self.filename} Enter") + self._debuglog(f"Bin {self.filename} Enter") self.file = open(self.filename, "rb") - # HACK: Strictly, we should be parsing the header, but we know where - # everything is in these two files so we just jump straight there + (mz_str,) = struct.unpack("2s", self.file.read(2)) + if mz_str != b"MZ": + raise MZHeaderNotFoundError - # Read ImageBase - self.file.seek(0xB4) - (self.imagebase,) = struct.unpack(" int: + """The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB + where A is the index (1-based) into the section table and B is the local offset. + This will return the virtual address for the start of the section at the given index + so you can get the virtual address for whatever symbol you are looking at. + """ + + section = self.sections[index - 1] + return self.imagebase + section.VirtualAddress + + def get_section_offset_by_name(self, name) -> int: + """Same as above, but use the section name as the lookup""" + + section = self._get_section_by_name(name) + return self.imagebase + section.VirtualAddress + + def get_raw_addr(self, vaddr) -> int: + """Returns the raw offset in the PE binary for the given virtual address.""" + self._set_section_for_vaddr(vaddr) + return ( + vaddr + - self.imagebase + - self.last_section.VirtualAddress + + self.last_section.PointerToRawData + ) + + def is_valid_vaddr(self, vaddr) -> bool: + """Does this virtual address point to anything in the exe?""" + section = next( + filter( + lambda section: section_contains_vaddr(section, self.imagebase, vaddr), + self.sections, + ), + None, + ) + + return section is not None def read(self, offset, size): - self.file.seek(self.get_addr(offset)) - return self.file.read(size) + self._set_section_for_vaddr(offset) + + raw_addr = self.get_raw_addr(offset) + self.file.seek(raw_addr) + + # Clamp the read within the extent of the current section. + # Reading off the end will most likely misrepresent the virtual addressing. + _size = min( + size, + self.last_section.PointerToRawData + + self.last_section.SizeOfRawData + - raw_addr, + ) + return self.file.read(_size) diff --git a/tools/isledecomp/isledecomp/syminfo.py b/tools/isledecomp/isledecomp/syminfo.py index a14bbbe2..6a26e30f 100644 --- a/tools/isledecomp/isledecomp/syminfo.py +++ b/tools/isledecomp/isledecomp/syminfo.py @@ -40,11 +40,12 @@ class SymInfo: current_section = line[4:] if current_section == "SYMBOLS" and "S_GPROC32" in line: + sym_section = int(line[21:25], 16) sym_addr = int(line[26:34], 16) info = RecompiledInfo() - info.addr = ( - sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt + info.addr = sym_addr + sym_recompfile.get_section_offset_by_index( + sym_section ) use_dbg_offs = False diff --git a/tools/reccmp/reccmp.py b/tools/reccmp/reccmp.py index f7659703..d2a7f23a 100755 --- a/tools/reccmp/reccmp.py +++ b/tools/reccmp/reccmp.py @@ -86,7 +86,7 @@ def sanitize(file, placeholder_generator, mnemonic, op_str): for i, word in enumerate(words): try: inttest = int(word, 16) - if inttest >= file.imagebase + file.textvirt: + if inttest >= file.get_section_offset_by_index(1): words[i] = placeholder_generator.get(inttest) except ValueError: pass