Read section info from PE header (#311)

* Read section info from PE header

* Remove the need for textraw and textvirt members

* typo
This commit is contained in:
MS 2023-12-06 14:30:09 -05:00 committed by GitHub
parent a7b81539b1
commit b46801a774
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 181 additions and 27 deletions

View file

@ -1,47 +1,200 @@
import struct import struct
from collections import namedtuple
class MZHeaderNotFoundError(Exception):
    """MZ magic string not found at the start of the binary."""
class PEHeaderNotFoundError(Exception):
    """PE magic string not found at the offset given in 0x3c."""
class SectionNotFoundError(KeyError):
    """The specified section was not found in the file."""
class InvalidVirtualAddressError(IndexError):
    """The given virtual address does not point into the binary file.

    Raised when the address is too high or too low to fall inside any
    section read from the file.
    """
# Record for the 0x18 bytes found at the PE header offset. The field order
# must stay in step with the "<2s2x2H3I2H" format used to unpack it.
PEHeader = namedtuple(
    "PEHeader",
    (
        "Signature",  # 2 bytes, followed by 2 skipped pad bytes
        "Machine",  # 16-bit
        "NumberOfSections",  # 16-bit
        "TimeDateStamp",  # 32-bit
        "PointerToSymbolTable",  # 32-bit, deprecated
        "NumberOfSymbols",  # 32-bit, deprecated
        "SizeOfOptionalHeader",  # 16-bit
        "Characteristics",  # 16-bit
    ),
)
# Record for one 0x28-byte entry of the section table. The field order must
# stay in step with the "<8s6I2HI" format used to unpack it.
ImageSectionHeader = namedtuple(
    "ImageSectionHeader",
    (
        "Name",  # 8 raw bytes
        "Misc",  # 32-bit
        "VirtualAddress",  # 32-bit, relative to the image base
        "SizeOfRawData",  # 32-bit
        "PointerToRawData",  # 32-bit file offset of the section data
        "PointerToRelocations",  # 32-bit
        "PointerToLineNumbers",  # 32-bit
        "NumberOfRelocations",  # 16-bit
        "NumberOfLineNumbers",  # 16-bit
        "Characteristics",  # 32-bit
    ),
)
def section_name_match(section, name):
    """Return True when ``section.Name`` holds the section name ``name``.

    ``section.Name`` is the raw 8-byte field from the section table, so
    ``name`` is ASCII-encoded and NUL-padded to 8 bytes before comparing.
    A name longer than 8 bytes cannot fit in that field and never matches;
    it is not truncated to its first 8 bytes.

    Raises UnicodeEncodeError if ``name`` contains non-ASCII characters.
    """
    encoded = name.encode("ascii")
    if len(encoded) > 8:
        return False
    return section.Name == encoded.ljust(8, b"\x00")
def section_contains_vaddr(section, imagebase, vaddr) -> bool:
    """Return True when virtual address ``vaddr`` lies inside ``section``.

    ``vaddr`` is first rebased by subtracting ``imagebase``. It is inside the
    section when the result falls in the half-open range
    ``[VirtualAddress, VirtualAddress + SizeOfRawData)``.
    """
    section_offset = vaddr - imagebase - section.VirtualAddress
    return 0 <= section_offset < section.SizeOfRawData
# Declare a class that can automatically convert virtual executable addresses
# to file addresses
class Bin:
    """Parses a PE format EXE and allows reading data from a virtual address.

    Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format

    Intended to be used as a context manager: entering opens the file and
    parses its headers, exiting closes the file again.
    """

    def __init__(self, filename, logger=None):
        """Remember the path to parse; the file is not opened until __enter__.

        ``logger`` is optional. When given, its ``debug`` method receives
        progress messages.
        """
        self.logger = logger
        self._debuglog(f'Parsing headers of "{filename}"... ')
        self.filename = filename
        self.file = None
        self.imagebase = None
        self.sections = []
        # Section used by the most recent address lookup; checked first on
        # the next lookup before the whole section table is searched.
        self.last_section = None

    def __enter__(self):
        """Open the file and parse the MZ/PE headers and the section table.

        Raises:
            MZHeaderNotFoundError: the file does not start with "MZ".
            PEHeaderNotFoundError: no PE header at the offset stored at 0x3C.
            SectionNotFoundError: the file has no ".text" section.
        """
        self._debuglog(f"Bin {self.filename} Enter")
        self.file = open(self.filename, "rb")
        try:
            self._parse_headers()
        except BaseException:
            # __exit__ is not called when __enter__ raises, so the handle
            # has to be released here before re-raising.
            self.file.close()
            self.file = None
            raise

        self._debuglog("... Parsing finished")
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the file if it is open."""
        self._debuglog(f"Bin {self.filename} Exit")
        if self.file:
            self.file.close()

    def _debuglog(self, msg):
        """Write to the logger, if present"""
        if self.logger is not None:
            self.logger.debug(msg)

    def _read_struct(self, fmt, error):
        """Read and unpack ``fmt`` from the current file position.

        Raises ``error`` if the file ends before the whole structure is read.
        """
        expected = struct.calcsize(fmt)
        data = self.file.read(expected)
        if len(data) != expected:
            raise error
        return struct.unpack(fmt, data)

    def _parse_headers(self):
        """Fill ``imagebase``, ``sections`` and ``last_section`` from the file."""
        if self.file.read(2) != b"MZ":
            raise MZHeaderNotFoundError

        # Skip to PE header offset in MZ header.
        self.file.seek(0x3C)
        (pe_header_start,) = self._read_struct("<I", PEHeaderNotFoundError)

        # PE header offset is absolute, so seek there
        self.file.seek(pe_header_start)
        pe_hdr = PEHeader(*self._read_struct("<2s2x2H3I2H", PEHeaderNotFoundError))
        if pe_hdr.Signature != b"PE":
            raise PEHeaderNotFoundError

        optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader)
        # ImageBase is an address and must be read unsigned. Its position and
        # width depend on the optional header magic: PE32+ (0x20B) stores
        # 8 bytes at 0x18, otherwise it is 4 bytes at 0x1C.
        (magic,) = struct.unpack_from("<H", optional_hdr, 0)
        if magic == 0x20B:
            (self.imagebase,) = struct.unpack_from("<Q", optional_hdr, 0x18)
        else:
            (self.imagebase,) = struct.unpack_from("<I", optional_hdr, 0x1C)

        # The section table follows the optional header directly.
        self.sections = [
            ImageSectionHeader(*struct.unpack("<8s6I2HI", self.file.read(0x28)))
            for _ in range(pe_hdr.NumberOfSections)
        ]

        # Start with .text as the cached section for address lookups.
        self.last_section = self._get_section_by_name(".text")

    def _find_section_for_vaddr(self, vaddr):
        """Return the first section containing ``vaddr``, or None."""
        # TODO: assumes no potential for section overlap. reasonable?
        return next(
            (
                section
                for section in self.sections
                if section_contains_vaddr(section, self.imagebase, vaddr)
            ),
            None,
        )

    def _set_section_for_vaddr(self, vaddr):
        """Point ``last_section`` at the section containing ``vaddr``.

        Raises InvalidVirtualAddressError when no section contains the
        address; in that case the previously cached section is kept.
        """
        if self.last_section is not None and section_contains_vaddr(
            self.last_section, self.imagebase, vaddr
        ):
            return

        section = self._find_section_for_vaddr(vaddr)
        if section is None:
            raise InvalidVirtualAddressError(vaddr)
        self.last_section = section

    def _get_section_by_name(self, name):
        """Return the first section named ``name``.

        Raises SectionNotFoundError when there is no such section.
        """
        section = next(
            (section for section in self.sections if section_name_match(section, name)),
            None,
        )
        if section is None:
            raise SectionNotFoundError(name)
        return section

    def get_section_offset_by_index(self, index) -> int:
        """The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
        where A is the index (1-based) into the section table and B is the local offset.
        This will return the virtual address for the start of the section at the given index
        so you can get the virtual address for whatever symbol you are looking at.

        Raises IndexError when ``index`` is below 1 or past the last section.
        """
        # Reject 0 and negatives explicitly: list indexing would otherwise
        # wrap around and silently return a section from the end of the table.
        if index < 1:
            raise IndexError(f"section index must be 1-based, got {index}")
        section = self.sections[index - 1]
        return self.imagebase + section.VirtualAddress

    def get_section_offset_by_name(self, name) -> int:
        """Same as above, but use the section name as the lookup"""
        section = self._get_section_by_name(name)
        return self.imagebase + section.VirtualAddress

    def get_raw_addr(self, vaddr) -> int:
        """Returns the raw offset in the PE binary for the given virtual address.

        Raises InvalidVirtualAddressError when no section contains ``vaddr``.
        """
        self._set_section_for_vaddr(vaddr)
        return (
            vaddr
            - self.imagebase
            - self.last_section.VirtualAddress
            + self.last_section.PointerToRawData
        )

    def is_valid_vaddr(self, vaddr) -> bool:
        """Does this virtual address point to anything in the exe?"""
        return any(
            section_contains_vaddr(section, self.imagebase, vaddr)
            for section in self.sections
        )

    def read(self, offset, size):
        """Read up to ``size`` bytes starting at virtual address ``offset``.

        Raises InvalidVirtualAddressError when no section contains ``offset``.
        """
        # get_raw_addr also selects the section containing the address.
        raw_addr = self.get_raw_addr(offset)
        section = self.last_section
        self.file.seek(raw_addr)

        # Clamp the read within the extent of the current section.
        # Reading off the end will most likely misrepresent the virtual addressing.
        section_end = section.PointerToRawData + section.SizeOfRawData
        return self.file.read(min(size, section_end - raw_addr))

View file

@ -40,11 +40,12 @@ def __init__(self, pdb, sym_recompfile, sym_logger, sym_wine_path_converter=None
current_section = line[4:] current_section = line[4:]
if current_section == "SYMBOLS" and "S_GPROC32" in line: if current_section == "SYMBOLS" and "S_GPROC32" in line:
sym_section = int(line[21:25], 16)
sym_addr = int(line[26:34], 16) sym_addr = int(line[26:34], 16)
info = RecompiledInfo() info = RecompiledInfo()
info.addr = ( info.addr = sym_addr + sym_recompfile.get_section_offset_by_index(
sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt sym_section
) )
use_dbg_offs = False use_dbg_offs = False

View file

@ -86,7 +86,7 @@ def filter_out_ptr(ptype, op_str):
for i, word in enumerate(words): for i, word in enumerate(words):
try: try:
inttest = int(word, 16) inttest = int(word, 16)
if inttest >= file.imagebase + file.textvirt: if inttest >= file.get_section_offset_by_index(1):
words[i] = placeholder_generator.get(inttest) words[i] = placeholder_generator.get(inttest)
except ValueError: except ValueError:
pass pass