Cvdump parser and comparing library functions (#383)

* Cvdump wrapper and parser. Matching library functions

* Remove 'Self' type hint (3.11+)

* Add temp reference for entrypoints

* ISLE using multithreaded libc

* 🙄
This commit is contained in:
MS 2023-12-28 16:10:57 -05:00 committed by GitHub
parent ff4845a6ea
commit 9a6d555508
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 395 additions and 117 deletions

View file

@ -294,7 +294,7 @@ if (MSVC)
# game was originally built with) and tweaked slightly to produce more debugging info for reccmp.
# They ensure a recompilation that can be byte/instruction accurate to the original binaries.
if (ISLE_BUILD_APP)
target_compile_options(isle PRIVATE "/ML$<$<CONFIG:Debug>:d>")
target_compile_options(isle PRIVATE "/MT$<$<CONFIG:Debug>:d>")
endif()
target_compile_options(lego1 PRIVATE "/MT$<$<CONFIG:Debug>:d>")

47
LEGO1/library_msvc.h Normal file
View file

@ -0,0 +1,47 @@
// Address annotations for MSVC runtime library functions. Nothing in this
// file is meant to be compiled; the whole body is excluded with `#if 0`
// (`#ifdef` takes a macro name, so `#ifdef 0` is not a valid directive).
#if 0
// LIBRARY: ISLE 0x402f80
// LIBRARY: LEGO1 0x10086240
// _malloc
// LIBRARY: ISLE 0x402fa0
// LIBRARY: LEGO1 0x10086260
// _free
// LIBRARY: ISLE 0x408220
// LIBRARY: LEGO1 0x1008b400
// _atol
// LIBRARY: ISLE 0x4082d0
// LIBRARY: LEGO1 0x1008b4b0
// _atoi
// LIBRARY: LEGO1 0x1008b4c0
// _strtok
// LIBRARY: ISLE 0x4085c0
// LIBRARY: LEGO1 0x1008b5a0
// _sprintf
// LIBRARY: ISLE 0x4081e0
// _srand
// LIBRARY: ISLE 0x4081f0
// LIBRARY: LEGO1 0x1008b640
// _rand
// entry
// LIBRARY: ISLE 0x4082e0
// _WinMainCRTStartup
// entry
// LIBRARY: LEGO1 0x1008c860
// __DllMainCRTStartup@12
// LIBRARY: ISLE 0x409110
// __mtinit
// LIBRARY: ISLE 0x409190
// __getptd
#endif

View file

@ -1,4 +1,6 @@
import struct
from typing import List, Optional
from dataclasses import dataclass
from collections import namedtuple
@ -33,44 +35,56 @@ PEHeader = namedtuple(
],
)
ImageSectionHeader = namedtuple(
"ImageSectionHeader",
[
"Name",
"Misc",
"VirtualAddress",
"SizeOfRawData",
"PointerToRawData",
"PointerToRelocations",
"PointerToLineNumbers",
"NumberOfRelocations",
"NumberOfLineNumbers",
"Characteristics",
],
)
@dataclass
class ImageSectionHeader:
# pylint: disable=too-many-instance-attributes
# Most attributes are unused, but this is the struct format
name: bytes
virtual_size: int
virtual_address: int
size_of_raw_data: int
pointer_to_raw_data: int
pointer_to_relocations: int
pointer_to_line_numbers: int
number_of_relocations: int
number_of_line_numbers: int
characteristics: int
def section_name_match(section, name):
return section.Name == struct.pack("8s", name.encode("ascii"))
def match_name(self, name: str) -> bool:
    """Return True when this section's name field equals ``name``.

    The name is kept as a fixed 8-byte field, so the requested name is
    ASCII-encoded and packed to that width before the comparison."""
    wanted = name.encode("ascii")
    packed_name = struct.pack("8s", wanted)
    return self.name == packed_name
def contains_vaddr(self, vaddr: int) -> bool:
    """Return True when ``vaddr`` lies inside this section.

    The section spans from ``virtual_address`` for the larger of its raw
    (on-disk) size and its virtual size."""
    extent = max(self.size_of_raw_data, self.virtual_size)
    first = self.virtual_address
    return first <= vaddr < first + extent
def section_contains_vaddr(section, imagebase, vaddr) -> bool:
debased = vaddr - imagebase
ofs = debased - section.VirtualAddress
return 0 <= ofs < section.SizeOfRawData
def addr_is_uninitialized(self, vaddr: int) -> bool:
    """Return True when ``vaddr`` is inside this section but past the end of
    its raw (on-disk) data.

    We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
    the characteristics field so instead we determine it this way."""
    if not self.contains_vaddr(vaddr):
        return False

    raw_size = self.size_of_raw_data
    # No virtual tail beyond the raw data means nothing here is uninitialized.
    if self.virtual_size <= raw_size:
        return False

    # Also covers raw_size == 0, i.e. the entire section is uninitialized.
    return vaddr - self.virtual_address >= raw_size
class Bin:
"""Parses a PE format EXE and allows reading data from a virtual address.
Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""
def __init__(self, filename, logger=None):
# pylint: disable=too-many-instance-attributes
def __init__(self, filename: str, logger=None) -> None:
self.logger = logger
self._debuglog(f'Parsing headers of "{filename}"... ')
self.filename = filename
self.file = None
self.imagebase = None
self.sections = []
self.entry = None
self.sections: List[ImageSectionHeader] = []
self.last_section = None
self._relocated_addrs = set()
@ -95,12 +109,18 @@ class Bin:
optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader)
(self.imagebase,) = struct.unpack("<i", optional_hdr[0x1C:0x20])
(entry,) = struct.unpack("<i", optional_hdr[0x10:0x14])
self.entry = entry + self.imagebase
self.sections = [
ImageSectionHeader(*struct.unpack("<8s6I2HI", self.file.read(0x28)))
for i in range(pe_hdr.NumberOfSections)
]
# Add the imagebase here because we almost never need the base vaddr without it
for sect in self.sections:
sect.virtual_address += self.imagebase
self._populate_relocations()
text_section = self._get_section_by_name(".text")
@ -119,7 +139,7 @@ class Bin:
if self.logger is not None:
self.logger.debug(msg)
def get_relocated_addresses(self):
def get_relocated_addresses(self) -> List[int]:
return sorted(self._relocated_addrs)
def is_relocated_addr(self, vaddr) -> bool:
@ -165,27 +185,25 @@ class Bin:
(relocated_addr,) = struct.unpack("<I", self.read(addr, 4))
self._relocated_addrs.add(relocated_addr)
def _set_section_for_vaddr(self, vaddr):
if self.last_section is not None and section_contains_vaddr(
self.last_section, self.imagebase, vaddr
):
def _set_section_for_vaddr(self, vaddr: int):
if self.last_section is not None and self.last_section.contains_vaddr(vaddr):
return
# TODO: assumes no potential for section overlap. reasonable?
self.last_section = next(
filter(
lambda section: section_contains_vaddr(section, self.imagebase, vaddr),
lambda section: section.contains_vaddr(vaddr),
self.sections,
),
None,
)
if self.last_section is None:
raise InvalidVirtualAddressError
raise InvalidVirtualAddressError(f"0x{vaddr:08x}")
def _get_section_by_name(self, name):
def _get_section_by_name(self, name: str):
section = next(
filter(lambda section: section_name_match(section, name), self.sections),
filter(lambda section: section.match_name(name), self.sections),
None,
)
@ -194,7 +212,7 @@ class Bin:
return section
def get_section_offset_by_index(self, index) -> int:
def get_section_offset_by_index(self, index: int) -> int:
"""The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
where A is the index (1-based) into the section table and B is the local offset.
This will return the virtual address for the start of the section at the given index
@ -202,29 +220,33 @@ class Bin:
"""
section = self.sections[index - 1]
return self.imagebase + section.VirtualAddress
return section.virtual_address
def get_section_offset_by_name(self, name) -> int:
def get_section_offset_by_name(self, name: str) -> int:
"""Same as above, but use the section name as the lookup"""
section = self._get_section_by_name(name)
return self.imagebase + section.VirtualAddress
return section.virtual_address
def get_raw_addr(self, vaddr) -> int:
def get_abs_addr(self, section: int, offset: int) -> int:
    """Turn a section:offset pair, as printed by cvdump, into an absolute
    virtual address.

    ``section`` is the index handed to get_section_offset_by_index and
    ``offset`` is the position relative to the start of that section."""
    section_start = self.get_section_offset_by_index(section)
    return section_start + offset
def get_raw_addr(self, vaddr: int) -> int:
"""Returns the raw offset in the PE binary for the given virtual address."""
self._set_section_for_vaddr(vaddr)
return (
vaddr
- self.imagebase
- self.last_section.VirtualAddress
+ self.last_section.PointerToRawData
- self.last_section.virtual_address
+ self.last_section.pointer_to_raw_data
)
def is_valid_vaddr(self, vaddr) -> bool:
def is_valid_vaddr(self, vaddr: int) -> bool:
"""Does this virtual address point to anything in the exe?"""
section = next(
filter(
lambda section: section_contains_vaddr(section, self.imagebase, vaddr),
lambda section: section.contains_vaddr(vaddr),
self.sections,
),
None,
@ -232,9 +254,14 @@ class Bin:
return section is not None
def read(self, offset, size):
def read(self, offset: int, size: int) -> Optional[bytes]:
"""Read (at most) the given number of bytes at the given virtual address.
If we return None, the given address points to uninitialized data."""
self._set_section_for_vaddr(offset)
if self.last_section.addr_is_uninitialized(offset):
return None
raw_addr = self.get_raw_addr(offset)
self.file.seek(raw_addr)
@ -242,8 +269,8 @@ class Bin:
# Reading off the end will most likely misrepresent the virtual addressing.
_size = min(
size,
self.last_section.PointerToRawData
+ self.last_section.SizeOfRawData
self.last_section.pointer_to_raw_data
+ self.last_section.size_of_raw_data
- raw_addr,
)
return self.file.read(_size)

View file

@ -0,0 +1,2 @@
from .parser import CvdumpParser
from .runner import Cvdump

View file

@ -0,0 +1,163 @@
import re
from typing import Iterable
from collections import namedtuple
# e.g. `*** PUBLICS`
_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)")

# e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4`
_line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")

# We assume no spaces in the file name
# e.g. ` Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
_lines_subsection_header = re.compile(
    r"^\s*(?P<filename>\S+).*?, (?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), line/addr pairs = (?P<len>\d+)"
)

# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
_publics_line_regex = re.compile(
    r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
)

# e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
_symbol_line_regex = re.compile(
    r"\(\w+\) (?P<type>\S+): \[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+\S+, (?P<name>.+)"
)

# e.g. ` Debug start: 00000008, Debug end: 0000016E`
_gproc_debug_regex = re.compile(
    r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
)

# e.g. ` 00DA 0001:00000000 00000073 60501020`
# The columns are matched with `\s+` so that a row is accepted whether the
# fields are separated by one space or padded with several.
_section_contrib_regex = re.compile(
    r"\s*(?P<module>\w{4})\s+(?P<section>\w{4}):(?P<offset>\w{8})\s+(?P<size>\w{8})\s+(?P<flags>\w{8})"
)

# e.g. `S_GDATA32: [0003:000004A4], Type: T_32PRCHAR(0470), g_set`
_gdata32_regex = re.compile(
    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>\S+)"
)


# Record types produced by the parser, one per cvdump section of interest.
LinesEntry = namedtuple("LinesEntry", "filename line_no addr")
PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
SizeRefEntry = namedtuple("SizeRefEntry", "section offset size")
GdataEntry = namedtuple("GdataEntry", "section offset type name")


class CvdumpParser:
    """Line-oriented parser for the text output of cvdump.

    Feed the output one line at a time with read_line() (or all at once
    with read_lines()). A line starting with `*** NAME` switches the current
    section; every other non-blank line is handed to the handler for the
    current section, if there is one. Parsed records accumulate in the
    `lines`, `publics`, `symbols`, `sizerefs` and `globals` lists."""

    def __init__(self) -> None:
        self._section: str = ""
        self._lines_filename: str = ""

        self.lines = []  # LinesEntry records
        self.publics = []  # PublicsEntry records
        self.symbols = []  # SymbolsEntry records
        self.sizerefs = []  # SizeRefEntry records
        self.globals = []  # GdataEntry records

        # Section name (as it appears after `*** `) -> handler for its lines.
        # Lines in any other section are ignored.
        self._section_handlers = {
            "LINES": self._lines_section,
            "PUBLICS": self._publics_section,
            "SYMBOLS": self._symbols_section,
            "SECTION CONTRIBUTIONS": self._section_contributions,
            "GLOBALS": self._globals_section,
        }

    def _lines_section(self, line: str) -> None:
        """Parsing entries from the LINES section. We only care about the pairs of
        line_number and address and the subsection header to indicate which code file
        we are in."""

        # Subheader indicates a new function and possibly a new code filename.
        if (match := _lines_subsection_header.match(line)) is not None:
            self._lines_filename = match.group("filename")
            return

        # findall() always returns a list (empty when nothing matches), so
        # no None check is needed before iterating.
        for line_no, addr in _line_addr_pairs_findall.findall(line):
            self.lines.append(
                LinesEntry(
                    filename=self._lines_filename,
                    line_no=int(line_no),
                    addr=int(addr, 16),
                )
            )

    def _publics_section(self, line: str) -> None:
        """Match each line from PUBLICS and pull out the symbol information.
        These are MSVC mangled symbol names. String constants and vtable
        addresses can only be found here."""
        if (match := _publics_line_regex.match(line)) is not None:
            self.publics.append(
                PublicsEntry(
                    type=match.group("type"),
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    flags=int(match.group("flags"), 16),
                    name=match.group("name"),
                )
            )

    def _globals_section(self, line: str) -> None:
        """S_PROCREF may be useful later.
        Right now we just want S_GDATA32 symbols because it is the simplest
        way to access global variables."""
        if (match := _gdata32_regex.match(line)) is not None:
            self.globals.append(
                GdataEntry(
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    type=match.group("type"),
                    name=match.group("name"),
                )
            )

    def _symbols_section(self, line: str) -> None:
        """We are interested in S_GPROC32 symbols only."""
        match = _symbol_line_regex.match(line)
        if match is None or match.group("type") != "S_GPROC32":
            return

        self.symbols.append(
            SymbolsEntry(
                type=match.group("type"),
                section=int(match.group("section"), 16),
                offset=int(match.group("offset"), 16),
                size=int(match.group("size"), 16),
                name=match.group("name"),
            )
        )

    def _section_contributions(self, line: str) -> None:
        """Gives the size of elements across all sections of the binary.
        This is the easiest way to get the data size for .data and .rdata
        members that do not have a primitive data type."""
        if (match := _section_contrib_regex.match(line)) is not None:
            self.sizerefs.append(
                SizeRefEntry(
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    size=int(match.group("size"), 16),
                )
            )

    def read_line(self, line: str) -> None:
        """Consume one line of cvdump output, updating the current section
        or appending to the matching result list."""
        # Blank lines are there to help the reader; they have no context significance
        if not line.strip():
            return

        if (match := _section_change_regex.match(line)) is not None:
            # The pattern allows spaces inside the name, so trailing
            # whitespace would otherwise end up in the section key.
            self._section = match.group("section").strip()
            return

        handler = self._section_handlers.get(self._section)
        if handler is not None:
            handler(line)

    def read_lines(self, lines: Iterable[str]) -> None:
        """Consume every line from ``lines`` in order."""
        for line in lines:
            self.read_line(line)

View file

@ -0,0 +1,66 @@
from os import name as os_name
from enum import Enum
from typing import List
import subprocess
from isledecomp.lib import lib_path_join
from isledecomp.dir import winepath_unix_to_win
from .parser import CvdumpParser
class DumpOpt(Enum):
    """The kinds of output that can be requested from cvdump."""

    LINES = 0
    SYMBOLS = 1
    GLOBALS = 2
    PUBLICS = 3
    SECTION_CONTRIB = 4


# Command-line switch that selects each kind of output.
cvdump_opt_map = dict(
    [
        (DumpOpt.LINES, "-l"),
        (DumpOpt.SYMBOLS, "-s"),
        (DumpOpt.GLOBALS, "-g"),
        (DumpOpt.PUBLICS, "-p"),
        (DumpOpt.SECTION_CONTRIB, "-seccontrib"),
    ]
)
class Cvdump:
    """Builds and runs a cvdump.exe command for one PDB file.

    Select the wanted output with the chainable lines(), symbols(),
    globals(), publics() and section_contributions() methods, then call
    run() to execute cvdump and get back a populated CvdumpParser."""

    def __init__(self, pdb: str) -> None:
        self._pdb: str = pdb
        self._options = set()

    def lines(self) -> "Cvdump":
        """Request the LINES output. Returns self for chaining."""
        self._options.add(DumpOpt.LINES)
        return self

    def symbols(self) -> "Cvdump":
        """Request the SYMBOLS output. Returns self for chaining."""
        self._options.add(DumpOpt.SYMBOLS)
        return self

    def globals(self) -> "Cvdump":
        """Request the GLOBALS output. Returns self for chaining."""
        self._options.add(DumpOpt.GLOBALS)
        return self

    def publics(self) -> "Cvdump":
        """Request the PUBLICS output. Returns self for chaining."""
        self._options.add(DumpOpt.PUBLICS)
        return self

    def section_contributions(self) -> "Cvdump":
        """Request the SECTION CONTRIBUTIONS output. Returns self for chaining."""
        self._options.add(DumpOpt.SECTION_CONTRIB)
        return self

    def cmd_line(self) -> List[str]:
        """Return the argument list used to invoke cvdump on the PDB.

        On Windows (os.name == "nt") cvdump.exe is called directly. Anywhere
        else it is run through wine and the PDB path is converted with
        winepath_unix_to_win."""
        cvdump_exe = lib_path_join("cvdump.exe")

        # Set iteration order is not guaranteed, so sort by enum value to
        # produce the same command line on every run.
        selected = sorted(self._options, key=lambda opt: opt.value)
        flags = [cvdump_opt_map[opt] for opt in selected]

        if os_name == "nt":
            return [cvdump_exe, *flags, self._pdb]

        # wine takes the program to run as its first argument; the switches
        # belong to cvdump.exe and therefore have to come after it.
        return ["wine", cvdump_exe, *flags, winepath_unix_to_win(self._pdb)]

    def run(self) -> "CvdumpParser":
        """Execute cvdump and parse its output.

        Raises subprocess.CalledProcessError if the command exits with a
        non-zero status."""
        parser = CvdumpParser()
        call = self.cmd_line()
        output = subprocess.check_output(call).decode("utf-8")
        parser.read_lines(output.split("\r\n"))
        return parser

View file

@ -1,7 +1,6 @@
import os
import subprocess
from isledecomp.lib import lib_path_join
from isledecomp.dir import PathResolver, winepath_unix_to_win
from isledecomp.dir import PathResolver
from isledecomp.cvdump import Cvdump
class RecompiledInfo:
@ -20,81 +19,55 @@ class SymInfo:
def __init__(self, pdb, sym_recompfile, sym_logger, base_dir):
self.logger = sym_logger
path_resolver = PathResolver(base_dir)
call = [lib_path_join("cvdump.exe"), "-l", "-s"]
if os.name != "nt":
# Run cvdump through wine and convert path to Windows-friendly wine path
call.insert(0, "wine")
call.append(winepath_unix_to_win(pdb))
else:
call.append(pdb)
self.logger.info("Parsing %s ...", pdb)
self.logger.debug("Command = %s", call)
line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")
current_section = None
self.logger.debug("Parsing output of cvdump.exe ...")
for i, line in enumerate(line_dump):
if line.startswith("***"):
current_section = line[4:]
if current_section == "SYMBOLS" and "S_GPROC32" in line:
sym_section = int(line[21:25], 16)
sym_addr = int(line[26:34], 16)
info = RecompiledInfo()
info.addr = sym_addr + sym_recompfile.get_section_offset_by_index(
sym_section
)
use_dbg_offs = False
if use_dbg_offs:
debug_offs = line_dump[i + 2]
debug_start = int(debug_offs[22:30], 16)
debug_end = int(debug_offs[43:], 16)
info.start = debug_start
info.size = debug_end - debug_start
else:
info.start = 0
info.size = int(line[41:49], 16)
info.name = line[77:]
self.names[info.name] = info
self.funcs[sym_addr] = info
elif (
current_section == "LINES"
and line.startswith(" ")
and not line.startswith(" ")
):
sourcepath = line.split()[0]
sourcepath = path_resolver.resolve_cvdump(sourcepath)
if sourcepath not in self.lines:
self.lines[sourcepath] = {}
j = i + 2
while True:
ll = line_dump[j].split()
if len(ll) == 0:
break
k = 0
while k < len(ll):
linenum = int(ll[k + 0])
address = int(ll[k + 1], 16)
if linenum not in self.lines[sourcepath]:
self.lines[sourcepath][linenum] = address
k += 2
j += 1
cv = Cvdump(pdb).lines().symbols().publics().section_contributions().run()
self.logger.debug("... Parsing output of cvdump.exe finished")
contrib_dict = {(s.section, s.offset): s.size for s in cv.sizerefs}
for pub in cv.publics:
if (
pub.type == "S_PUB32"
and pub.name.startswith("_")
and (pub.section, pub.offset) in contrib_dict
):
size = contrib_dict[(pub.section, pub.offset)]
info = RecompiledInfo()
info.addr = sym_recompfile.get_abs_addr(pub.section, pub.offset)
info.start = 0
info.size = size
info.name = pub.name
self.names[pub.name] = info
self.funcs[pub.offset] = info
for proc in cv.symbols:
if proc.type != "S_GPROC32":
continue
info = RecompiledInfo()
info.addr = sym_recompfile.get_abs_addr(proc.section, proc.offset)
info.start = 0
info.size = proc.size
info.name = proc.name
self.names[proc.name] = info
self.funcs[proc.offset] = info
for sourcepath, line_no, offset in cv.lines:
sourcepath = path_resolver.resolve_cvdump(sourcepath)
if sourcepath not in self.lines:
self.lines[sourcepath] = {}
if line_no not in self.lines[sourcepath]:
self.lines[sourcepath][line_no] = offset
def get_recompiled_address(self, filename, line):
recompiled_addr = None