Cvdump parser and comparing library functions (#383)

* Cvdump wrapper and parser. Matching library functions * Remove 'Self' type hint (3.11+) * Add temp reference for entrypoints * ISLE using multithreaded libc * 🙄
2025-04-21 11:00:52 -04:00 · 2023-12-28 16:10:57 -05:00 · 2023-12-28 16:10:57 -05:00 · 9a6d555508
commit 9a6d555508
parent ff4845a6ea
7 changed files with 395 additions and 117 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -294,7 +294,7 @@ if (MSVC)
  # game was originally built with) and tweaked slightly to produce more debugging info for reccmp.
  # They ensure a recompilation that can be byte/instruction accurate to the original binaries.
  if (ISLE_BUILD_APP)
-    target_compile_options(isle PRIVATE "/ML$<$<CONFIG:Debug>:d>")
+    target_compile_options(isle PRIVATE "/MT$<$<CONFIG:Debug>:d>")
  endif()
  target_compile_options(lego1 PRIVATE "/MT$<$<CONFIG:Debug>:d>")

--- a/LEGO1/library_msvc.h
+++ b/LEGO1/library_msvc.h
@ -0,0 +1,47 @@
+#if 0
+// Never compiled: this block only carries LIBRARY annotations (module name
+// plus address) for C runtime functions. `#if 0` is used rather than
+// `#ifdef 0` because `#ifdef` requires an identifier, not a constant.
+
+// LIBRARY: ISLE 0x402f80
+// LIBRARY: LEGO1 0x10086240
+// _malloc
+
+// LIBRARY: ISLE 0x402fa0
+// LIBRARY: LEGO1 0x10086260
+// _free
+
+// LIBRARY: ISLE 0x408220
+// LIBRARY: LEGO1 0x1008b400
+// _atol
+
+// LIBRARY: ISLE 0x4082d0
+// LIBRARY: LEGO1 0x1008b4b0
+// _atoi
+
+// LIBRARY: LEGO1 0x1008b4c0
+// _strtok
+
+// LIBRARY: ISLE 0x4085c0
+// LIBRARY: LEGO1 0x1008b5a0
+// _sprintf
+
+// LIBRARY: ISLE 0x4081e0
+// _srand
+
+// LIBRARY: ISLE 0x4081f0
+// LIBRARY: LEGO1 0x1008b640
+// _rand
+
+// entry
+// LIBRARY: ISLE 0x4082e0
+// _WinMainCRTStartup
+
+// entry
+// LIBRARY: LEGO1 0x1008c860
+// __DllMainCRTStartup@12
+
+// LIBRARY: ISLE 0x409110
+// __mtinit
+
+// LIBRARY: ISLE 0x409190
+// __getptd
+
+#endif
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@ -1,4 +1,6 @@
 import struct
+from typing import List, Optional
+from dataclasses import dataclass
 from collections import namedtuple


@ -33,44 +35,56 @@ PEHeader = namedtuple(
    ],
 )

-ImageSectionHeader = namedtuple(
-    "ImageSectionHeader",
-    [
-        "Name",
-        "Misc",
-        "VirtualAddress",
-        "SizeOfRawData",
-        "PointerToRawData",
-        "PointerToRelocations",
-        "PointerToLineNumbers",
-        "NumberOfRelocations",
-        "NumberOfLineNumbers",
-        "Characteristics",
-    ],
-)

+@dataclass
+class ImageSectionHeader:
+    # pylint: disable=too-many-instance-attributes
+    # Most attributes are unused, but this is the struct format
+    name: bytes
+    virtual_size: int
+    virtual_address: int
+    size_of_raw_data: int
+    pointer_to_raw_data: int
+    pointer_to_relocations: int
+    pointer_to_line_numbers: int
+    number_of_relocations: int
+    number_of_line_numbers: int
+    characteristics: int

-def section_name_match(section, name):
-    return section.Name == struct.pack("8s", name.encode("ascii"))
+    def match_name(self, name: str) -> bool:
+        return self.name == struct.pack("8s", name.encode("ascii"))

+    def contains_vaddr(self, vaddr: int) -> bool:
+        ofs = vaddr - self.virtual_address
+        return 0 <= ofs < max(self.size_of_raw_data, self.virtual_size)

-def section_contains_vaddr(section, imagebase, vaddr) -> bool:
-    debased = vaddr - imagebase
-    ofs = debased - section.VirtualAddress
-    return 0 <= ofs < section.SizeOfRawData
+    def addr_is_uninitialized(self, vaddr: int) -> bool:
+        """We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
+        the characteristics field so instead we determine it this way."""
+        if not self.contains_vaddr(vaddr):
+            return False
+
+        # Should include the case where size_of_raw_data == 0,
+        # meaning the entire section is uninitialized
+        return (self.virtual_size > self.size_of_raw_data) and (
+            vaddr - self.virtual_address >= self.size_of_raw_data
+        )


 class Bin:
    """Parses a PE format EXE and allows reading data from a virtual address.
    Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""

-    def __init__(self, filename, logger=None):
+    # pylint: disable=too-many-instance-attributes
+
+    def __init__(self, filename: str, logger=None) -> None:
        self.logger = logger
        self._debuglog(f'Parsing headers of "{filename}"... ')
        self.filename = filename
        self.file = None
        self.imagebase = None
-        self.sections = []
+        self.entry = None
+        self.sections: List[ImageSectionHeader] = []
        self.last_section = None
        self._relocated_addrs = set()

@ -95,12 +109,18 @@ class Bin:

        optional_hdr = self.file.read(pe_hdr.SizeOfOptionalHeader)
        (self.imagebase,) = struct.unpack("<i", optional_hdr[0x1C:0x20])
+        (entry,) = struct.unpack("<i", optional_hdr[0x10:0x14])
+        self.entry = entry + self.imagebase

        self.sections = [
            ImageSectionHeader(*struct.unpack("<8s6I2HI", self.file.read(0x28)))
            for i in range(pe_hdr.NumberOfSections)
        ]

+        # Add the imagebase here because we almost never need the base vaddr without it
+        for sect in self.sections:
+            sect.virtual_address += self.imagebase
+
        self._populate_relocations()

        text_section = self._get_section_by_name(".text")
@ -119,7 +139,7 @@ class Bin:
        if self.logger is not None:
            self.logger.debug(msg)

-    def get_relocated_addresses(self):
+    def get_relocated_addresses(self) -> List[int]:
        return sorted(self._relocated_addrs)

    def is_relocated_addr(self, vaddr) -> bool:
@ -165,27 +185,25 @@ class Bin:
            (relocated_addr,) = struct.unpack("<I", self.read(addr, 4))
            self._relocated_addrs.add(relocated_addr)

-    def _set_section_for_vaddr(self, vaddr):
-        if self.last_section is not None and section_contains_vaddr(
-            self.last_section, self.imagebase, vaddr
-        ):
+    def _set_section_for_vaddr(self, vaddr: int):
+        if self.last_section is not None and self.last_section.contains_vaddr(vaddr):
            return

        # TODO: assumes no potential for section overlap. reasonable?
        self.last_section = next(
            filter(
-                lambda section: section_contains_vaddr(section, self.imagebase, vaddr),
+                lambda section: section.contains_vaddr(vaddr),
                self.sections,
            ),
            None,
        )

        if self.last_section is None:
-            raise InvalidVirtualAddressError
+            raise InvalidVirtualAddressError(f"0x{vaddr:08x}")

-    def _get_section_by_name(self, name):
+    def _get_section_by_name(self, name: str):
        section = next(
-            filter(lambda section: section_name_match(section, name), self.sections),
+            filter(lambda section: section.match_name(name), self.sections),
            None,
        )

@ -194,7 +212,7 @@ class Bin:

        return section

-    def get_section_offset_by_index(self, index) -> int:
+    def get_section_offset_by_index(self, index: int) -> int:
        """The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
        where A is the index (1-based) into the section table and B is the local offset.
        This will return the virtual address for the start of the section at the given index
@ -202,29 +220,33 @@ class Bin:
        """

        section = self.sections[index - 1]
-        return self.imagebase + section.VirtualAddress
+        return section.virtual_address

-    def get_section_offset_by_name(self, name) -> int:
+    def get_section_offset_by_name(self, name: str) -> int:
        """Same as above, but use the section name as the lookup"""

        section = self._get_section_by_name(name)
-        return self.imagebase + section.VirtualAddress
+        return section.virtual_address

-    def get_raw_addr(self, vaddr) -> int:
+    def get_abs_addr(self, section: int, offset: int) -> int:
+        """Convenience function for converting section:offset pairs from cvdump
+        into an absolute vaddr."""
+        return self.get_section_offset_by_index(section) + offset
+
+    def get_raw_addr(self, vaddr: int) -> int:
        """Returns the raw offset in the PE binary for the given virtual address."""
        self._set_section_for_vaddr(vaddr)
        return (
            vaddr
-            - self.imagebase
-            - self.last_section.VirtualAddress
-            + self.last_section.PointerToRawData
+            - self.last_section.virtual_address
+            + self.last_section.pointer_to_raw_data
        )

-    def is_valid_vaddr(self, vaddr) -> bool:
+    def is_valid_vaddr(self, vaddr: int) -> bool:
        """Does this virtual address point to anything in the exe?"""
        section = next(
            filter(
-                lambda section: section_contains_vaddr(section, self.imagebase, vaddr),
+                lambda section: section.contains_vaddr(vaddr),
                self.sections,
            ),
            None,
@ -232,9 +254,14 @@ class Bin:

        return section is not None

-    def read(self, offset, size):
+    def read(self, offset: int, size: int) -> Optional[bytes]:
+        """Read (at most) the given number of bytes at the given virtual address.
+        If we return None, the given address points to uninitialized data."""
        self._set_section_for_vaddr(offset)

+        if self.last_section.addr_is_uninitialized(offset):
+            return None
+
        raw_addr = self.get_raw_addr(offset)
        self.file.seek(raw_addr)

@ -242,8 +269,8 @@ class Bin:
        # Reading off the end will most likely misrepresent the virtual addressing.
        _size = min(
            size,
-            self.last_section.PointerToRawData
-            + self.last_section.SizeOfRawData
+            self.last_section.pointer_to_raw_data
+            + self.last_section.size_of_raw_data
            - raw_addr,
        )
        return self.file.read(_size)
--- a/tools/isledecomp/isledecomp/cvdump/__init__.py
+++ b/tools/isledecomp/isledecomp/cvdump/__init__.py
@ -0,0 +1,2 @@
+from .parser import CvdumpParser
+from .runner import Cvdump
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@ -0,0 +1,163 @@
+import re
+from typing import Iterable
+from collections import namedtuple
+
+# Section banner. The captured name selects the handler in CvdumpParser.read_line.
+# e.g. `*** PUBLICS`
+_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)")
+
+# Used with findall(): one LINES row carries several (line number, address) pairs.
+# e.g. `     27 00034EC0     28 00034EE2     29 00034EE7     30 00034EF4`
+_line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")
+
+# Header of a LINES subsection; only the `filename` group is read by the parser.
+# We assume no spaces in the file name
+# e.g. `  Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
+_lines_subsection_header = re.compile(
+    r"^\s*(?P<filename>\S+).*?, (?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), line/addr pairs = (?P<len>\d+)"
+)
+
+# One PUBLICS entry. section, offset and flags are parsed as hex by the caller.
+# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
+_publics_line_regex = re.compile(
+    r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
+)
+
+# One SYMBOLS entry. Applied with match(), so the line must begin with the
+# parenthesised token. `name` takes the rest of the line and may contain spaces.
+# e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type:             0x1024, ViewROI::IntrinsicImportance`
+_symbol_line_regex = re.compile(
+    r"\(\w+\) (?P<type>\S+): \[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+\S+, (?P<name>.+)"
+)
+
+# Debug start/end line. Not referenced by CvdumpParser in this module.
+# e.g. `         Debug start: 00000008, Debug end: 0000016E`
+_gproc_debug_regex = re.compile(
+    r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
+)
+
+# One SECTION CONTRIBUTIONS row; the parser keeps section, offset and size.
+# e.g. `  00DA  0001:00000000  00000073  60501020`
+_section_contrib_regex = re.compile(
+    r"\s*(?P<module>\w{4})  (?P<section>\w{4}):(?P<offset>\w{8})  (?P<size>\w{8})  (?P<flags>\w{8})"
+)
+
+# One GLOBALS entry. Applied with match(), so the line must start with `S_GDATA32:`.
+# e.g. `S_GDATA32: [0003:000004A4], Type:   T_32PRCHAR(0470), g_set`
+_gdata32_regex = re.compile(
+    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>\S+)"
+)
+
+
+# Record types appended to the CvdumpParser result lists.
+# Numeric fields are stored as ints; the section/offset/size/flags fields are
+# parsed from hexadecimal text.
+# LINES: one (source file, line number, address) triple.
+LinesEntry = namedtuple("LinesEntry", "filename line_no addr")
+# PUBLICS: one public symbol.
+PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
+# SYMBOLS: one S_GPROC32 procedure with its size.
+SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
+# SECTION CONTRIBUTIONS: size of the item at section:offset.
+SizeRefEntry = namedtuple("SizeRefEntry", "section offset size")
+# GLOBALS: one S_GDATA32 global variable.
+GdataEntry = namedtuple("GdataEntry", "section offset type name")
+
+
+class CvdumpParser:
+    """Line-oriented parser for the text output of cvdump.exe.
+
+    Feed output through read_line() / read_lines(). A `*** NAME` banner selects
+    the current section, and each later line is handed to that section's
+    handler. Parsed entries accumulate in the public lists set up in __init__.
+    """
+
+    def __init__(self) -> None:
+        # Name captured from the most recent `*** NAME` banner.
+        self._section: str = ""
+        # File named by the most recent LINES subsection header.
+        self._lines_filename: str = ""
+
+        self.lines = []  # LinesEntry, from LINES
+        self.publics = []  # PublicsEntry, from PUBLICS
+        self.symbols = []  # SymbolsEntry (S_GPROC32 only), from SYMBOLS
+        self.sizerefs = []  # SizeRefEntry, from SECTION CONTRIBUTIONS
+        self.globals = []  # GdataEntry (S_GDATA32 only), from GLOBALS
+
+    def _lines_section(self, line: str) -> None:
+        """Parsing entries from the LINES section. We only care about the pairs of
+        line_number and address and the subsection header to indicate which code file
+        we are in."""
+
+        # Subheader indicates a new function and possibly a new code filename.
+        if (match := _lines_subsection_header.match(line)) is not None:
+            self._lines_filename = match.group("filename")
+            return
+
+        # findall() always returns a list (empty when nothing matches), so it
+        # can be iterated directly without a None check.
+        for line_no, addr in _line_addr_pairs_findall.findall(line):
+            self.lines.append(
+                LinesEntry(
+                    filename=self._lines_filename,
+                    line_no=int(line_no),
+                    addr=int(addr, 16),
+                )
+            )
+
+    def _publics_section(self, line: str) -> None:
+        """Match each line from PUBLICS and pull out the symbol information.
+        These are MSVC mangled symbol names. String constants and vtable
+        addresses can only be found here."""
+        if (match := _publics_line_regex.match(line)) is not None:
+            self.publics.append(
+                PublicsEntry(
+                    type=match.group("type"),
+                    section=int(match.group("section"), 16),
+                    offset=int(match.group("offset"), 16),
+                    flags=int(match.group("flags"), 16),
+                    name=match.group("name"),
+                )
+            )
+
+    def _globals_section(self, line: str) -> None:
+        """S_PROCREF may be useful later.
+        Right now we just want S_GDATA32 symbols because it is the simplest
+        way to access global variables."""
+        if (match := _gdata32_regex.match(line)) is not None:
+            self.globals.append(
+                GdataEntry(
+                    section=int(match.group("section"), 16),
+                    offset=int(match.group("offset"), 16),
+                    type=match.group("type"),
+                    name=match.group("name"),
+                )
+            )
+
+    def _symbols_section(self, line: str) -> None:
+        """We are interested in S_GPROC32 symbols only."""
+        match = _symbol_line_regex.match(line)
+        if match is None or match.group("type") != "S_GPROC32":
+            return
+
+        self.symbols.append(
+            SymbolsEntry(
+                type=match.group("type"),
+                section=int(match.group("section"), 16),
+                offset=int(match.group("offset"), 16),
+                size=int(match.group("size"), 16),
+                name=match.group("name"),
+            )
+        )
+
+    def _section_contributions(self, line: str) -> None:
+        """Gives the size of elements across all sections of the binary.
+        This is the easiest way to get the data size for .data and .rdata
+        members that do not have a primitive data type."""
+        if (match := _section_contrib_regex.match(line)) is not None:
+            self.sizerefs.append(
+                SizeRefEntry(
+                    section=int(match.group("section"), 16),
+                    offset=int(match.group("offset"), 16),
+                    size=int(match.group("size"), 16),
+                )
+            )
+
+    def read_line(self, line: str) -> None:
+        """Consume one line of cvdump output.
+
+        A section banner switches the active section. Any other non-blank line
+        goes to the active section's handler; lines in sections without a
+        handler are dropped.
+        """
+        # Blank lines are there to help the reader; they have no context significance
+        if line.strip() == "":
+            return
+
+        if (match := _section_change_regex.match(line)) is not None:
+            self._section = match.group("section")
+            return
+
+        if self._section == "LINES":
+            self._lines_section(line)
+
+        elif self._section == "PUBLICS":
+            self._publics_section(line)
+
+        elif self._section == "SYMBOLS":
+            self._symbols_section(line)
+
+        elif self._section == "SECTION CONTRIBUTIONS":
+            self._section_contributions(line)
+
+        elif self._section == "GLOBALS":
+            self._globals_section(line)
+
+    def read_lines(self, lines: Iterable[str]) -> None:
+        """Feed every line of `lines` through read_line(), in order."""
+        for line in lines:
+            self.read_line(line)
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@ -0,0 +1,66 @@
+from os import name as os_name
+from enum import Enum
+from typing import List
+import subprocess
+from isledecomp.lib import lib_path_join
+from isledecomp.dir import winepath_unix_to_win
+from .parser import CvdumpParser
+
+
+class DumpOpt(Enum):
+    """Kinds of cvdump output that can be requested.
+
+    Each member is translated to a command-line switch by cvdump_opt_map.
+    """
+
+    LINES = 0
+    SYMBOLS = 1
+    GLOBALS = 2
+    PUBLICS = 3
+    SECTION_CONTRIB = 4
+
+
+# Command-line switch passed to cvdump.exe for each DumpOpt member.
+cvdump_opt_map = {
+    DumpOpt.LINES: "-l",
+    DumpOpt.SYMBOLS: "-s",
+    DumpOpt.GLOBALS: "-g",
+    DumpOpt.PUBLICS: "-p",
+    DumpOpt.SECTION_CONTRIB: "-seccontrib",
+}
+
+
+class Cvdump:
+    """Fluent builder that runs cvdump.exe on one PDB file.
+
+    Chain the option methods to select which dumps to request, then call run()
+    to execute cvdump and parse its output.
+    """
+
+    def __init__(self, pdb: str) -> None:
+        self._pdb: str = pdb
+        # DumpOpt members requested so far; a set, so repeats are harmless.
+        self._options = set()
+
+    def lines(self) -> "Cvdump":
+        """Request the LINES dump (`-l`); returns self for chaining."""
+        self._options.add(DumpOpt.LINES)
+        return self
+
+    def symbols(self) -> "Cvdump":
+        """Request the SYMBOLS dump (`-s`); returns self for chaining."""
+        self._options.add(DumpOpt.SYMBOLS)
+        return self
+
+    def globals(self) -> "Cvdump":
+        """Request the GLOBALS dump (`-g`); returns self for chaining."""
+        self._options.add(DumpOpt.GLOBALS)
+        return self
+
+    def publics(self) -> "Cvdump":
+        """Request the PUBLICS dump (`-p`); returns self for chaining."""
+        self._options.add(DumpOpt.PUBLICS)
+        return self
+
+    def section_contributions(self) -> "Cvdump":
+        """Request section contributions (`-seccontrib`); returns self for chaining."""
+        self._options.add(DumpOpt.SECTION_CONTRIB)
+        return self
+
+    def cmd_line(self) -> List[str]:
+        """Return the argv list used to run cvdump on the PDB.
+
+        On Windows cvdump.exe is invoked directly. Elsewhere it is run through
+        wine, with the PDB path converted by winepath_unix_to_win().
+        """
+        cvdump_exe = lib_path_join("cvdump.exe")
+        # Sort by enum value so the command line is the same on every run,
+        # regardless of set iteration order.
+        flags = [
+            cvdump_opt_map[opt]
+            for opt in sorted(self._options, key=lambda opt: opt.value)
+        ]
+
+        if os_name == "nt":
+            return [cvdump_exe, *flags, self._pdb]
+
+        # wine takes the program to run as its first argument and passes the
+        # rest to that program, so the flags must follow cvdump.exe rather
+        # than precede it.
+        return ["wine", cvdump_exe, *flags, winepath_unix_to_win(self._pdb)]
+
+    def run(self) -> CvdumpParser:
+        """Execute cvdump and return a CvdumpParser fed with its output.
+
+        Raises subprocess.CalledProcessError if the command exits non-zero.
+        """
+        parser = CvdumpParser()
+        output = subprocess.check_output(self.cmd_line()).decode("utf-8")
+        # The output is split on CRLF, one parser line per element.
+        parser.read_lines(output.split("\r\n"))
+        return parser
--- a/tools/isledecomp/isledecomp/syminfo.py
+++ b/tools/isledecomp/isledecomp/syminfo.py
@ -1,7 +1,6 @@
 import os
-import subprocess
-from isledecomp.lib import lib_path_join
-from isledecomp.dir import PathResolver, winepath_unix_to_win
+from isledecomp.dir import PathResolver
+from isledecomp.cvdump import Cvdump


 class RecompiledInfo:
@ -20,81 +19,55 @@ class SymInfo:
    def __init__(self, pdb, sym_recompfile, sym_logger, base_dir):
        self.logger = sym_logger
        path_resolver = PathResolver(base_dir)
-        call = [lib_path_join("cvdump.exe"), "-l", "-s"]
-
-        if os.name != "nt":
-            # Run cvdump through wine and convert path to Windows-friendly wine path
-            call.insert(0, "wine")
-            call.append(winepath_unix_to_win(pdb))
-        else:
-            call.append(pdb)

        self.logger.info("Parsing %s ...", pdb)
-        self.logger.debug("Command = %s", call)
-        line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")
-
-        current_section = None
-
        self.logger.debug("Parsing output of cvdump.exe ...")

-        for i, line in enumerate(line_dump):
-            if line.startswith("***"):
-                current_section = line[4:]
-
-            if current_section == "SYMBOLS" and "S_GPROC32" in line:
-                sym_section = int(line[21:25], 16)
-                sym_addr = int(line[26:34], 16)
-
-                info = RecompiledInfo()
-                info.addr = sym_addr + sym_recompfile.get_section_offset_by_index(
-                    sym_section
-                )
-
-                use_dbg_offs = False
-                if use_dbg_offs:
-                    debug_offs = line_dump[i + 2]
-                    debug_start = int(debug_offs[22:30], 16)
-                    debug_end = int(debug_offs[43:], 16)
-
-                    info.start = debug_start
-                    info.size = debug_end - debug_start
-                else:
-                    info.start = 0
-                    info.size = int(line[41:49], 16)
-
-                info.name = line[77:]
-
-                self.names[info.name] = info
-                self.funcs[sym_addr] = info
-            elif (
-                current_section == "LINES"
-                and line.startswith("  ")
-                and not line.startswith("   ")
-            ):
-                sourcepath = line.split()[0]
-                sourcepath = path_resolver.resolve_cvdump(sourcepath)
-
-                if sourcepath not in self.lines:
-                    self.lines[sourcepath] = {}
-
-                j = i + 2
-                while True:
-                    ll = line_dump[j].split()
-                    if len(ll) == 0:
-                        break
-
-                    k = 0
-                    while k < len(ll):
-                        linenum = int(ll[k + 0])
-                        address = int(ll[k + 1], 16)
-                        if linenum not in self.lines[sourcepath]:
-                            self.lines[sourcepath][linenum] = address
-                        k += 2
-
-                    j += 1
+        cv = Cvdump(pdb).lines().symbols().publics().section_contributions().run()

        self.logger.debug("... Parsing output of cvdump.exe finished")

+        contrib_dict = {(s.section, s.offset): s.size for s in cv.sizerefs}
+        for pub in cv.publics:
+            if (
+                pub.type == "S_PUB32"
+                and pub.name.startswith("_")
+                and (pub.section, pub.offset) in contrib_dict
+            ):
+                size = contrib_dict[(pub.section, pub.offset)]
+
+                info = RecompiledInfo()
+                info.addr = sym_recompfile.get_abs_addr(pub.section, pub.offset)
+
+                info.start = 0
+                info.size = size
+                info.name = pub.name
+                self.names[pub.name] = info
+                self.funcs[pub.offset] = info
+
+        for proc in cv.symbols:
+            if proc.type != "S_GPROC32":
+                continue
+
+            info = RecompiledInfo()
+            info.addr = sym_recompfile.get_abs_addr(proc.section, proc.offset)
+
+            info.start = 0
+            info.size = proc.size
+            info.name = proc.name
+
+            self.names[proc.name] = info
+            self.funcs[proc.offset] = info
+
+        for sourcepath, line_no, offset in cv.lines:
+            sourcepath = path_resolver.resolve_cvdump(sourcepath)
+
+            if sourcepath not in self.lines:
+                self.lines[sourcepath] = {}
+
+            if line_no not in self.lines[sourcepath]:
+                self.lines[sourcepath][line_no] = offset
+
    def get_recompiled_address(self, filename, line):
        recompiled_addr = None