From b66b02acb12121cc56054aa2b8896c75f30204b2 Mon Sep 17 00:00:00 2001 From: disinvite Date: Sat, 2 Mar 2024 19:28:29 -0500 Subject: [PATCH] Parse cvdump TYPES section. Add datacmp tool. --- tools/datacmp.py | 327 +++++++++++++ tools/isledecomp/isledecomp/compare/core.py | 23 + .../isledecomp/isledecomp/cvdump/__init__.py | 1 + .../isledecomp/isledecomp/cvdump/analysis.py | 60 +-- tools/isledecomp/isledecomp/cvdump/parser.py | 8 +- tools/isledecomp/isledecomp/cvdump/runner.py | 6 + tools/isledecomp/isledecomp/cvdump/types.py | 433 +++++++++++++++++ tools/isledecomp/tests/test_cvdump.py | 86 ++-- tools/isledecomp/tests/test_cvdump_types.py | 452 ++++++++++++++++++ 9 files changed, 1319 insertions(+), 77 deletions(-) create mode 100644 tools/datacmp.py create mode 100644 tools/isledecomp/isledecomp/cvdump/types.py create mode 100644 tools/isledecomp/tests/test_cvdump_types.py diff --git a/tools/datacmp.py b/tools/datacmp.py new file mode 100644 index 00000000..31bb24b0 --- /dev/null +++ b/tools/datacmp.py @@ -0,0 +1,327 @@ +# (New) Data comparison. + +import os +import argparse +import logging +from enum import Enum +from typing import Iterable, List, NamedTuple, Optional, Tuple +from struct import unpack +from isledecomp.compare import Compare as IsleCompare +from isledecomp.compare.db import MatchInfo +from isledecomp.cvdump import Cvdump +from isledecomp.cvdump.types import ( + CvdumpKeyError, + CvdumpIntegrityError, +) +from isledecomp.bin import Bin as IsleBin +import colorama + +colorama.init() + + +# Ignore all compare-db messages. +logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler()) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Comparing data values.") + parser.add_argument( + "original", metavar="original-binary", help="The original binary" + ) + parser.add_argument( + "recompiled", metavar="recompiled-binary", help="The recompiled binary" + ) + parser.add_argument( + "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary" + ) + parser.add_argument( + "decomp_dir", metavar="decomp-dir", help="The decompiled source tree" + ) + parser.add_argument( + "-v", + "--verbose", + action=argparse.BooleanOptionalAction, + default=False, + help="", + ) + parser.add_argument( + "--no-color", "-n", action="store_true", help="Do not color the output" + ) + parser.add_argument( + "--print-rec-addr", + action="store_true", + help="Print addresses of recompiled functions too", + ) + + (args, _) = parser.parse_known_args() + + if not os.path.isfile(args.original): + parser.error(f"Original binary {args.original} does not exist") + + if not os.path.isfile(args.recompiled): + parser.error(f"Recompiled binary {args.recompiled} does not exist") + + if not os.path.isfile(args.pdb): + parser.error(f"Symbols PDB {args.pdb} does not exist") + + if not os.path.isdir(args.decomp_dir): + parser.error(f"Source directory {args.decomp_dir} does not exist") + + return args + + +class CompareResult(Enum): + MATCH = 1 + DIFF = 2 + ERROR = 3 + WARN = 4 + + +class ComparedOffset(NamedTuple): + offset: int + # name is None for scalar types + name: Optional[str] + match: bool + values: Tuple[str, str] + + +class ComparisonItem(NamedTuple): + """Each variable that was compared""" + + orig_addr: int + recomp_addr: int + name: str + + # The list of items that were compared. + # For a complex type, these are the members. + # For a scalar type, this is a list of size one. + # If we could not retrieve type information, this is + # a list of size one but without any specific type. + compared: List[ComparedOffset] + + # If present, the error message from the types parser. + error: Optional[str] = None + + # If true, there is no type specified for this variable. (i.e. non-public) + # In this case, we can only compare the raw bytes. + # This is different from the situation where a type id _is_ given, but + # we could not retrieve it for some reason. (This is an error.) + raw_only: bool = False + + @property + def result(self) -> CompareResult: + if self.error is not None: + return CompareResult.ERROR + + if all(c.match for c in self.compared): + return CompareResult.MATCH + + # Prefer WARN for a diff without complete type information. + return CompareResult.WARN if self.raw_only else CompareResult.DIFF + + +def create_comparison_item( + var: MatchInfo, + compared: Optional[List[ComparedOffset]] = None, + error: Optional[str] = None, + raw_only: bool = False, +) -> ComparisonItem: + """Helper to create the ComparisonItem from the fields in MatchInfo.""" + if compared is None: + compared = [] + + return ComparisonItem( + orig_addr=var.orig_addr, + recomp_addr=var.recomp_addr, + name=var.name, + compared=compared, + error=error, + raw_only=raw_only, + ) + + +def do_the_comparison(args: argparse.Namespace) -> Iterable[ComparisonItem]: + """Run through each variable in our compare DB, then do the comparison + according to the variable's type. Emit the result.""" + with IsleBin(args.original, find_str=True) as origfile, IsleBin( + args.recompiled + ) as recompfile: + isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir) + + # TODO: We don't currently retain the type information of each variable + # in our compare DB. To get those, we build this mini-lookup table that + # maps recomp addresses to their type. + # We still need to build the full compare DB though, because we may + # need the matched symbols to compare pointers (e.g. on strings) + mini_cvdump = Cvdump(args.pdb).globals().types().run() + + recomp_type_reference = { + recompfile.get_abs_addr(g.section, g.offset): g.type + for g in mini_cvdump.globals + if recompfile.is_valid_section(g.section) + } + + for var in isle_compare.get_variables(): + type_name = recomp_type_reference.get(var.recomp_addr) + + # Start by assuming we can only compare the raw bytes + data_size = var.size + is_type_aware = type_name is not None + + if is_type_aware: + try: + # If we are type-aware, we can get the precise + # data size for the variable. + data_type = mini_cvdump.types.get(type_name) + data_size = data_type.size + except (CvdumpKeyError, CvdumpIntegrityError) as ex: + yield create_comparison_item(var, error=repr(ex)) + continue + + orig_raw = origfile.read(var.orig_addr, data_size) + recomp_raw = recompfile.read(var.recomp_addr, data_size) + + # If both variables are uninitialized, we consider them equal. + # Otherwise, this is a diff but there is nothing to compare. + if orig_raw is None or recomp_raw is None: + match = orig_raw is None and recomp_raw is None + orig_value = "(uninitialized)" if orig_raw is None else "(initialized)" + recomp_value = ( + "(uninitialized)" if recomp_raw is None else "(initialized)" + ) + yield create_comparison_item( + var, + compared=[ + ComparedOffset( + offset=0, + name=None, + match=match, + values=(orig_value, recomp_value), + ) + ], + ) + continue + + if not is_type_aware: + # If there is no specific type information available + # (i.e. if this is a static or non-public variable) + # then we can only compare the raw bytes. + yield create_comparison_item( + var, + compared=[ + ComparedOffset( + offset=0, + name="(raw)", + match=orig_raw == recomp_raw, + values=(orig_raw, recomp_raw), + ) + ], + raw_only=True, + ) + continue + + # If we are here, we can do the type-aware comparison. + compared = [] + compare_items = mini_cvdump.types.get_scalars(type_name) + format_str = mini_cvdump.types.get_format_string(type_name) + + orig_data = unpack(format_str, orig_raw) + recomp_data = unpack(format_str, recomp_raw) + + def pointer_display(addr: int, is_orig: bool) -> str: + """Helper to streamline pointer textual display.""" + if addr == 0: + return "nullptr" + + ptr_match = ( + isle_compare.get_by_orig(addr) + if is_orig + else isle_compare.get_by_recomp(addr) + ) + + if ptr_match is not None: + return f"Pointer to {ptr_match.match_name()}" + + # This variable did not match if we do not have + # the pointer target in our DB. + return f"Unknown pointer 0x{addr:x}" + + # Could zip here + for i, member in enumerate(compare_items): + if member.is_pointer: + match = isle_compare.is_pointer_match(orig_data[i], recomp_data[i]) + + value_a = pointer_display(orig_data[i], True) + value_b = pointer_display(recomp_data[i], False) + + values = (value_a, value_b) + else: + match = orig_data[i] == recomp_data[i] + values = (orig_data[i], recomp_data[i]) + + compared.append( + ComparedOffset( + offset=member.offset, + name=member.name, + match=match, + values=values, + ) + ) + + yield create_comparison_item(var, compared=compared) + + +def value_get(value: Optional[str], default: str): + return value if value is not None else default + + +def main(): + args = parse_args() + + def display_match(result: CompareResult) -> str: + """Helper to return color string or not, depending on user preference""" + if args.no_color: + return result.name + + match_color = ( + colorama.Fore.GREEN + if result == CompareResult.MATCH + else ( + colorama.Fore.YELLOW + if result == CompareResult.WARN + else colorama.Fore.RED + ) + ) + return f"{match_color}{result.name}{colorama.Style.RESET_ALL}" + + for item in do_the_comparison(args): + if not args.verbose and item.result == CompareResult.MATCH: + continue + + address_display = ( + f"0x{item.orig_addr:x} / 0x{item.recomp_addr:x}" + if args.print_rec_addr + else f"0x{item.orig_addr:x}" + ) + + print(f"{item.name[:80]} ({address_display}) ... {display_match(item.result)} ") + if item.error is not None: + print(f" {item.error}") + + for c in item.compared: + if not args.verbose and c.match: + continue + + (value_a, value_b) = c.values + if c.match: + print(f" {c.offset:5} {value_get(c.name, '(value)'):30} {value_a}") + else: + print( + f" {c.offset:5} {value_get(c.name, '(value)'):30} {value_a} : {value_b}" + ) + + print() + + +if __name__ == "__main__": + main() diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index de332e43..53ba3fa0 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -95,6 +95,7 @@ def _load_cvdump(self): .publics() .symbols() .section_contributions() + .types() .run() ) res = CvdumpAnalysis(cv) @@ -454,6 +455,25 @@ def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]: ## Public API + def is_pointer_match(self, orig_addr, recomp_addr) -> bool: + """Check whether these pointers point at the same thing""" + + # Null pointers considered matching + if orig_addr == 0 and recomp_addr == 0: + return True + + match = self._db.get_by_orig(orig_addr) + if match is None: + return False + + return match.recomp_addr == recomp_addr + + def get_by_orig(self, addr: int) -> Optional[MatchInfo]: + return self._db.get_by_orig(addr) + + def get_by_recomp(self, addr: int) -> Optional[MatchInfo]: + return self._db.get_by_recomp(addr) + def get_all(self) -> List[MatchInfo]: return self._db.get_all() @@ -463,6 +483,9 @@ def get_functions(self) -> List[MatchInfo]: def get_vtables(self) -> List[MatchInfo]: return self._db.get_matches_by_type(SymbolType.VTABLE) + def get_variables(self) -> List[MatchInfo]: + return self._db.get_matches_by_type(SymbolType.DATA) + def compare_address(self, addr: int) -> Optional[DiffReport]: match = self._db.get_one_match(addr) if match is None: diff --git a/tools/isledecomp/isledecomp/cvdump/__init__.py b/tools/isledecomp/isledecomp/cvdump/__init__.py index 635ef5cd..8e1fd78a 100644 --- a/tools/isledecomp/isledecomp/cvdump/__init__.py +++ b/tools/isledecomp/isledecomp/cvdump/__init__.py @@ -1,3 +1,4 @@ from .analysis import CvdumpAnalysis from .parser import CvdumpParser from .runner import Cvdump +from .types import CvdumpTypesParser diff --git a/tools/isledecomp/isledecomp/cvdump/analysis.py b/tools/isledecomp/isledecomp/cvdump/analysis.py index 330429dd..d3f8bd27 100644 --- a/tools/isledecomp/isledecomp/cvdump/analysis.py +++ b/tools/isledecomp/isledecomp/cvdump/analysis.py @@ -1,45 +1,9 @@ """For collating the results from parsing cvdump.exe into a more directly useful format.""" -from typing import List, Optional, Tuple +from typing import List, Optional from isledecomp.types import SymbolType from .parser import CvdumpParser from .demangler import demangle_string_const, demangle_vtable - - -def data_type_info(type_name: str) -> Optional[Tuple[int, bool]]: - """cvdump type aliases are listed here: - https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h - For the given type, return tuple(size, is_pointer) if possible.""" - # pylint: disable=too-many-return-statements - # TODO: refactor to be as simple as possble - - # Ignore complex types. We can get the size of those from the TYPES section. - if not type_name.startswith("T"): - return None - - # if 32-bit pointer - if type_name.startswith("T_32P"): - return (4, True) - - if type_name.endswith("QUAD") or type_name.endswith("64"): - return (8, False) - - if ( - type_name.endswith("LONG") - or type_name.endswith("INT4") - or type_name.endswith("32") - ): - return (4, False) - - if type_name.endswith("SHORT") or type_name.endswith("WCHAR"): - return (2, False) - - if "CHAR" in type_name: - return (1, False) - - if type_name in ("T_NOTYPE", "T_VOID"): - return (0, False) - - return None +from .types import CvdumpKeyError, CvdumpIntegrityError class CvdumpNode: @@ -146,11 +110,21 @@ def __init__(self, parser: CvdumpParser): node_dict[key].node_type = SymbolType.DATA node_dict[key].friendly_name = glo.name - if (g_info := data_type_info(glo.type)) is not None: - (size, is_pointer) = g_info - node_dict[key].confirmed_size = size - if is_pointer: - node_dict[key].node_type = SymbolType.POINTER + try: + # Check our types database for type information. + # If we did not parse the TYPES section, we can only + # get information for built-in "T_" types. + g_info = parser.types.get(glo.type) + node_dict[key].confirmed_size = g_info.size + # Previously we set the symbol type to POINTER here if + # the variable was known to be a pointer. We can derive this + # information later when it's time to compare the variable, + # so let's set these to symbol type DATA instead. + # POINTER will be reserved for non-variable pointer data. + # e.g. thunks, unwind section. + except (CvdumpKeyError, CvdumpIntegrityError): + # No big deal if we don't have complete type information. + pass for lin in parser.lines: key = (lin.section, lin.offset) diff --git a/tools/isledecomp/isledecomp/cvdump/parser.py b/tools/isledecomp/isledecomp/cvdump/parser.py index 8d1c71bb..27554eda 100644 --- a/tools/isledecomp/isledecomp/cvdump/parser.py +++ b/tools/isledecomp/isledecomp/cvdump/parser.py @@ -1,9 +1,10 @@ import re from typing import Iterable, Tuple from collections import namedtuple +from .types import CvdumpTypesParser # e.g. `*** PUBLICS` -_section_change_regex = re.compile(r"^\*\*\* (?P
[A-Z/ ]+)") +_section_change_regex = re.compile(r"^\*\*\* (?P
[A-Z/ ]+)$") # e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4` _line_addr_pairs_findall = re.compile(r"\s+(?P\d+) (?P[A-F0-9]{8})") @@ -76,6 +77,8 @@ def __init__(self) -> None: self.globals = [] self.modules = [] + self.types = CvdumpTypesParser() + def _lines_section(self, line: str): """Parsing entries from the LINES section. We only care about the pairs of line_number and address and the subsection header to indicate which code file @@ -198,6 +201,9 @@ def read_line(self, line: str): elif self._section == "MODULES": self._modules_section(line) + elif self._section == "TYPES": + self.types.read_line(line) + def read_lines(self, lines: Iterable[str]): for line in lines: self.read_line(line) diff --git a/tools/isledecomp/isledecomp/cvdump/runner.py b/tools/isledecomp/isledecomp/cvdump/runner.py index 6b2c2ff4..33e2d98d 100644 --- a/tools/isledecomp/isledecomp/cvdump/runner.py +++ b/tools/isledecomp/isledecomp/cvdump/runner.py @@ -14,6 +14,7 @@ class DumpOpt(Enum): PUBLICS = 3 SECTION_CONTRIB = 4 MODULES = 5 + TYPES = 6 cvdump_opt_map = { @@ -23,6 +24,7 @@ class DumpOpt(Enum): DumpOpt.PUBLICS: "-p", DumpOpt.SECTION_CONTRIB: "-seccontrib", DumpOpt.MODULES: "-m", + DumpOpt.TYPES: "-t", } @@ -55,6 +57,10 @@ def modules(self): self._options.add(DumpOpt.MODULES) return self + def types(self): + self._options.add(DumpOpt.TYPES) + return self + def cmd_line(self) -> List[str]: cvdump_exe = lib_path_join("cvdump.exe") flags = [cvdump_opt_map[opt] for opt in self._options] diff --git a/tools/isledecomp/isledecomp/cvdump/types.py b/tools/isledecomp/isledecomp/cvdump/types.py new file mode 100644 index 00000000..ed5a38b8 --- /dev/null +++ b/tools/isledecomp/isledecomp/cvdump/types.py @@ -0,0 +1,433 @@ +import re +from typing import Dict, Iterator, List, NamedTuple, Optional + + +class CvdumpTypeError(Exception): + pass + + +class CvdumpKeyError(KeyError): + pass + + +class CvdumpIntegrityError(Exception): + pass + + +class FieldListItem(NamedTuple): + """Member of a class or structure""" + + offset: int + name: str + type: str + + +class ScalarType(NamedTuple): + offset: int + name: Optional[str] + type: str + + @property + def size(self) -> int: + return scalar_type_size(self.type) + + @property + def format_char(self) -> str: + return scalar_type_format_char(self.type) + + @property + def is_pointer(self) -> bool: + return scalar_type_pointer(self.type) + + +class TypeInfo(NamedTuple): + key: str + size: int + name: Optional[str] = None + members: Optional[List[FieldListItem]] = None + + def is_scalar(self) -> bool: + # TODO: distinction between a class with zero members and no vtable? + return self.members is None + + +def normalize_type_id(key: str) -> str: + """Helper for TYPES parsing to ensure a consistent format. + If key begins with "T_" it is a built-in type. + Else it is a hex string. We prefer lower case letters and + no leading zeroes. (UDT identifier pads to 8 characters.)""" + if key.startswith("T_"): + # Remove numeric value for "T_" type. We don't use this. + return key[: key.index("(")] if "(" in key else key + + return hex(int(key, 16)).lower() + + +def scalar_type_pointer(type_name: str) -> bool: + return type_name.startswith("T_32P") + + +def scalar_type_size(type_name: str) -> int: + if scalar_type_pointer(type_name): + return 4 + + if "CHAR" in type_name: + return 2 if "WCHAR" in type_name else 1 + + if "SHORT" in type_name: + return 2 + + if "QUAD" in type_name or "64" in type_name: + return 8 + + return 4 + + +def scalar_type_signed(type_name: str) -> bool: + if scalar_type_pointer(type_name): + return False + + # According to cvinfo.h, T_WCHAR is unsigned + return not type_name.startswith("T_U") and not type_name.startswith("T_W") + + +def scalar_type_format_char(type_name: str) -> str: + if scalar_type_pointer(type_name): + return "L" + + # "Really a char" + if type_name.startswith("T_RCHAR"): + return "c" + + # floats + if type_name.startswith("T_REAL"): + return "d" if "64" in type_name else "f" + + size = scalar_type_size(type_name) + char = ({1: "b", 2: "h", 4: "l", 8: "q"}).get(size, "l") + + return char if scalar_type_signed(type_name) else char.upper() + + +def member_string_iter( + members: List[ScalarType], size: Optional[int] = None +) -> Iterator[str]: + if len(members) == 0: + yield "x" * (size or 0) + + last_offset = 0 + last_size = 0 + for m in members: + padding = m.offset - last_offset - last_size + if padding > 0: + yield "x" * padding + + yield m.format_char + last_offset = m.offset + last_size = m.size + + if size is not None: + padding = size - (last_offset + last_size) + if padding > 0: + yield "x" * padding + + +def member_list_to_struct_string( + members: List[ScalarType], size: Optional[int] = None +) -> str: + """Create a string for use with struct.unpack + Will pad to `size` bytes if present.""" + if len(members) == 0: + return "x" * (size or 0) + + format_string = "".join(list(member_string_iter(members, size))) + if len(format_string) > 0: + return "<" + format_string + + return "" + + +def join_member_names(parent: str, child: Optional[str]) -> str: + """Helper method to combine parent/child member names. + Child member name is None if the child is a scalar type.""" + + if child is None: + return parent + + # If the child is an array index, join without the dot + if child.startswith("["): + return f"{parent}{child}" + + return f"{parent}.{child}" + + +class CvdumpTypesParser: + """Parser for cvdump output, TYPES section. + Tricky enough that it demands its own parser.""" + + # Marks the start of a new type + INDEX_RE = re.compile(r"(?P0x\w+) : .* (?PLF_\w+)") + + # LF_FIELDLIST class/struct member (1/2) + LIST_RE = re.compile( + r"\s+list\[\d+\] = LF_MEMBER, (?P\w+), type = (?P.*), offset = (?P\d+)" + ) + + # LF_FIELDLIST vtable indicator + VTABLE_RE = re.compile(r"^\s+list\[\d+\] = LF_VFUNCTAB") + + # LF_FIELDLIST superclass indicator + SUPERCLASS_RE = re.compile( + r"^\s+list\[\d+\] = LF_BCLASS, (?P\w+), type = (?P.*), offset = (?P\d+)" + ) + + # LF_FIELDLIST member name (2/2) + MEMBER_RE = re.compile(r"^\s+member name = '(?P.*)'$") + + # LF_ARRAY element type + ARRAY_ELEMENT_RE = re.compile(r"^\s+Element type = (?P.*)") + + # LF_ARRAY total array size + ARRAY_LENGTH_RE = re.compile(r"^\s+length = (?P\d+)") + + # LF_CLASS/LF_STRUCTURE field list reference + CLASS_FIELD_RE = re.compile( + r"^\s+# members = \d+, field list type (?P0x\w+)," + ) + + # LF_CLASS/LF_STRUCTURE name and other info + CLASS_NAME_RE = re.compile( + r"^\s+Size = (?P\d+), class name = (?P.+), UDT\((?P0x\w+)\)" + ) + + # LF_MODIFIER, type being modified + MODIFIES_RE = re.compile(r".*modifies type (?P.*)$") + + def __init__(self) -> None: + self.mode = "" + self.last_key = "" + self.keys = {} + + def _new_type(self): + """Prepare a new dict for the type we just parsed. + The id is self.last_key and the "type" of type is self.mode. + e.g. LF_CLASS""" + self.keys[self.last_key] = {"type": self.mode} + + def _set(self, key: str, value): + self.keys[self.last_key][key] = value + + def _add_member(self, offset: int, type_: str): + obj = self.keys[self.last_key] + if "members" not in obj: + obj["members"] = [] + + obj["members"].append({"offset": offset, "type": type_}) + + def _set_member_name(self, name: str): + """Set name for most recently added member.""" + obj = self.keys[self.last_key] + obj["members"][-1]["name"] = name + + def _get_field_list(self, type_obj: Dict) -> List[FieldListItem]: + """Return the field list for the given LF_CLASS/LF_STRUCTURE reference""" + + if type_obj.get("type") == "LF_FIELDLIST": + field_obj = type_obj + else: + field_list_type = type_obj.get("field_list_type") + field_obj = self.keys[field_list_type] + + members: List[FieldListItem] = [] + + super_id = field_obj.get("super") + if super_id is not None: + # May need to resolve forward ref. + superclass = self.get(super_id) + if superclass.members is not None: + members = superclass.members + + raw_members = field_obj.get("members", []) + members += [ + FieldListItem( + offset=m["offset"], + type=m["type"], + name=m["name"], + ) + for m in raw_members + ] + + return sorted(members, key=lambda m: m.offset) + + def _mock_array_members(self, type_obj: Dict) -> List[FieldListItem]: + """LF_ARRAY elements provide the element type and the total size. + We want the list of "members" as if this was a struct.""" + + if type_obj.get("type") != "LF_ARRAY": + raise CvdumpTypeError("Type is not an LF_ARRAY") + + array_type = type_obj.get("array_type") + if array_type is None: + raise CvdumpIntegrityError("No array element type") + + array_element_size = self.get(array_type).size + + n_elements = type_obj["size"] // array_element_size + + return [ + FieldListItem( + offset=i * array_element_size, + type=array_type, + name=f"[{i}]", + ) + for i in range(n_elements) + ] + + def get(self, type_key: str) -> TypeInfo: + """Convert our dictionary values read from the cvdump output + into a consistent format for the given type.""" + + # Scalar type. Handled here because it makes the recursive steps + # much simpler. + if type_key.startswith("T_"): + size = scalar_type_size(type_key) + return TypeInfo( + key=type_key, + size=size, + ) + + # Go to our dictionary to find it. + obj = self.keys.get(type_key.lower()) + if obj is None: + raise CvdumpKeyError(type_key) + + # These type references are just a wrapper around a scalar + if obj.get("type") == "LF_ENUM": + return self.get("T_INT4") + + if obj.get("type") == "LF_POINTER": + return self.get("T_32PVOID") + + if obj.get("is_forward_ref", False): + # Get the forward reference to follow. + # If this is LF_CLASS/LF_STRUCTURE, it is the UDT value. + # For LF_MODIFIER, it is the type being modified. + forward_ref = obj.get("udt", None) or obj.get("modifies", None) + if forward_ref is None: + raise CvdumpIntegrityError(f"Null forward ref for type {type_key}") + + return self.get(forward_ref) + + # Else it is not a forward reference, so build out the object here. + if obj.get("type") == "LF_ARRAY": + members = self._mock_array_members(obj) + else: + members = self._get_field_list(obj) + + return TypeInfo( + key=type_key, + size=obj.get("size"), + name=obj.get("name"), + members=members, + ) + + def get_by_name(self, name: str) -> TypeInfo: + """Find the complex type with the given name.""" + # TODO + raise NotImplementedError + + def get_scalars(self, type_key: str) -> List[ScalarType]: + """Reduce the given type to a list of scalars so we can + compare each component value.""" + + obj = self.get(type_key) + if obj.is_scalar(): + # Use obj.key here for alias types like LF_POINTER + return [ScalarType(offset=0, type=obj.key, name=None)] + + # mypy? + assert obj.members is not None + + # Dedupe repeated offsets if this is a union type + unique_offsets = {m.offset: m for m in obj.members} + unique_members = [m for _, m in unique_offsets.items()] + + return [ + ScalarType( + offset=m.offset + cm.offset, + type=cm.type, + name=join_member_names(m.name, cm.name), + ) + for m in unique_members + for cm in self.get_scalars(m.type) + ] + + def get_format_string(self, type_key: str) -> str: + obj = self.get(type_key) + members = self.get_scalars(type_key) + # We need both to pad the data to size + return member_list_to_struct_string(members, obj.size) + + def read_line(self, line: str): + if (match := self.INDEX_RE.match(line)) is not None: + self.last_key = normalize_type_id(match.group("key")) + self.mode = match.group("type") + self._new_type() + + # We don't need to read anything else from here (for now) + if self.mode in ("LF_ENUM", "LF_POINTER"): + self._set("size", 4) + + if self.mode == "LF_MODIFIER": + if (match := self.MODIFIES_RE.match(line)) is not None: + # For convenience, because this is essentially the same thing + # as an LF_CLASS forward ref. + self._set("is_forward_ref", True) + self._set("modifies", normalize_type_id(match.group("type"))) + + if self.mode == "LF_ARRAY": + if (match := self.ARRAY_ELEMENT_RE.match(line)) is not None: + self._set("array_type", normalize_type_id(match.group("type"))) + + if (match := self.ARRAY_LENGTH_RE.match(line)) is not None: + self._set("size", int(match.group("length"))) + + if self.mode == "LF_FIELDLIST": + # If this class has a vtable, create a mock member at offset 0 + if (match := self.VTABLE_RE.match(line)) is not None: + # For our purposes, any pointer type will do + self._add_member(0, "T_32PVOID") + self._set_member_name("vftable") + + # Superclass is set here in the fieldlist rather than in LF_CLASS + if (match := self.SUPERCLASS_RE.match(line)) is not None: + self._set("super", normalize_type_id(match.group("type"))) + + # Member offset and type given on the first of two lines. + if (match := self.LIST_RE.match(line)) is not None: + self._add_member( + int(match.group("offset")), normalize_type_id(match.group("type")) + ) + + # Name of the member read on the second of two lines. + if (match := self.MEMBER_RE.match(line)) is not None: + self._set_member_name(match.group("name")) + + if self.mode in ("LF_STRUCTURE", "LF_CLASS"): + # Match the reference to the associated LF_FIELDLIST + if (match := self.CLASS_FIELD_RE.match(line)) is not None: + if match.group("field_type") == "0x0000": + # Not redundant. UDT might not match the key. + # These cases get reported as UDT mismatch. + self._set("is_forward_ref", True) + else: + field_list_type = normalize_type_id(match.group("field_type")) + self._set("field_list_type", field_list_type) + + # Last line has the vital information. + # If this is a FORWARD REF, we need to follow the UDT pointer + # to get the actual class details. + if (match := self.CLASS_NAME_RE.match(line)) is not None: + self._set("name", match.group("name")) + self._set("udt", normalize_type_id(match.group("udt"))) + self._set("size", int(match.group("size"))) diff --git a/tools/isledecomp/tests/test_cvdump.py b/tools/isledecomp/tests/test_cvdump.py index cfaff7a9..3670e495 100644 --- a/tools/isledecomp/tests/test_cvdump.py +++ b/tools/isledecomp/tests/test_cvdump.py @@ -1,39 +1,59 @@ import pytest -from isledecomp.cvdump.analysis import data_type_info +from isledecomp.cvdump.types import ( + scalar_type_size, + scalar_type_pointer, + scalar_type_signed, +) + +# These are all the types seen in the cvdump. +# We have char, short, int, long, long long, float, and double all represented +# in both signed and unsigned. +# We can also identify a 4 byte pointer with the T_32 prefix. +# The type T_VOID is used to designate a function's return type. +# T_NOTYPE is specified as the type of "this" for a static function in a class. + +# For reference: https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h # fmt: off -type_check_cases = [ - ("T_32PINT4", 4, True), - ("T_32PLONG", 4, True), - ("T_32PRCHAR", 4, True), - ("T_32PREAL32", 4, True), - ("T_32PUCHAR", 4, True), - ("T_32PUINT4", 4, True), - ("T_32PULONG", 4, True), - ("T_32PUSHORT", 4, True), - ("T_32PVOID", 4, True), - ("T_CHAR", 1, False), - ("T_INT4", 4, False), - ("T_LONG", 4, False), - ("T_NOTYPE", 0, False), # ? - ("T_QUAD", 8, False), - ("T_RCHAR", 1, False), - ("T_REAL32", 4, False), - ("T_REAL64", 8, False), - ("T_SHORT", 2, False), - ("T_UCHAR", 1, False), - ("T_UINT4", 4, False), - ("T_ULONG", 4, False), - ("T_UQUAD", 8, False), - ("T_USHORT", 2, False), - ("T_VOID", 0, False), # ? - ("T_WCHAR", 2, False), -] +# Fields are: type_name, size, is_signed, is_pointer +type_check_cases = ( + ("T_32PINT4", 4, False, True), + ("T_32PLONG", 4, False, True), + ("T_32PRCHAR", 4, False, True), + ("T_32PREAL32", 4, False, True), + ("T_32PUCHAR", 4, False, True), + ("T_32PUINT4", 4, False, True), + ("T_32PULONG", 4, False, True), + ("T_32PUSHORT", 4, False, True), + ("T_32PVOID", 4, False, True), + ("T_CHAR", 1, True, False), + ("T_INT4", 4, True, False), + ("T_LONG", 4, True, False), + ("T_QUAD", 8, True, False), + ("T_RCHAR", 1, True, False), + ("T_REAL32", 4, True, False), + ("T_REAL64", 8, True, False), + ("T_SHORT", 2, True, False), + ("T_UCHAR", 1, False, False), + ("T_UINT4", 4, False, False), + ("T_ULONG", 4, False, False), + ("T_UQUAD", 8, False, False), + ("T_USHORT", 2, False, False), + ("T_WCHAR", 2, False, False), +) # fmt: on -@pytest.mark.parametrize("type_name, size, is_pointer", type_check_cases) -def test_type_check(type_name: str, size: int, is_pointer: bool): - assert (info := data_type_info(type_name)) is not None - assert info[0] == size - assert info[1] == is_pointer +@pytest.mark.parametrize("type_name, size, _, __", type_check_cases) +def test_scalar_size(type_name: str, size: int, _, __): + assert scalar_type_size(type_name) == size + + +@pytest.mark.parametrize("type_name, _, is_signed, __", type_check_cases) +def test_scalar_signed(type_name: str, _, is_signed: bool, __): + assert scalar_type_signed(type_name) == is_signed + + +@pytest.mark.parametrize("type_name, _, __, is_pointer", type_check_cases) +def test_scalar_pointer(type_name: str, _, __, is_pointer: bool): + assert scalar_type_pointer(type_name) == is_pointer diff --git a/tools/isledecomp/tests/test_cvdump_types.py b/tools/isledecomp/tests/test_cvdump_types.py new file mode 100644 index 00000000..d6182b25 --- /dev/null +++ b/tools/isledecomp/tests/test_cvdump_types.py @@ -0,0 +1,452 @@ +"""Specifically testing the Cvdump TYPES parser +and type dependency tree walker.""" + +import pytest +from isledecomp.cvdump.types import ( + CvdumpTypesParser, + CvdumpKeyError, + CvdumpIntegrityError, +) + +TEST_LINES = """ +0x1028 : Length = 10, Leaf = 0x1001 LF_MODIFIER + const, modifies type T_REAL32(0040) + +0x103b : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = T_REAL32(0040) + Index type = T_SHORT(0011) + length = 16 + Name = + +0x103c : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = 0x103B + Index type = T_SHORT(0011) + length = 64 + Name = + +0x10e0 : Length = 86, Leaf = 0x1203 LF_FIELDLIST + list[0] = LF_MEMBER, public, type = T_REAL32(0040), offset = 0 + member name = 'x' + list[1] = LF_MEMBER, public, type = T_REAL32(0040), offset = 0 + member name = 'dvX' + list[2] = LF_MEMBER, public, type = T_REAL32(0040), offset = 4 + member name = 'y' + list[3] = LF_MEMBER, public, type = T_REAL32(0040), offset = 4 + member name = 'dvY' + list[4] = LF_MEMBER, public, type = T_REAL32(0040), offset = 8 + member name = 'z' + list[5] = LF_MEMBER, public, type = T_REAL32(0040), offset = 8 + member name = 'dvZ' + +0x10e1 : Length = 34, Leaf = 0x1505 LF_STRUCTURE + # members = 6, field list type 0x10e0, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 12, class name = _D3DVECTOR, UDT(0x000010e1) + +0x10e4 : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = T_UCHAR(0020) + Index type = T_SHORT(0011) + length = 8 + Name = + +0x10ea : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = 0x1028 + Index type = T_SHORT(0011) + length = 12 + Name = + +0x11f0 : Length = 30, Leaf = 0x1504 LF_CLASS + # members = 0, field list type 0x0000, FORWARD REF, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 0, class name = MxRect32, UDT(0x00001214) + +0x11f2 : Length = 10, Leaf = 0x1001 LF_MODIFIER + const, modifies type 0x11F0 + +0x1213 : Length = 530, Leaf = 0x1203 LF_FIELDLIST + list[0] = LF_METHOD, count = 5, list = 0x1203, name = 'MxRect32' + list[1] = LF_ONEMETHOD, public, VANILLA, index = 0x1205, name = 'operator=' + list[2] = LF_ONEMETHOD, public, VANILLA, index = 0x11F5, name = 'Intersect' + list[3] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'SetPoint' + list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'AddPoint' + list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'SubtractPoint' + list[6] = LF_ONEMETHOD, public, VANILLA, index = 0x11F5, name = 'UpdateBounds' + list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x1209, name = 'IsValid' + list[8] = LF_ONEMETHOD, public, VANILLA, index = 0x120A, name = 'IntersectsWith' + list[9] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetWidth' + list[10] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetHeight' + list[11] = LF_ONEMETHOD, public, VANILLA, index = 0x120C, name = 'GetPoint' + list[12] = LF_ONEMETHOD, public, VANILLA, index = 0x120D, name = 'GetSize' + list[13] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetLeft' + list[14] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetTop' + list[15] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetRight' + list[16] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetBottom' + list[17] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetLeft' + list[18] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetTop' + list[19] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetRight' + list[20] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetBottom' + list[21] = LF_METHOD, count = 3, list = 0x1211, name = 'CopyFrom' + list[22] = LF_ONEMETHOD, private, STATIC, index = 0x1212, name = 'Min' + list[23] = LF_ONEMETHOD, private, STATIC, index = 0x1212, name = 'Max' + list[24] = LF_MEMBER, private, type = T_INT4(0074), offset = 0 + member name = 'm_left' + list[25] = LF_MEMBER, private, type = T_INT4(0074), offset = 4 + member name = 'm_top' + list[26] = LF_MEMBER, private, type = T_INT4(0074), offset = 8 + member name = 'm_right' + list[27] = LF_MEMBER, private, type = T_INT4(0074), offset = 12 + member name = 'm_bottom' + +0x1214 : Length = 30, Leaf = 0x1504 LF_CLASS + # members = 34, field list type 0x1213, CONSTRUCTOR, OVERLOAD, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 16, class name = MxRect32, UDT(0x00001214) + +0x1220 : Length = 30, Leaf = 0x1504 LF_CLASS + # members = 0, field list type 0x0000, FORWARD REF, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 0, class name = MxCore, UDT(0x00004060) + +0x14db : Length = 30, Leaf = 0x1504 LF_CLASS + # members = 0, field list type 0x0000, FORWARD REF, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 0, class name = MxString, UDT(0x00004db6) + +0x19b0 : Length = 34, Leaf = 0x1505 LF_STRUCTURE + # members = 0, field list type 0x0000, FORWARD REF, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 0, class name = ROIColorAlias, UDT(0x00002a76) + +0x19b1 : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = 0x19B0 + Index type = T_SHORT(0011) + length = 440 + Name = + +0x2a75 : Length = 98, Leaf = 0x1203 LF_FIELDLIST + list[0] = LF_MEMBER, public, type = T_32PRCHAR(0470), offset = 0 + member name = 'm_name' + list[1] = LF_MEMBER, public, type = T_INT4(0074), offset = 4 + member name = 'm_red' + list[2] = LF_MEMBER, public, type = T_INT4(0074), offset = 8 + member name = 'm_green' + list[3] = LF_MEMBER, public, type = T_INT4(0074), offset = 12 + member name = 'm_blue' + list[4] = LF_MEMBER, public, type = T_INT4(0074), offset = 16 + member name = 'm_unk0x10' + +0x2a76 : Length = 34, Leaf = 0x1505 LF_STRUCTURE + # members = 5, field list type 0x2a75, + Derivation list type 0x0000, VT shape type 0x0000 + Size = 20, class name = ROIColorAlias, UDT(0x00002a76) + +0x22d4 : Length = 154, Leaf = 0x1203 LF_FIELDLIST + list[0] = LF_VFUNCTAB, type = 0x20FC + list[1] = LF_METHOD, count = 3, list = 0x22D0, name = 'MxVariable' + list[2] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F0F, + vfptr offset = 0, name = 'GetValue' + list[3] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F10, + vfptr offset = 4, name = 'SetValue' + list[4] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F11, + vfptr offset = 8, name = '~MxVariable' + list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x22D3, name = 'GetKey' + list[6] = LF_MEMBER, protected, type = 0x14DB, offset = 4 + member name = 'm_key' + list[7] = LF_MEMBER, protected, type = 0x14DB, offset = 20 + member name = 'm_value' + +0x22d5 : Length = 34, Leaf = 0x1504 LF_CLASS + # members = 10, field list type 0x22d4, CONSTRUCTOR, + Derivation list type 0x0000, VT shape type 0x20fb + Size = 36, class name = MxVariable, UDT(0x00004041) + +0x3cc2 : Length = 38, Leaf = 0x1507 LF_ENUM + # members = 64, type = T_INT4(0074) field list type 0x3cc1 +NESTED, enum name = JukeBox::JukeBoxScript, UDT(0x00003cc2) + +0x3fab : Length = 10, Leaf = 0x1002 LF_POINTER + Pointer (NEAR32), Size: 0 + Element type : 0x3FAA + +0x405f : Length = 158, Leaf = 0x1203 LF_FIELDLIST + list[0] = LF_VFUNCTAB, type = 0x2090 + list[1] = LF_ONEMETHOD, public, VANILLA, index = 0x176A, name = 'MxCore' + list[2] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x176A, + vfptr offset = 0, name = '~MxCore' + list[3] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x176B, + vfptr offset = 4, name = 'Notify' + list[4] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x2087, + vfptr offset = 8, name = 'Tickle' + list[5] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x202F, + vfptr offset = 12, name = 'ClassName' + list[6] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x2030, + vfptr offset = 16, name = 'IsA' + list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x2091, name = 'GetId' + list[8] = LF_MEMBER, private, type = T_UINT4(0075), offset = 4 + member name = 'm_id' + +0x4060 : Length = 30, Leaf = 0x1504 LF_CLASS + # members = 9, field list type 0x405f, CONSTRUCTOR, + Derivation list type 0x0000, VT shape type 0x1266 + Size = 8, class name = MxCore, UDT(0x00004060) + +0x4262 : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = 0x3CC2 + Index type = T_SHORT(0011) + length = 24 + Name = + +0x432f : Length = 14, Leaf = 0x1503 LF_ARRAY + Element type = T_INT4(0074) + Index type = T_SHORT(0011) + length = 12 + Name = + +0x4db5 : Length = 246, Leaf = 0x1203 LF_FIELDLIST + list[0] = LF_BCLASS, public, type = 0x1220, offset = 0 + list[1] = LF_METHOD, count = 3, list = 0x14E3, name = 'MxString' + list[2] = LF_ONEMETHOD, public, VIRTUAL, index = 0x14DE, name = '~MxString' + list[3] = LF_METHOD, count = 2, list = 0x14E7, name = 'operator=' + list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x14DE, name = 'ToUpperCase' + list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x14DE, name = 'ToLowerCase' + list[6] = LF_ONEMETHOD, public, VANILLA, index = 0x14E8, name = 'operator+' + list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x14E9, name = 'operator+=' + list[8] = LF_ONEMETHOD, public, VANILLA, index = 0x14EB, name = 'Compare' + list[9] = LF_ONEMETHOD, public, VANILLA, index = 0x14EC, name = 'GetData' + list[10] = LF_ONEMETHOD, public, VANILLA, index = 0x4DB4, name = 'GetLength' + list[11] = LF_MEMBER, private, type = T_32PRCHAR(0470), offset = 8 + member name = 'm_data' + list[12] = LF_MEMBER, private, type = T_USHORT(0021), offset = 12 + member name = 'm_length' + +0x4db6 : Length = 30, Leaf = 0x1504 LF_CLASS + # members = 16, field list type 0x4db5, CONSTRUCTOR, OVERLOAD, + Derivation list type 0x0000, VT shape type 0x1266 + Size = 16, class name = MxString, UDT(0x00004db6) +""" + + +@pytest.fixture(name="parser") +def types_parser_fixture(): + parser = CvdumpTypesParser() + for line in TEST_LINES.split("\n"): + parser.read_line(line) + + return parser + + +def test_basic_parsing(parser): + obj = parser.keys["0x4db6"] + assert obj["type"] == "LF_CLASS" + assert obj["name"] == "MxString" + assert obj["udt"] == "0x4db6" + + assert len(parser.keys["0x4db5"]["members"]) == 2 + + +def test_scalar_types(parser): + """Full tests on the scalar_* methods are in another file. + Here we are just testing the passthrough of the "T_" types.""" + assert parser.get("T_CHAR").name is None + assert parser.get("T_CHAR").size == 1 + + assert parser.get("T_32PVOID").name is None + assert parser.get("T_32PVOID").size == 4 + + +def test_resolve_forward_ref(parser): + # Non-forward ref + assert parser.get("0x22d5").name == "MxVariable" + # Forward ref + assert parser.get("0x14db").name == "MxString" + assert parser.get("0x14db").size == 16 + + +def test_members(parser): + """Return the list of items to compare for a given complex type. + If the class has a superclass, add those members too.""" + # MxCore field list + mxcore_members = parser.get_scalars("0x405f") + assert mxcore_members == [ + (0, "vftable", "T_32PVOID"), + (4, "m_id", "T_UINT4"), + ] + + # MxCore class id. Should be the same members + assert mxcore_members == parser.get_scalars("0x4060") + + # MxString field list. Should add inherited members from MxCore + assert parser.get_scalars("0x4db5") == [ + (0, "vftable", "T_32PVOID"), + (4, "m_id", "T_UINT4"), + (8, "m_data", "T_32PRCHAR"), + (12, "m_length", "T_USHORT"), + ] + + +def test_members_recursive(parser): + """Make sure that we unwrap the dependency tree correctly.""" + # MxVariable field list + assert parser.get_scalars("0x22d4") == [ + (0, "vftable", "T_32PVOID"), + (4, "m_key.vftable", "T_32PVOID"), + (8, "m_key.m_id", "T_UINT4"), + (12, "m_key.m_data", "T_32PRCHAR"), + (16, "m_key.m_length", "T_USHORT"), # with padding + (20, "m_value.vftable", "T_32PVOID"), + (24, "m_value.m_id", "T_UINT4"), + (28, "m_value.m_data", "T_32PRCHAR"), + (32, "m_value.m_length", "T_USHORT"), # with padding + ] + + +def test_struct(parser): + """Basic test for converting type into struct.unpack format string.""" + # MxCore: vftable and uint32. The vftable pointer is read as uint32. + assert parser.get_format_string("0x4060") == "