From b898d985158f67e51d7aa9958743b6da6be6c79c Mon Sep 17 00:00:00 2001 From: jonschz <17198703+jonschz@users.noreply.github.com> Date: Thu, 29 Aug 2024 20:54:23 +0200 Subject: [PATCH] BETA10: reccomp support and Ghidra imports (#1091) * Implement core functionality (before linter) * run linter + formatter * Review: remove redundant code * Implement end of range check for vtables --------- Co-authored-by: jonschz --- .gitignore | 9 +- .../legoomni/src/race/legoracespecial.cpp | 2 +- LEGO1/realtime/vector.h | 6 +- .../import_functions_and_types_from_pdb.py | 52 ++++++++- .../lego_util/function_importer.py | 96 ++++++++++++---- .../ghidra_scripts/lego_util/ghidra_helper.py | 36 ++++-- tools/ghidra_scripts/lego_util/headers.pyi | 3 +- .../lego_util/pdb_extraction.py | 29 ++--- tools/isledecomp/isledecomp/compare/core.py | 107 +++++++++++------- tools/isledecomp/isledecomp/compare/db.py | 5 + util/decomp.h | 6 + 11 files changed, 254 insertions(+), 97 deletions(-) diff --git a/.gitignore b/.gitignore index d335e177..b5e87a00 100644 --- a/.gitignore +++ b/.gitignore @@ -12,11 +12,16 @@ ENV/ VENV/ env.bak/ venv.bak/ -ISLE.EXE -LEGO1.DLL /build/ +/build_debug/ +/legobin/ *.swp LEGO1PROGRESS.* ISLEPROGRESS.* *.pyc tools/ghidra_scripts/import.log + +# By convention we put the retail binaries into ./legobin. +# These entries are kept for now since that convention has not always been around. +ISLE.EXE +LEGO1.DLL diff --git a/LEGO1/lego/legoomni/src/race/legoracespecial.cpp b/LEGO1/lego/legoomni/src/race/legoracespecial.cpp index 4c7d0c19..6bfb2565 100644 --- a/LEGO1/lego/legoomni/src/race/legoracespecial.cpp +++ b/LEGO1/lego/legoomni/src/race/legoracespecial.cpp @@ -209,7 +209,7 @@ void LegoCarRaceActor::SwitchBoundary(LegoPathBoundary*& p_boundary, LegoUnknown } // FUNCTION: LEGO1 0x10080b70 -// FUNCTION: BETA10 0x1000366b +// FUNCTION: BETA10 0x100cdbae void LegoCarRaceActor::VTable0x70(float p_float) { // m_unk0x0c is not an MxBool, there are places where it is set to 2 or higher diff --git a/LEGO1/realtime/vector.h b/LEGO1/realtime/vector.h index 9c73f31f..2db647c3 100644 --- a/LEGO1/realtime/vector.h +++ b/LEGO1/realtime/vector.h @@ -89,7 +89,7 @@ class Vector2 { virtual float Dot(float* p_a, float* p_b) const { return DotImpl(p_a, p_b); } // vtable+0x3c // FUNCTION: LEGO1 0x100020f0 - // FUNCTION: BETA10 0x100028f6 + // FUNCTION: BETA10 0x100108c0 virtual float Dot(Vector2* p_a, Vector2* p_b) const { return DotImpl(p_a->m_data, p_b->m_data); } // vtable+0x38 // FUNCTION: LEGO1 0x10002110 @@ -188,7 +188,7 @@ class Vector3 : public Vector2 { // in reverse order of appearance. // FUNCTION: LEGO1 0x10002270 - // FUNCTION: BETA10 0x100064a1 + // FUNCTION: BETA10 0x10011350 virtual void EqualsCrossImpl(float* p_a, float* p_b) { m_data[0] = p_a[1] * p_b[2] - p_a[2] * p_b[1]; @@ -275,7 +275,7 @@ class Vector3 : public Vector2 { void EqualsImpl(float* p_data) override { memcpy(m_data, p_data, sizeof(float) * 3); } // vtable+0x20 // FUNCTION: LEGO1 0x10003bc0 - // FUNCTION: BETA10 0x1000132a + // FUNCTION: BETA10 0x100114f0 void Clear() override { memset(m_data, 0, sizeof(float) * 3); } // vtable+0x2c // FUNCTION: LEGO1 0x10003bd0 diff --git a/tools/ghidra_scripts/import_functions_and_types_from_pdb.py b/tools/ghidra_scripts/import_functions_and_types_from_pdb.py index 0d77e7f6..54bb7e9f 100644 --- a/tools/ghidra_scripts/import_functions_and_types_from_pdb.py +++ b/tools/ghidra_scripts/import_functions_and_types_from_pdb.py @@ -21,6 +21,7 @@ # Disable spurious warnings in vscode / pylance # pyright: reportMissingModuleSource=false +from enum import Enum import importlib from dataclasses import dataclass, field import logging.handlers @@ -64,6 +65,25 @@ class Globals: statistics: Statistics = field(default_factory=Statistics) +class SupportedModules(Enum): + LEGO1 = 1 + BETA10 = 2 + + def orig_filename(self): + if self == self.LEGO1: + return "LEGO1.DLL" + return "BETA10.DLL" + + def recomp_filename_without_extension(self): + # in case we want to support more functions + return "LEGO1" + + def build_dir_name(self): + if self == self.BETA10: + return "build_debug" + return "build" + + # hard-coded settings that we don't want to prompt in Ghidra every time GLOBALS = Globals( verbose=False, @@ -133,7 +153,7 @@ def import_function_into_ghidra( # Find the Ghidra function at that address ghidra_address = getAddressFactory().getAddress(hex_original_address) # pylint: disable=possibly-used-before-assignment - function_importer = PdbFunctionImporter(api, pdb_function, type_importer) + function_importer = PdbFunctionImporter.build(api, pdb_function, type_importer) ghidra_function = getFunctionAt(ghidra_address) if ghidra_function is None: @@ -208,11 +228,29 @@ def log_and_track_failure( def main(): + if GLOBALS.running_from_ghidra: + origfile_name = getProgramFile().getName() + + if origfile_name == "LEGO1.DLL": + module = SupportedModules.LEGO1 + elif origfile_name in ["LEGO1D.DLL", "BETA10.DLL"]: + module = SupportedModules.BETA10 + else: + raise Lego1Exception( + f"Unsupported file name in import script: {origfile_name}" + ) + else: + module = SupportedModules.LEGO1 + + logger.info("Importing file: %s", module.orig_filename()) + repo_root = get_repository_root() - origfile_path = repo_root.joinpath("LEGO1.DLL") - build_path = repo_root.joinpath("build") - recompiledfile_path = build_path.joinpath("LEGO1.DLL") - pdb_path = build_path.joinpath("LEGO1.pdb") + origfile_path = repo_root.joinpath("legobin").joinpath(module.orig_filename()) + build_directory = repo_root.joinpath(module.build_dir_name()) + recompiledfile_name = f"{module.recomp_filename_without_extension()}.DLL" + recompiledfile_path = build_directory.joinpath(recompiledfile_name) + pdbfile_name = f"{module.recomp_filename_without_extension()}.PDB" + pdbfile_path = build_directory.joinpath(pdbfile_name) if not GLOBALS.verbose: logging.getLogger("isledecomp.bin").setLevel(logging.WARNING) @@ -225,7 +263,9 @@ def main(): with Bin(str(origfile_path), find_str=True) as origfile, Bin( str(recompiledfile_path) ) as recompfile: - isle_compare = IsleCompare(origfile, recompfile, str(pdb_path), str(repo_root)) + isle_compare = IsleCompare( + origfile, recompfile, str(pdbfile_path), str(repo_root) + ) logger.info("Comparison complete.") diff --git a/tools/ghidra_scripts/lego_util/function_importer.py b/tools/ghidra_scripts/lego_util/function_importer.py index 6ccc6935..379d412f 100644 --- a/tools/ghidra_scripts/lego_util/function_importer.py +++ b/tools/ghidra_scripts/lego_util/function_importer.py @@ -5,6 +5,7 @@ import logging from typing import Optional +from abc import ABC, abstractmethod from ghidra.program.model.listing import Function, Parameter from ghidra.program.flatapi import FlatProgramAPI @@ -24,6 +25,7 @@ ) from lego_util.ghidra_helper import ( add_data_type_or_reuse_existing, + create_ghidra_namespace, get_or_add_pointer_type, get_ghidra_namespace, sanitize_name, @@ -32,12 +34,10 @@ from lego_util.exceptions import StackOffsetMismatchError, Lego1Exception from lego_util.type_importer import PdbTypeImporter - logger = logging.getLogger(__name__) -# pylint: disable=too-many-instance-attributes -class PdbFunctionImporter: +class PdbFunctionImporter(ABC): """A representation of a function from the PDB with each type replaced by a Ghidra type instance.""" def __init__( @@ -48,20 +48,79 @@ def __init__( ): self.api = api self.match_info = func.match_info - self.signature = func.signature - self.is_stub = func.is_stub self.type_importer = type_importer - if self.signature.class_type is not None: - # Import the base class so the namespace exists - self.type_importer.import_pdb_type_into_ghidra(self.signature.class_type) - assert self.match_info.name is not None colon_split = sanitize_name(self.match_info.name).split("::") self.name = colon_split.pop() namespace_hierachy = colon_split - self.namespace = get_ghidra_namespace(api, namespace_hierachy) + self.namespace = self._do_get_namespace(namespace_hierachy) + + def _do_get_namespace(self, namespace_hierarchy: list[str]): + return get_ghidra_namespace(self.api, namespace_hierarchy) + + def get_full_name(self) -> str: + return f"{self.namespace.getName()}::{self.name}" + + @staticmethod + def build(api: FlatProgramAPI, func: PdbFunction, type_importer: "PdbTypeImporter"): + return ( + ThunkPdbFunctionImport(api, func, type_importer) + if func.signature is None + else FullPdbFunctionImporter(api, func, type_importer) + ) + + @abstractmethod + def matches_ghidra_function(self, ghidra_function: Function) -> bool: + ... + + @abstractmethod + def overwrite_ghidra_function(self, ghidra_function: Function): + ... + + +class ThunkPdbFunctionImport(PdbFunctionImporter): + """For importing thunk functions (like vtordisp or debug build thunks) into Ghidra. + Only the name of the function will be imported.""" + + def _do_get_namespace(self, namespace_hierarchy: list[str]): + """We need to create the namespace because we don't import the return type here""" + return create_ghidra_namespace(self.api, namespace_hierarchy) + + def matches_ghidra_function(self, ghidra_function: Function) -> bool: + name_match = self.name == ghidra_function.getName(False) + namespace_match = self.namespace == ghidra_function.getParentNamespace() + + logger.debug("Matches: namespace=%s name=%s", namespace_match, name_match) + + return name_match and namespace_match + + def overwrite_ghidra_function(self, ghidra_function: Function): + ghidra_function.setName(self.name, SourceType.USER_DEFINED) + ghidra_function.setParentNamespace(self.namespace) + + +# pylint: disable=too-many-instance-attributes +class FullPdbFunctionImporter(PdbFunctionImporter): + """For importing functions into Ghidra where all information are available.""" + + def __init__( + self, + api: FlatProgramAPI, + func: PdbFunction, + type_importer: "PdbTypeImporter", + ): + super().__init__(api, func, type_importer) + + assert func.signature is not None + self.signature = func.signature + + self.is_stub = func.is_stub + + if self.signature.class_type is not None: + # Import the base class so the namespace exists + self.type_importer.import_pdb_type_into_ghidra(self.signature.class_type) self.return_type = type_importer.import_pdb_type_into_ghidra( self.signature.return_type @@ -75,17 +134,6 @@ def __init__( for (index, type_name) in enumerate(self.signature.arglist) ] - @property - def call_type(self): - return self.signature.call_type - - @property - def stack_symbols(self): - return self.signature.stack_symbols - - def get_full_name(self) -> str: - return f"{self.namespace.getName()}::{self.name}" - def matches_ghidra_function(self, ghidra_function: Function) -> bool: """Checks whether this function declaration already matches the description in Ghidra""" name_match = self.name == ghidra_function.getName(False) @@ -235,7 +283,7 @@ def overwrite_ghidra_function(self, ghidra_function: Function): ghidra_function.setName(self.name, SourceType.USER_DEFINED) ghidra_function.setParentNamespace(self.namespace) ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED) - ghidra_function.setCallingConvention(self.call_type) + ghidra_function.setCallingConvention(self.signature.call_type) if self.is_stub: logger.debug( @@ -306,7 +354,7 @@ def get_matching_stack_symbol(self, stack_offset: int) -> Optional[CppStackSymbo return next( ( symbol - for symbol in self.stack_symbols + for symbol in self.signature.stack_symbols if isinstance(symbol, CppStackSymbol) and symbol.stack_offset == stack_offset ), @@ -319,7 +367,7 @@ def get_matching_register_symbol( return next( ( symbol - for symbol in self.stack_symbols + for symbol in self.signature.stack_symbols if isinstance(symbol, CppRegisterSymbol) and symbol.register == register ), None, diff --git a/tools/ghidra_scripts/lego_util/ghidra_helper.py b/tools/ghidra_scripts/lego_util/ghidra_helper.py index f6726482..24e70c65 100644 --- a/tools/ghidra_scripts/lego_util/ghidra_helper.py +++ b/tools/ghidra_scripts/lego_util/ghidra_helper.py @@ -1,6 +1,7 @@ """A collection of helper functions for the interaction with Ghidra.""" import logging +import re from lego_util.exceptions import ( ClassOrNamespaceNotFoundInGhidraError, @@ -80,25 +81,42 @@ def create_ghidra_namespace( return namespace +# These appear in debug builds +THUNK_OF_RE = re.compile(r"^Thunk of '(.*)'$") + + def sanitize_name(name: str) -> str: """ Takes a full class or function name and replaces characters not accepted by Ghidra. - Applies mostly to templates and names like `vbase destructor`. + Applies mostly to templates, names like `vbase destructor`, and thunks in debug build. """ - new_class_name = ( + if (match := THUNK_OF_RE.fullmatch(name)) is not None: + is_thunk = True + name = match.group(1) + else: + is_thunk = False + + # Replace characters forbidden in Ghidra + new_name = ( name.replace("<", "[") .replace(">", "]") .replace("*", "#") .replace(" ", "_") .replace("`", "'") ) - if "<" in name: - new_class_name = "_template_" + new_class_name - if new_class_name != name: - logger.warning( - "Class or function name contains characters forbidden by Ghidra, changing from '%s' to '%s'", + if "<" in name: + new_name = "_template_" + new_name + + if is_thunk: + split = new_name.split("::") + split[-1] = "_thunk_" + split[-1] + new_name = "::".join(split) + + if new_name != name: + logger.info( + "Changed class or function name from '%s' to '%s' to avoid Ghidra issues", name, - new_class_name, + new_name, ) - return new_class_name + return new_name diff --git a/tools/ghidra_scripts/lego_util/headers.pyi b/tools/ghidra_scripts/lego_util/headers.pyi index 89960443..b1655437 100644 --- a/tools/ghidra_scripts/lego_util/headers.pyi +++ b/tools/ghidra_scripts/lego_util/headers.pyi @@ -1,4 +1,4 @@ -from typing import TypeVar +from typing import TypeVar, Any import ghidra # pylint: disable=invalid-name,unused-argument @@ -17,3 +17,4 @@ def getFunctionAt( def createFunction( entryPoint: ghidra.program.model.address.Address, name: str ) -> ghidra.program.model.listing.Function: ... +def getProgramFile() -> Any: ... # actually java.io.File diff --git a/tools/ghidra_scripts/lego_util/pdb_extraction.py b/tools/ghidra_scripts/lego_util/pdb_extraction.py index 4ba1ac71..a45242b4 100644 --- a/tools/ghidra_scripts/lego_util/pdb_extraction.py +++ b/tools/ghidra_scripts/lego_util/pdb_extraction.py @@ -3,6 +3,7 @@ from typing import Any, Optional import logging +from isledecomp.bin import InvalidVirtualAddressError from isledecomp.cvdump.symbols import SymbolsEntry from isledecomp.compare import Compare as IsleCompare from isledecomp.compare.db import MatchInfo @@ -43,7 +44,7 @@ class FunctionSignature: @dataclass class PdbFunction: match_info: MatchInfo - signature: FunctionSignature + signature: Optional[FunctionSignature] is_stub: bool @@ -74,9 +75,7 @@ def _get_cvdump_type(self, type_name: Optional[str]) -> Optional[dict[str, Any]] def get_func_signature(self, fn: SymbolsEntry) -> Optional[FunctionSignature]: function_type_str = fn.func_type if function_type_str == "T_NOTYPE(0000)": - logger.debug( - "Skipping a NOTYPE (synthetic or template + synthetic): %s", fn.name - ) + logger.debug("Treating NOTYPE function as thunk: %s", fn.name) return None # get corresponding function type @@ -145,8 +144,6 @@ def handle_matched_function(self, match_info: MatchInfo) -> Optional[PdbFunction assert match_info.orig_addr is not None match_options = self.compare.get_match_options(match_info.orig_addr) assert match_options is not None - if match_options.get("skip", False): - return None function_data = next( ( @@ -156,11 +153,19 @@ def handle_matched_function(self, match_info: MatchInfo) -> Optional[PdbFunction ), None, ) - if not function_data: - logger.error( - "Did not find function in nodes, skipping: %s", match_info.name - ) - return None + if function_data is None: + try: + # this can be either a thunk (which we want) or an external function + # (which we don't want), so we tell them apart based on the validity of their address. + self.compare.orig_bin.get_relative_addr(match_info.orig_addr) + return PdbFunction(match_info, None, False) + except InvalidVirtualAddressError: + logger.debug( + "Skipping external function %s (address 0x%x not in original binary)", + match_info.name, + match_info.orig_addr, + ) + return None function_symbol = function_data.symbol_entry if function_symbol is None: @@ -171,8 +176,6 @@ def handle_matched_function(self, match_info: MatchInfo) -> Optional[PdbFunction return None function_signature = self.get_func_signature(function_symbol) - if function_signature is None: - return None is_stub = match_options.get("stub", False) diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py index 3cf827e9..8a1fc153 100644 --- a/tools/isledecomp/isledecomp/compare/core.py +++ b/tools/isledecomp/isledecomp/compare/core.py @@ -479,14 +479,31 @@ def _find_vtordisp(self): construct the name of the vtordisp function and match based on that.""" for match in self._db.get_matches_by_type(SymbolType.VTABLE): + assert ( + match.name is not None + and match.orig_addr is not None + and match.recomp_addr is not None + and match.size is not None + ) # We need some method of identifying vtables that # might have thunks, and this ought to work okay. if "{for" not in match.name: continue + next_orig = self._db.get_next_orig_addr(match.orig_addr) + assert next_orig is not None + orig_upper_size_limit = next_orig - match.orig_addr + if orig_upper_size_limit < match.size: + # This could happen in debug builds due to code changes between BETA10 and LEGO1, + # but we have not seen it yet as of 2024-08-28. + logger.warning( + "Recomp vtable is larger than orig vtable for %s", + match.name, + ) + # TODO: We might want to fix this at the source (cvdump) instead. # Any problem will be logged later when we compare the vtable. - vtable_size = 4 * (match.size // 4) + vtable_size = 4 * (min(match.size, orig_upper_size_limit) // 4) orig_table = self.orig_bin.read(match.orig_addr, vtable_size) recomp_table = self.recomp_bin.read(match.recomp_addr, vtable_size) @@ -497,51 +514,65 @@ def _find_vtordisp(self): # Now walk both vtables looking for thunks. for orig_addr, recomp_addr in raw_addrs: - if not self._db.is_vtordisp(recomp_addr): - continue + if orig_addr == 0: + # This happens in debug builds due to code changes between BETA10 and LEGO1. + # Note that there is a risk of running into the next vtable if there is no gap in between, + # which we cannot protect against at the moment. + logger.warning( + "Recomp vtable is larger than orig vtable for %s", match.name + ) + break - thunk_fn = self.get_by_recomp(recomp_addr) + if self._db.is_vtordisp(recomp_addr): + self._match_vtordisp_in_vtable(orig_addr, recomp_addr) - # Read the function bytes here. - # In practice, the adjuster thunk will be under 16 bytes. - # If we have thunks of unequal size, we can still tell whether - # they are thunking the same function by grabbing the - # JMP instruction at the end. - thunk_presumed_size = max(thunk_fn.size, 16) + def _match_vtordisp_in_vtable(self, orig_addr, recomp_addr): + thunk_fn = self.get_by_recomp(recomp_addr) + assert thunk_fn is not None + assert thunk_fn.size is not None - # Strip off MSVC padding 0xcc bytes. - # This should be safe to do; it is highly unlikely that - # the MSB of the jump displacement would be 0xcc. (huge jump) - orig_thunk_bin = self.orig_bin.read( - orig_addr, thunk_presumed_size - ).rstrip(b"\xcc") + # Read the function bytes here. + # In practice, the adjuster thunk will be under 16 bytes. + # If we have thunks of unequal size, we can still tell whether they are thunking + # the same function by grabbing the JMP instruction at the end. + thunk_presumed_size = max(thunk_fn.size, 16) - recomp_thunk_bin = self.recomp_bin.read( - recomp_addr, thunk_presumed_size - ).rstrip(b"\xcc") + # Strip off MSVC padding 0xcc bytes. + # This should be safe to do; it is highly unlikely that + # the MSB of the jump displacement would be 0xcc. (huge jump) + orig_thunk_bin = self.orig_bin.read(orig_addr, thunk_presumed_size).rstrip( + b"\xcc" + ) - # Read jump opcode and displacement (last 5 bytes) - (orig_jmp, orig_disp) = struct.unpack(" bool: if "`vtordisp" in name: return True + if decorated_name is None: + # happens in debug builds, e.g. for "Thunk of 'LegoAnimActor::ClassName'" + return False + new_name = get_vtordisp_name(decorated_name) if new_name is None: return False diff --git a/util/decomp.h b/util/decomp.h index 81cb1d13..673df553 100644 --- a/util/decomp.h +++ b/util/decomp.h @@ -1,6 +1,12 @@ #ifndef DECOMP_H #define DECOMP_H +#ifndef NDEBUG +// Disable size assertions for debug builds because the sizes differ between debug and release builds. +// The release LEGO1.DLL is what we ultimately want to decompile, so this is what we assert against. +#undef ENABLE_DECOMP_ASSERTS +#endif + #if defined(ENABLE_DECOMP_ASSERTS) #define DECOMP_STATIC_ASSERT(V) \ namespace \