Decomp parser: indirect globals and string markers (#446)

* Enable string annotations and indirect globals

* Adding some STRING annotations

* Library functions
This commit is contained in:
MS 2024-01-17 06:56:34 -05:00 committed by GitHub
parent 6af0c6cb1a
commit aaa18bc9e2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 738 additions and 93 deletions

63
ISLE/library_msvc.h Normal file
View file

@ -0,0 +1,63 @@
// Annotation-only block: everything between `#if 0` and `#endif` is excluded
// from compilation. It holds LIBRARY/GLOBAL markers, presumably for the decomp
// tooling to read -- confirm. (`#ifdef` requires an identifier, so the
// always-false guard has to be spelled `#if 0`.)
#if 0
// For ISLE symbols only
// aka `operator new`
// LIBRARY: ISLE 0x402f80
// ??2@YAPAXI@Z
// aka `operator delete`
// LIBRARY: ISLE 0x402fa0
// ??3@YAXPAX@Z
// LIBRARY: ISLE 0x406dd0
// _malloc
// LIBRARY: ISLE 0x406f00
// _free
// LIBRARY: ISLE 0x407ec0
// ___CxxFrameHandler
// LIBRARY: ISLE 0x4081e0
// _srand
// LIBRARY: ISLE 0x4081f0
// _rand
// LIBRARY: ISLE 0x408220
// _atol
// LIBRARY: ISLE 0x4082d0
// _atoi
// LIBRARY: ISLE 0x4084c0
// ?_query_new_handler@@YAP6AHI@ZXZ
// LIBRARY: ISLE 0x4084d0
// ?_query_new_mode@@YAHXZ
// LIBRARY: ISLE 0x4085c0
// _sprintf
// LIBRARY: ISLE 0x408630
// _abort
// LIBRARY: ISLE 0x409110
// __mtinit
// LIBRARY: ISLE 0x409190
// __getptd
// GLOBAL: ISLE 0x4108e8
// __osver
// GLOBAL: ISLE 0x4108f0
// __winmajor
// GLOBAL: ISLE 0x4108f4
// __winminor
// GLOBAL: ISLE 0x410d50
// __newmode
#endif

312
ISLE/library_smartheap.h Normal file
View file

@ -0,0 +1,312 @@
// Annotation-only block: everything between `#if 0` and `#endif` is excluded
// from compilation. It holds LIBRARY/GLOBAL markers, presumably for the decomp
// tooling to read -- confirm. (`#ifdef` requires an identifier, so the
// always-false guard has to be spelled `#if 0`.)
#if 0
// LIBRARY: ISLE 0x402f10
// ?shi_New@@YAPAXKIPAU_SHI_Pool@@@Z
// LIBRARY: ISLE 0x402fb0
// _MemInitDefaultPool@0
// LIBRARY: ISLE 0x403020
// _shi_call_new_handler_msc
// LIBRARY: ISLE 0x403050
// _MemPoolShrink@4
// LIBRARY: ISLE 0x403180
// _MemPoolPreAllocate@12
// LIBRARY: ISLE 0x403300
// @_shi_initPageHeaders@4
// LIBRARY: ISLE 0x403570
// @shi_allocPageHeader@4
// LIBRARY: ISLE 0x4035a0
// @shi_freePageHeader@8
// LIBRARY: ISLE 0x403750
// @_shi_deletePage@8
// LIBRARY: ISLE 0x403830
// @_shi_allocExternal@12
// LIBRARY: ISLE 0x403a50
// @_shi_initPageVariable@8
// LIBRARY: ISLE 0x403b00
// _MemAllocPtr@12
// LIBRARY: ISLE 0x403d60
// @_shi_allocVar@12
// LIBRARY: ISLE 0x403ef0
// @_shi_allocBlock@12
// LIBRARY: ISLE 0x4040c0
// _MemFreePtr@4
// LIBRARY: ISLE 0x404170
// @_shi_freeVar@4
// LIBRARY: ISLE 0x404260
// _MemReAllocPtr@12
// LIBRARY: ISLE 0x4043b0
// @_shi_resizeAny@16
// LIBRARY: ISLE 0x404650
// @_shi_resizeVar@8
// LIBRARY: ISLE 0x404820
// _MemSizePtr@4
// LIBRARY: ISLE 0x4048d0
// @shi_findAllocAddress@4
// LIBRARY: ISLE 0x404910
// @_shi_sysAlloc@8
// LIBRARY: ISLE 0x4049a0
// @_shi_sysFree@4
// LIBRARY: ISLE 0x404a00
// @_shi_sysRealloc@12
// LIBRARY: ISLE 0x404ab0
// @_shi_sysResize@12
// LIBRARY: ISLE 0x404b90
// @_shi_sysSize@4
// LIBRARY: ISLE 0x404bd0
// @_shi_sysAllocNear@4
// LIBRARY: ISLE 0x404bf0
// @_shi_sysFreeNear@4
// LIBRARY: ISLE 0x404c10
// @_shi_sysValidatePtr@12
// LIBRARY: ISLE 0x404d10
// @_shi_sysValidateFunction@4
// LIBRARY: ISLE 0x405300
// @_shi_sysAllocPool@12
// LIBRARY: ISLE 0x405520
// @_shi_sysResizePool@16
// LIBRARY: ISLE 0x405690
// @_shi_sysFreePage@4
// LIBRARY: ISLE 0x4057b0
// @_shi_sysSizePage@4
// LIBRARY: ISLE 0x4057e0
// @_shi_sysSizePool@8
// LIBRARY: ISLE 0x405800
// @_shi_registerShared@16
// LIBRARY: ISLE 0x405a00
// @_shi_unregisterShared@8
// LIBRARY: ISLE 0x405b20
// @_shi_getNextPool@4
// LIBRARY: ISLE 0x405b30
// @shi_delNextPool@4
// LIBRARY: ISLE 0x405d30
// @shi_createAndEnterMutexShr@12
// LIBRARY: ISLE 0x405e20
// @shi_termPoolMutexShr@4
// LIBRARY: ISLE 0x405e40
// @shi_enterPoolMutexShr@4
// LIBRARY: ISLE 0x405e60
// @shi_leavePoolMutexShr@4
// LIBRARY: ISLE 0x405e80
// __shi_enterCriticalSection@0
// LIBRARY: ISLE 0x405ea0
// __shi_leaveCriticalSection@0
// LIBRARY: ISLE 0x405ec0
// __shi_createAndEnterMutex
// LIBRARY: ISLE 0x405ef0
// _shi_enterPoolMutexSafely
// LIBRARY: ISLE 0x405fd0
// _shi_enterPoolInitMutexReader
// LIBRARY: ISLE 0x406060
// _shi_leavePoolInitMutexReader
// LIBRARY: ISLE 0x406090
// _shi_enterPoolInitMutexWriter
// LIBRARY: ISLE 0x406160
// _shi_leavePoolInitMutexWriter
// LIBRARY: ISLE 0x406180
// _shi_isNT
// LIBRARY: ISLE 0x4061b0
// _MemPoolInit@4
// LIBRARY: ISLE 0x406520
// _MemPoolSetPageSize@8
// LIBRARY: ISLE 0x406630
// _MemPoolSetBlockSizeFS@8
// LIBRARY: ISLE 0x406710
// @_shi_poolFree@8
// LIBRARY: ISLE 0x4068c0
// @_shi_invokeErrorHandler1@8
// LIBRARY: ISLE 0x406be0
// _MemErrorUnwind@0
// LIBRARY: ISLE 0x406c30
// _MemDefaultErrorHandler@4
// LIBRARY: ISLE 0x406cb0
// @_shi_taskRemovePool@4
// LIBRARY: ISLE 0x406d50
// @_shi_getCurrentThreadContext@8
// LIBRARY: ISLE 0x406db0
// @_shi_deleteThreadContext@8
// LIBRARY: ISLE 0x406e40
// _calloc
// LIBRARY: ISLE 0x406ea0
// _realloc
// LIBRARY: ISLE 0x406f10
// __expand
// LIBRARY: ISLE 0x406f50
// __heapadd
// LIBRARY: ISLE 0x406f60
// __heapwalk
// LIBRARY: ISLE 0x406ff0
// __heapused
// LIBRARY: ISLE 0x407020
// __heapmin
// LIBRARY: ISLE 0x407040
// __msize
// LIBRARY: ISLE 0x407050
// __heapchk
// LIBRARY: ISLE 0x407080
// __heapset
// LIBRARY: ISLE 0x407090
// @_shi_sysReportError@16
// LIBRARY: ISLE 0x407110
// _MemPoolSize@4
// LIBRARY: ISLE 0x4071a0
// _MemPoolWalk@8
// LIBRARY: ISLE 0x407240
// @_shi_walkPool@16
// LIBRARY: ISLE 0x407540
// @shi_isBlockInUseSmall@8
// LIBRARY: ISLE 0x407800
// @_shi_isBlockInUseFS@12
// LIBRARY: ISLE 0x407880
// _MemPoolCheck@4
// LIBRARY: ISLE 0x407b20
// _MemCheckPtr@8
// LIBRARY: ISLE 0x4084e0
// __except_handler3
// GLOBAL: ISLE 0x40f0a0
// _szLibName
// GLOBAL: ISLE 0x4102f4
// ?_new_handler@@3P6AXXZA
// GLOBAL: ISLE 0x4102fc
// _MemDefaultPool
// GLOBAL: ISLE 0x41031c
// __shi_compactPoolFn
// GLOBAL: ISLE 0x410320
// __shi_compactPageFn
// GLOBAL: ISLE 0x410324
// _MemDefaultPoolFlags
// GLOBAL: ISLE 0x41032c
// __shi_mutexGlobalInit
// GLOBAL: ISLE 0x410330
// __shi_mutexMovInit
// GLOBAL: ISLE 0x410334
// __shi_mutexMovLockCount
// GLOBAL: ISLE 0x410338
// _shi_initPoolReaders
// GLOBAL: ISLE 0x41033c
// _shi_eventInitPool
// GLOBAL: ISLE 0x410340
// _shi_mutexMovShr
// GLOBAL: ISLE 0x410368
// _shi_deferFreePools
// GLOBAL: ISLE 0x410378
// __shi_poolTerminating
// GLOBAL: ISLE 0x41037c
// _MemDefaultPoolBlockSizeFS
// GLOBAL: ISLE 0x410380
// _MemDefaultPoolPageSize
// GLOBAL: ISLE 0x410384
// _SmartHeap_malloc
// GLOBAL: ISLE 0x4105b0
// __shi_TaskRecord
// GLOBAL: ISLE 0x4125f8
// ?_pnhHeap@@3P6AHI@ZA
// GLOBAL: ISLE 0x412830
// __shi_mutexMov
// GLOBAL: ISLE 0x412850
// _shi_mutexPoolSynch
// GLOBAL: ISLE 0x412870
// __shi_mutexGlobal
#endif

View file

@ -11,19 +11,25 @@ MxS32 g_mxcoreCount[101] = {0, -6643, -5643, -5058, -4643, -4321, -4058, -38
-136, -120, -104, -89, -74, -58, -43, -29, -14, 0};
// GLOBAL: LEGO1 0x10102048
// STRING: LEGO1 0x10102040
const char* g_strACTION = "ACTION";
// GLOBAL: LEGO1 0x1010209c
// STRING: LEGO1 0x10101f58
const char* g_strOBJECT = "OBJECT";
// GLOBAL: LEGO1 0x101020b0
// STRING: LEGO1 0x10101f20
const char* g_strSOUND = "SOUND";
// GLOBAL: LEGO1 0x101020cc
// STRING: LEGO1 0x100f3808
const char* g_strVISIBILITY = "VISIBILITY";
// GLOBAL: LEGO1 0x101020d0
// STRING: LEGO1 0x10101edc
const char* g_strWORLD = "WORLD";
// GLOBAL: LEGO1 0x101020e4
// STRING: LEGO1 0x10101eac
const char* g_parseExtraTokens = ":;";

View file

@ -8,12 +8,15 @@
DECOMP_SIZE_ASSERT(LegoBackgroundColor, 0x30)
// GLOBAL: LEGO1 0x100f3fb0
const char* g_delimiter = "\t";
// STRING: LEGO1 0x100f3a18
const char* g_delimiter = " \t";
// GLOBAL: LEGO1 0x100f3fb4
// STRING: LEGO1 0x100f3bf0
const char* g_set = "set";
// GLOBAL: LEGO1 0x100f3fb8
// STRING: LEGO1 0x100f0cdc
const char* g_reset = "reset";
// FUNCTION: LEGO1 0x1003bfb0

View file

@ -8,9 +8,11 @@
DECOMP_SIZE_ASSERT(LegoFullScreenMovie, 0x24)
// GLOBAL: LEGO1 0x100f3fbc
// STRING: LEGO1 0x100f3be8
const char* g_strEnable = "enable";
// GLOBAL: LEGO1 0x100f3fc0
// STRING: LEGO1 0x100f3bf4
const char* g_strDisable = "disable";
// FUNCTION: LEGO1 0x1003c500

View file

@ -20,12 +20,15 @@
DECOMP_SIZE_ASSERT(LegoGameState, 0x430)
// GLOBAL: LEGO1 0x100f3e40
// STRING: LEGO1 0x100f3e3c
const char* g_fileExtensionGS = ".GS";
// GLOBAL: LEGO1 0x100f3e44
// STRING: LEGO1 0x100f3e30
const char* g_playersGSI = "Players.gsi";
// GLOBAL: LEGO1 0x100f3e48
// STRING: LEGO1 0x100f3e24
const char* g_historyGSI = "History.gsi";
// GLOBAL: LEGO1 0x100f3e58

View file

@ -10,6 +10,7 @@
// the text "END_OF_VARIABLES" in it.
// TODO: make g_endOfVariables reference the actual end of the variable array.
// GLOBAL: LEGO1 0x100f3e50
// STRING: LEGO1 0x100f3e00
const char* g_endOfVariables = "END_OF_VARIABLES";
// Very likely but not certain sizes.

View file

@ -110,6 +110,7 @@ MxAtomId* g_creditsScript = NULL;
MxAtomId* g_nocdSourceName = NULL;
// GLOBAL: LEGO1 0x100f6718
// STRING: LEGO1 0x100f6710
const char* g_current = "current";
// GLOBAL: LEGO1 0x100f4c58

View file

@ -7,24 +7,31 @@
DECOMP_SIZE_ASSERT(LegoMeterPresenter, 0x94)
// GLOBAL: LEGO1 0x1010207c
// STRING: LEGO1 0x10101fb4
const char* g_filterIndex = "FILTER_INDEX";
// GLOBAL: LEGO1 0x10102094
// STRING: LEGO1 0x10101f70
const char* g_type = "TYPE";
// GLOBAL: LEGO1 0x10102088
// STRING: LEGO1 0x10101f94
const char* g_leftToRight = "LEFT_TO_RIGHT";
// GLOBAL: LEGO1 0x101020ac
// STRING: LEGO1 0x10101f28
const char* g_rightToLeft = "RIGHT_TO_LEFT";
// GLOBAL: LEGO1 0x1010205c
// STRING: LEGO1 0x10102000
const char* g_bottomToTop = "BOTTOM_TO_TOP";
// GLOBAL: LEGO1 0x101020c0
// STRING: LEGO1 0x10101f00
const char* g_topToBottom = "TOP_TO_BOTTOM";
// GLOBAL: LEGO1 0x101020c8
// STRING: LEGO1 0x10101ee4
const char* g_variable = "VARIABLE";
// FUNCTION: LEGO1 0x10043430

View file

@ -1,39 +1,32 @@
#ifdef 0
// For LEGO1 symbols only
// aka `operator new`
// LIBRARY: ISLE 0x402f80
// LIBRARY: LEGO1 0x10086240
// ??2@YAPAXI@Z
// aka `operator delete`
// LIBRARY: ISLE 0x402fa0
// LIBRARY: LEGO1 0x10086260
// ??3@YAXPAX@Z
// LIBRARY: ISLE 0x406dd0
// LIBRARY: LEGO1 0x1008a090
// _malloc
// LIBRARY: ISLE 0x406f00
// LIBRARY: LEGO1 0x1008a1c0
// _free
// LIBRARY: ISLE 0x407ec0
// LIBRARY: LEGO1 0x1008b020
// ___CxxFrameHandler
// LIBRARY: ISLE 0x408220
// LIBRARY: LEGO1 0x1008b400
// _atol
// LIBRARY: ISLE 0x4082d0
// LIBRARY: LEGO1 0x1008b4b0
// _atoi
// LIBRARY: LEGO1 0x1008b4c0
// _strtok
// LIBRARY: ISLE 0x4085c0
// LIBRARY: LEGO1 0x1008b5a0
// _sprintf
@ -43,6 +36,9 @@
// LIBRARY: LEGO1 0x1008b630
// _srand
// LIBRARY: LEGO1 0x1008b640
// _rand
// LIBRARY: LEGO1 0x1008b680
// _strncmp
@ -91,19 +87,6 @@
// LIBRARY: LEGO1 0x10097b10
// _strchr
// LIBRARY: ISLE 0x4081e0
// _srand
// LIBRARY: ISLE 0x4081f0
// LIBRARY: LEGO1 0x1008b640
// _rand
// LIBRARY: ISLE 0x409110
// __mtinit
// LIBRARY: ISLE 0x409190
// __getptd
// LIBRARY: LEGO1 0x100d1ed0
// _strnicmp

View file

@ -50,6 +50,8 @@ void MxVariableTable::SetVariable(MxVariable* p_var)
// FUNCTION: LEGO1 0x100b78f0
const char* MxVariableTable::GetVariable(const char* p_key)
{
// STRING: ISLE 0x41008c
// STRING: LEGO1 0x100f01d4
const char* value = "";
MxHashTableCursor<MxVariable*> cursor(this);
MxVariable* var = new MxVariable(p_key);

View file

@ -11,6 +11,7 @@
DECOMP_SIZE_ASSERT(MxStillPresenter, 0x6c);
// GLOBAL: LEGO1 0x101020e0
// STRING: LEGO1 0x10101eb0
const char* g_strBmpIsmap = "BMP_ISMAP";
// FUNCTION: LEGO1 0x100b9c70

View file

@ -85,13 +85,19 @@ def _load_cvdump(self):
if sym.node_type == SymbolType.STRING:
string_info = demangle_string_const(sym.decorated_name)
if string_info is None:
logger.debug(
"Could not demangle string symbol: %s", sym.decorated_name
)
continue
# TODO: skip unicode for now. will need to handle these differently.
if string_info.is_utf16:
continue
raw = self.recomp_bin.read(addr, sym.size())
try:
sym.friendly_name = raw.decode("latin1")
sym.friendly_name = raw.decode("latin1").rstrip("\x00")
except UnicodeDecodeError:
pass
@ -134,6 +140,26 @@ def _load_markers(self):
for tbl in codebase.iter_vtables():
self._db.match_vtable(tbl.offset, tbl.name)
for string in codebase.iter_strings():
# Not that we don't trust you, but we're checking the string
# annotation to make sure it is accurate.
try:
# TODO: would presumably fail for wchar_t strings
orig = self.orig_bin.read_string(string.offset).decode("latin1")
string_correct = string.name == orig
except UnicodeDecodeError:
string_correct = False
if not string_correct:
logger.error(
"Data at 0x%x does not match string %s",
string.offset,
repr(string.name),
)
continue
self._db.match_string(string.offset, string.name)
def _find_original_strings(self):
"""Go to the original binary and look for the specified string constants
to find a match. This is a (relatively) expensive operation so we only

View file

@ -43,7 +43,8 @@ def match_name(self) -> str:
return None
ctype = self.compare_type.name if self.compare_type is not None else "UNK"
return f"{self.name} ({ctype})"
name = repr(self.name) if ctype == "STRING" else self.name
return f"{name} ({ctype})"
def matchinfo_factory(_, row):
@ -197,3 +198,5 @@ def match_string(self, addr: int, value: str) -> bool:
if not did_match:
escaped = repr(value)
logger.error("Failed to find string: %s", escaped)
return did_match

View file

@ -94,7 +94,11 @@ def set_decorated(self, name: str):
def name(self) -> Optional[str]:
"""Prefer "friendly" name if we have it.
This is what we have been using to match functions."""
return self.friendly_name or self.decorated_name
return (
self.friendly_name
if self.friendly_name is not None
else self.decorated_name
)
def size(self) -> Optional[int]:
if self.confirmed_size is not None:

View file

@ -4,6 +4,7 @@
"""
import re
from collections import namedtuple
from typing import Optional
class InvalidEncodedNumberError(Exception):
@ -30,13 +31,12 @@ def parse_encoded_number(string: str) -> int:
StringConstInfo = namedtuple("StringConstInfo", "len is_utf16")
def demangle_string_const(symbol: str) -> StringConstInfo:
def demangle_string_const(symbol: str) -> Optional[StringConstInfo]:
"""Don't bother to decode the string text from the symbol.
We can just read it from the binary once we have the length."""
match = string_const_regex.match(symbol)
if match is None:
# See below
return StringConstInfo(0, False)
return None
try:
strlen = (
@ -45,10 +45,7 @@ def demangle_string_const(symbol: str) -> StringConstInfo:
else int(match.group("len"))
)
except (ValueError, InvalidEncodedNumberError):
# This would be an annoying error to fail on if we get a bad symbol.
# For now, just assume a zero length string because this will probably
# raise some eyebrows during the comparison.
strlen = 0
return None
is_utf16 = match.group("is_utf16") == "1"
return StringConstInfo(len=strlen, is_utf16=is_utf16)

View file

@ -6,6 +6,7 @@
ParserFunction,
ParserVtable,
ParserVariable,
ParserString,
)
@ -42,3 +43,6 @@ def iter_vtables(self) -> Iterator[ParserVtable]:
def iter_variables(self) -> Iterator[ParserVariable]:
    """Lazily yield every collected symbol that is a ParserVariable."""
    return (sym for sym in self._symbols if isinstance(sym, ParserVariable))
def iter_strings(self) -> Iterator[ParserString]:
    """Lazily yield every collected symbol that is a ParserString."""
    return (sym for sym in self._symbols if isinstance(sym, ParserString))

View file

@ -70,6 +70,10 @@ class ParserError(Enum):
# a comment -- i.e. VTABLE or GLOBAL -- could not extract the name
NO_SUITABLE_NAME = 204
# ERROR: Two STRING markers have the same module and offset, but the strings
# they annotate are different.
WRONG_STRING = 205
@dataclass
class ParserAlert:

View file

@ -1,7 +1,7 @@
from typing import List, Optional
from .parser import DecompParser
from .error import ParserAlert, ParserError
from .node import ParserSymbol
from .node import ParserSymbol, ParserString
def get_checkorder_filter(module):
@ -19,6 +19,9 @@ def __init__(self) -> None:
# This is _not_ reset between files and is intended to report offset reuse
# when scanning the entire directory.
self._offsets_used = set()
# Keep track of strings we have seen. Persists across files.
# Module/offset can be repeated for string markers but the strings must match.
self._strings = {}
def reset(self, full_reset: bool = False):
self.alerts = []
@ -28,6 +31,7 @@ def reset(self, full_reset: bool = False):
if full_reset:
self._offsets_used.clear()
self._strings = {}
def file_is_header(self):
return self._filename.lower().endswith(".h")
def _load_offsets_from_list(self, marker_list: List[ParserSymbol]):
    """Helper for loading (module, offset) tuples while the DecompParser
    has them broken up into several different lists.

    Every (module, offset) pair is recorded in `self._offsets_used`. When a
    pair shows up again:

    * a STRING marker repeating an earlier STRING marker is accepted if the
      string text is identical, and raises WRONG_STRING if it differs;
    * anything else raises DUPLICATE_OFFSET. This includes a STRING marker
      reusing an offset that was first claimed by a non-string marker, a case
      that previously crashed with KeyError on the `self._strings` lookup.

    Alerts are appended to `self.alerts`; nothing is returned."""
    for marker in marker_list:
        is_string = isinstance(marker, ParserString)
        value = (marker.module, marker.offset)

        if value not in self._offsets_used:
            # First sighting: remember the offset, and the text for strings.
            self._offsets_used.add(value)
            if is_string:
                self._strings[value] = marker.name
            continue

        if is_string and value in self._strings:
            # Repeated STRING marker: only a problem if the text changed.
            if self._strings[value] != marker.name:
                self.alerts.append(
                    ParserAlert(
                        code=ParserError.WRONG_STRING,
                        line_number=marker.line_number,
                        line=f"0x{marker.offset:08x}, {repr(self._strings[value])} vs. {repr(marker.name)}",
                    )
                )
            continue

        # The offset was already claimed and this is not a string repeating
        # a string, so it is plain offset reuse.
        self.alerts.append(
            ParserAlert(
                code=ParserError.DUPLICATE_OFFSET,
                line_number=marker.line_number,
                line=f"0x{marker.offset:08x}",
            )
        )
def _check_function_order(self):
"""Rules:
@ -82,6 +100,7 @@ def _check_offset_uniqueness(self):
self._load_offsets_from_list(self._parser.functions)
self._load_offsets_from_list(self._parser.vtables)
self._load_offsets_from_list(self._parser.variables)
self._load_offsets_from_list(self._parser.strings)
def _check_byname_allowed(self):
if self.file_is_header():

View file

@ -3,6 +3,19 @@
from enum import Enum
class MarkerCategory(Enum):
    """Coarse grouping for the MarkerType values.

    Several marker types can describe the same kind of thing, so each one is
    folded into one of these buckets when markers need to be grouped. Only the
    function-like types really need folding; the other buckets are here so the
    scheme can grow in a structured way."""

    FUNCTION = 1
    VARIABLE = 2
    STRING = 3
    VTABLE = 4
    # Catch-all bucket: no comparison is required or possible.
    ADDRESS = 100
class MarkerType(Enum):
UNKNOWN = -100
FUNCTION = 1
@ -51,6 +64,23 @@ def module(self) -> str:
def offset(self) -> int:
return self._offset
@property
def category(self) -> MarkerCategory:
    """Fold this marker's type down to its MarkerCategory.

    The checks run in a fixed order: vtable, variable, string, then the two
    function-like kinds. Anything that matches none of them is ADDRESS."""
    if self.is_vtable():
        result = MarkerCategory.VTABLE
    elif self.is_variable():
        result = MarkerCategory.VARIABLE
    elif self.is_string():
        result = MarkerCategory.STRING
    elif self.is_regular_function() or self.is_explicit_byname():
        # TODO: worth another look if we add more types, but this covers it
        result = MarkerCategory.FUNCTION
    else:
        result = MarkerCategory.ADDRESS

    return result
def is_regular_function(self) -> bool:
"""Regular function, meaning: not an explicit byname lookup. FUNCTION
markers can be _implicit_ byname.

View file

@ -55,3 +55,8 @@ class ParserVariable(ParserSymbol):
@dataclass
class ParserVtable(ParserSymbol):
pass
@dataclass
class ParserString(ParserSymbol):
    """Symbol produced for a STRING marker.

    Adds no fields of its own to ParserSymbol; it exists so string symbols
    can be told apart with isinstance()."""

View file

@ -3,11 +3,11 @@
from typing import List, Iterable, Iterator, Optional
from enum import Enum
from .util import (
is_blank_or_comment,
get_class_name,
get_variable_name,
get_synthetic_name,
remove_trailing_comment,
get_string_contents,
)
from .marker import (
DecompMarker,
@ -19,6 +19,7 @@
ParserFunction,
ParserVariable,
ParserVtable,
ParserString,
)
from .error import ParserAlert, ParserError
@ -43,17 +44,16 @@ def __init__(self) -> None:
def insert(self, marker: DecompMarker) -> bool:
"""Return True if this insert would overwrite"""
module = marker.module
if module in self.markers:
key = (marker.category, marker.module)
if key in self.markers:
return True
# TODO: type converted back to string version here instead of using enum
self.markers[module] = (marker.type.name, marker.offset)
self.markers[key] = marker
return False
def iter(self) -> Iterator[DecompMarker]:
for module, (marker_type, offset) in self.markers.items():
yield DecompMarker(marker_type, module, offset)
for _, marker in self.markers.items():
yield marker
def empty(self):
self.markers = {}
@ -111,17 +111,21 @@ def reset(self):
self.function_sig = ""
@property
def functions(self) -> List[ParserSymbol]:
def functions(self) -> List[ParserFunction]:
return [s for s in self._symbols if isinstance(s, ParserFunction)]
@property
def vtables(self) -> List[ParserSymbol]:
def vtables(self) -> List[ParserVtable]:
return [s for s in self._symbols if isinstance(s, ParserVtable)]
@property
def variables(self) -> List[ParserSymbol]:
def variables(self) -> List[ParserVariable]:
return [s for s in self._symbols if isinstance(s, ParserVariable)]
@property
def strings(self) -> List[ParserString]:
    """All symbols parsed so far that are ParserString instances."""
    return list(filter(lambda sym: isinstance(sym, ParserString), self._symbols))
def iter_symbols(self, module: Optional[str] = None) -> Iterator[ParserSymbol]:
for s in self._symbols:
if module is None or s.module == module:
@ -225,21 +229,35 @@ def _variable_marker(self, marker: DecompMarker):
else:
self.state = ReaderState.IN_GLOBAL
def _variable_done(self, name: str):
if not name.startswith("g_"):
self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)
def _variable_done(
self, variable_name: Optional[str] = None, string_value: Optional[str] = None
):
if variable_name is None and string_value is None:
self._syntax_error(ParserError.NO_SUITABLE_NAME)
return
for marker in self.var_markers.iter():
self._symbols.append(
ParserVariable(
type=marker.type,
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
name=name,
is_static=self.state == ReaderState.IN_FUNC_GLOBAL,
if marker.is_string():
self._symbols.append(
ParserString(
type=marker.type,
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
name=string_value,
)
)
else:
self._symbols.append(
ParserVariable(
type=marker.type,
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
name=variable_name,
is_static=self.state == ReaderState.IN_FUNC_GLOBAL,
)
)
)
self.var_markers.empty()
if self.state == ReaderState.IN_FUNC_GLOBAL:
@ -298,20 +316,8 @@ def _handle_marker(self, marker: DecompMarker):
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif marker.is_string():
# TODO: We are ignoring string markers for the moment.
# We already have a lot of them in the codebase, though, so we'll
# hang onto them for now in case we can use them later.
# To match up string constants, the strategy will be:
# 1. Use cvdump to find all string constants in the recomp
# 2. In the original binary, look at relocated vaddrs from .rdata
# 3. Try to match up string data from #1 with locations in #2
# Throw the syntax error we would throw if we were parsing these
if self.state not in (ReaderState.SEARCH, ReaderState.IN_FUNC):
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif marker.is_variable():
# Strings and variables are almost the same thing
elif marker.is_string() or marker.is_variable():
if self.state in (
ReaderState.SEARCH,
ReaderState.IN_GLOBAL,
@ -418,24 +424,39 @@ def read_line(self, line: str):
# function we have already parsed if state == IN_FUNC_GLOBAL.
# However, we are not tolerant of _any_ syntax problems in our
# CI actions, so the solution is to just fix the invalid marker.
if is_blank_or_comment(line):
self._syntax_error(ParserError.NO_SUITABLE_NAME)
variable_name = None
global_markers_queued = any(
m.is_variable() for m in self.var_markers.iter()
)
if len(line_strip) == 0:
self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)
return
# We don't have a foolproof mechanism to tell what is and is not a variable.
# If the GLOBAL is being declared on a `return` statement, though, this is
# not correct. It is either a string literal (which will be handled differently)
# or it is not the variable declaration, which is incorrect decomp syntax.
if line.strip().startswith("return"):
self._syntax_error(ParserError.GLOBAL_NOT_VARIABLE)
return
if global_markers_queued:
# Not the greatest solution, but a consequence of combining GLOBAL and
# STRING markers together. If the marker precedes a return statement, it is
# valid for a STRING marker to be here, but not a GLOBAL. We need to look
# ahead and tell whether this *would* fail.
if line_strip.startswith("return"):
self._syntax_error(ParserError.GLOBAL_NOT_VARIABLE)
return
if line_strip.startswith("//"):
# If we found a comment, assume implicit lookup-by-name
# function and end here. We know this is not a decomp marker
# because it would have been handled already.
variable_name = get_synthetic_name(line)
else:
variable_name = get_variable_name(line)
# This is out of our control for library variables, but all of our
# variables should start with "g_".
if variable_name is not None and not variable_name.startswith("g_"):
self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)
name = get_variable_name(line)
if name is None:
self._syntax_error(ParserError.NO_SUITABLE_NAME)
return
string_name = get_string_contents(line)
self._variable_done(name)
self._variable_done(variable_name, string_name)
elif self.state == ReaderState.IN_VTABLE:
vtable_class = get_class_name(line)

View file

@ -1,6 +1,7 @@
# C++ Parser utility functions and data structures
import re
from typing import Optional
from ast import literal_eval
# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
@ -12,6 +13,10 @@
trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")
# Get string contents, ignore escape characters that might interfere
doubleQuoteRegex = re.compile(r"(\"(?:[^\"\\]|\\.)*\")")
def get_synthetic_name(line: str) -> Optional[str]:
"""Synthetic names appear on a single line comment on the line after the marker.
If that's not what we have, return None"""
@ -86,3 +91,20 @@ def get_variable_name(line: str) -> Optional[str]:
return match.group("name")
return None
def get_string_contents(line: str) -> Optional[str]:
    """Return the unescaped text of the first double-quoted string on `line`.

    The quoted span is located with doubleQuoteRegex and then unescaped by
    handing it to ast.literal_eval, i.e. it is treated as a Python string
    literal. That is a convenience, not an exact model of C escape rules.
    Returns None when the line has no quoted string or when anything at all
    goes wrong while evaluating it."""
    # pylint: disable=broad-exception-caught
    # No way to predict what kind of exception could occur.
    try:
        found = doubleQuoteRegex.search(line)
        contents = None if found is None else literal_eval(found.group(1))
    except Exception:
        contents = None

    return contents

View file

@ -14,6 +14,7 @@
14,
True,
),
("??_C@_00A@?$AA@", 0, False),
]

View file

@ -112,3 +112,33 @@ def test_duplicate_offsets(linter):
# Full reset will forget seen offsets.
linter.reset(True)
assert linter.check_lines(lines, "test.h", "TEST") is True
def test_duplicate_strings(linter):
"""Duplicate string markers are okay if the string value is the same."""
string_lines = [
"// STRING: TEST 0x1000",
'return "hello world";',
]
# No problem to use this marker twice.
assert linter.check_lines(string_lines, "test.h", "TEST") is True
assert linter.check_lines(string_lines, "test.h", "TEST") is True
different_string = [
"// STRING: TEST 0x1000",
'return "hi there";',
]
# Same address but the string is different
assert linter.check_lines(different_string, "greeting.h", "TEST") is False
assert len(linter.alerts) == 1
assert linter.alerts[0].code == ParserError.WRONG_STRING
same_addr_reused = [
"// GLOBAL:TEXT 0x1000",
"int g_test = 123;",
]
# This will fail like any other offset reuse.
assert linter.check_lines(same_addr_reused, "other.h", "TEST") is False

View file

@ -442,3 +442,82 @@ def test_static_variable(parser):
)
assert len(parser.variables) == 2
assert parser.variables[1].is_static is True
def test_reject_global_return(parser):
"""Previously we had annotated strings with the GLOBAL marker.
For example: if a function returned a string. We now want these to be
annotated with the STRING marker."""
parser.read_lines(
[
"// FUNCTION: TEST 0x5555",
"void test_function() {",
" // GLOBAL: TEST 0x8888",
' return "test";',
"}",
]
)
assert len(parser.variables) == 0
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.GLOBAL_NOT_VARIABLE
def test_global_string(parser):
"""We now allow GLOBAL and STRING markers for the same item."""
parser.read_lines(
[
"// GLOBAL: TEST 0x1234",
"// STRING: TEXT 0x5555",
'char* g_test = "hello";',
]
)
assert len(parser.variables) == 1
assert len(parser.strings) == 1
assert len(parser.alerts) == 0
assert parser.variables[0].name == "g_test"
assert parser.strings[0].name == "hello"
def test_comment_variables(parser):
"""Match on hidden variables from libraries."""
parser.read_lines(
[
"// GLOBAL: TEST 0x1234",
"// g_test",
]
)
assert len(parser.variables) == 1
assert parser.variables[0].name == "g_test"
def test_flexible_variable_prefix(parser):
"""Don't alert to library variables that lack the g_ prefix.
This is out of our control."""
parser.read_lines(
[
"// GLOBAL: TEST 0x1234",
"// some_other_variable",
]
)
assert len(parser.variables) == 1
assert len(parser.alerts) == 0
assert parser.variables[0].name == "some_other_variable"
def test_string_ignore_g_prefix(parser):
"""String annotations above a regular variable should not alert to
the missing g_ prefix. This is only required for GLOBAL markers."""
parser.read_lines(
[
"// STRING: TEST 0x1234",
'const char* value = "";',
]
)
assert len(parser.strings) == 1
assert len(parser.alerts) == 0

View file

@ -15,7 +15,7 @@
(_rs.SEARCH, "TEMPLATE", _rs.IN_TEMPLATE, None),
(_rs.SEARCH, "VTABLE", _rs.IN_VTABLE, None),
(_rs.SEARCH, "LIBRARY", _rs.IN_LIBRARY, None),
(_rs.SEARCH, "STRING", _rs.SEARCH, None),
(_rs.SEARCH, "STRING", _rs.IN_GLOBAL, None),
(_rs.WANT_SIG, "FUNCTION", _rs.WANT_SIG, None),
(_rs.WANT_SIG, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
@ -33,7 +33,7 @@
(_rs.IN_FUNC, "TEMPLATE", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "VTABLE", _rs.IN_VTABLE, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "LIBRARY", _rs.IN_LIBRARY, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "STRING", _rs.IN_FUNC, None),
(_rs.IN_FUNC, "STRING", _rs.IN_FUNC_GLOBAL, None),
(_rs.IN_TEMPLATE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_TEMPLATE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
@ -60,7 +60,7 @@
(_rs.IN_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "STRING", _rs.IN_GLOBAL, None),
(_rs.IN_FUNC_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "GLOBAL", _rs.IN_FUNC_GLOBAL, None),
@ -69,7 +69,7 @@
(_rs.IN_FUNC_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "LIBRARY", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "STRING", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "STRING", _rs.IN_FUNC_GLOBAL, None),
(_rs.IN_VTABLE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),

View file

@ -10,6 +10,7 @@
is_blank_or_comment,
get_class_name,
get_variable_name,
get_string_contents,
)
@ -158,3 +159,18 @@ def test_get_class_name_none(line: str):
@pytest.mark.parametrize("line,name", variable_name_cases)
def test_get_variable_name(line: str, name: str):
assert get_variable_name(line) == name
# Each case is (source line, expected unescaped contents of its first string).
string_match_cases = [
    ('return "hello world";', "hello world"),
    ('"hello\\\\"', "hello\\"),
    ('"hello \\"world\\""', 'hello "world"'),
    ('"hello\\nworld"', "hello\nworld"),
    # Only match first string if there are multiple options
    ('Method("hello", "world");', "hello"),
]


@pytest.mark.parametrize("line, expected", string_match_cases)
def test_get_string_contents(line: str, expected: str):
    """get_string_contents returns the unescaped first string on the line."""
    actual = get_string_contents(line)
    assert actual == expected