Decomp parser: indirect globals and string markers (#446)

* Enable string annotations and indirect globals * Adding some STRING annotations * Library functions
2024-11-25 17:18:16 -05:00 · 2024-01-17 06:56:34 -05:00 · 2024-01-17 06:56:34 -05:00 · aaa18bc9e2
commit aaa18bc9e2
parent 6af0c6cb1a
28 changed files with 738 additions and 93 deletions
--- a/ISLE/library_msvc.h
+++ b/ISLE/library_msvc.h
@ -0,0 +1,63 @@
+#ifdef 0
+// For ISLE symbols only
+
+// aka `operator new`
+// LIBRARY: ISLE 0x402f80
+// ??2@YAPAXI@Z
+
+// aka `operator delete`
+// LIBRARY: ISLE 0x402fa0
+// ??3@YAXPAX@Z
+
+// LIBRARY: ISLE 0x406dd0
+// _malloc
+
+// LIBRARY: ISLE 0x406f00
+// _free
+
+// LIBRARY: ISLE 0x407ec0
+// ___CxxFrameHandler
+
+// LIBRARY: ISLE 0x4081e0
+// _srand
+
+// LIBRARY: ISLE 0x4081f0
+// _rand
+
+// LIBRARY: ISLE 0x408220
+// _atol
+
+// LIBRARY: ISLE 0x4082d0
+// _atoi
+
+// LIBRARY: ISLE 0x4084c0
+// ?_query_new_handler@@YAP6AHI@ZXZ
+
+// LIBRARY: ISLE 0x4084d0
+// ?_query_new_mode@@YAHXZ
+
+// LIBRARY: ISLE 0x4085c0
+// _sprintf
+
+// LIBRARY: ISLE 0x408630
+// _abort
+
+// LIBRARY: ISLE 0x409110
+// __mtinit
+
+// LIBRARY: ISLE 0x409190
+// __getptd
+
+// GLOBAL: ISLE 0x4108e8
+// __osver
+
+// GLOBAL: ISLE 0x4108f0
+// __winmajor
+
+// GLOBAL: ISLE 0x4108f4
+// __winminor
+
+// GLOBAL: ISLE 0x410d50
+// __newmode
+
+#endif
--- a/ISLE/library_smartheap.h
+++ b/ISLE/library_smartheap.h
@ -0,0 +1,312 @@
+#ifdef 0
+
+// LIBRARY: ISLE 0x402f10
+// ?shi_New@@YAPAXKIPAU_SHI_Pool@@@Z
+
+// LIBRARY: ISLE 0x402fb0
+// _MemInitDefaultPool@0
+
+// LIBRARY: ISLE 0x403020
+// _shi_call_new_handler_msc
+
+// LIBRARY: ISLE 0x403050
+// _MemPoolShrink@4
+
+// LIBRARY: ISLE 0x403180
+// _MemPoolPreAllocate@12
+
+// LIBRARY: ISLE 0x403300
+// @_shi_initPageHeaders@4
+
+// LIBRARY: ISLE 0x403570
+// @shi_allocPageHeader@4
+
+// LIBRARY: ISLE 0x4035a0
+// @shi_freePageHeader@8
+
+// LIBRARY: ISLE 0x403750
+// @_shi_deletePage@8
+
+// LIBRARY: ISLE 0x403830
+// @_shi_allocExternal@12
+
+// LIBRARY: ISLE 0x403a50
+// @_shi_initPageVariable@8
+
+// LIBRARY: ISLE 0x403b00
+// _MemAllocPtr@12
+
+// LIBRARY: ISLE 0x403d60
+// @_shi_allocVar@12
+
+// LIBRARY: ISLE 0x403ef0
+// @_shi_allocBlock@12
+
+// LIBRARY: ISLE 0x4040c0
+// _MemFreePtr@4
+
+// LIBRARY: ISLE 0x404170
+// @_shi_freeVar@4
+
+// LIBRARY: ISLE 0x404260
+// _MemReAllocPtr@12
+
+// LIBRARY: ISLE 0x4043b0
+// @_shi_resizeAny@16
+
+// LIBRARY: ISLE 0x404650
+// @_shi_resizeVar@8
+
+// LIBRARY: ISLE 0x404820
+// _MemSizePtr@4
+
+// LIBRARY: ISLE 0x4048d0
+// @shi_findAllocAddress@4
+
+// LIBRARY: ISLE 0x404910
+// @_shi_sysAlloc@8
+
+// LIBRARY: ISLE 0x4049a0
+// @_shi_sysFree@4
+
+// LIBRARY: ISLE 0x404a00
+// @_shi_sysRealloc@12
+
+// LIBRARY: ISLE 0x404ab0
+// @_shi_sysResize@12
+
+// LIBRARY: ISLE 0x404b90
+// @_shi_sysSize@4
+
+// LIBRARY: ISLE 0x404bd0
+// @_shi_sysAllocNear@4
+
+// LIBRARY: ISLE 0x404bf0
+// @_shi_sysFreeNear@4
+
+// LIBRARY: ISLE 0x404c10
+// @_shi_sysValidatePtr@12
+
+// LIBRARY: ISLE 0x404d10
+// @_shi_sysValidateFunction@4
+
+// LIBRARY: ISLE 0x405300
+// @_shi_sysAllocPool@12
+
+// LIBRARY: ISLE 0x405520
+// @_shi_sysResizePool@16
+
+// LIBRARY: ISLE 0x405690
+// @_shi_sysFreePage@4
+
+// LIBRARY: ISLE 0x4057b0
+// @_shi_sysSizePage@4
+
+// LIBRARY: ISLE 0x4057e0
+// @_shi_sysSizePool@8
+
+// LIBRARY: ISLE 0x405800
+// @_shi_registerShared@16
+
+// LIBRARY: ISLE 0x405a00
+// @_shi_unregisterShared@8
+
+// LIBRARY: ISLE 0x405b20
+// @_shi_getNextPool@4
+
+// LIBRARY: ISLE 0x405b30
+// @shi_delNextPool@4
+
+// LIBRARY: ISLE 0x405d30
+// @shi_createAndEnterMutexShr@12
+
+// LIBRARY: ISLE 0x405e20
+// @shi_termPoolMutexShr@4
+
+// LIBRARY: ISLE 0x405e40
+// @shi_enterPoolMutexShr@4
+
+// LIBRARY: ISLE 0x405e60
+// @shi_leavePoolMutexShr@4
+
+// LIBRARY: ISLE 0x405e80
+// __shi_enterCriticalSection@0
+
+// LIBRARY: ISLE 0x405ea0
+// __shi_leaveCriticalSection@0
+
+// LIBRARY: ISLE 0x405ec0
+// __shi_createAndEnterMutex
+
+// LIBRARY: ISLE 0x405ef0
+// _shi_enterPoolMutexSafely
+
+// LIBRARY: ISLE 0x405fd0
+// _shi_enterPoolInitMutexReader
+
+// LIBRARY: ISLE 0x406060
+// _shi_leavePoolInitMutexReader
+
+// LIBRARY: ISLE 0x406090
+// _shi_enterPoolInitMutexWriter
+
+// LIBRARY: ISLE 0x406160
+// _shi_leavePoolInitMutexWriter
+
+// LIBRARY: ISLE 0x406180
+// _shi_isNT
+
+// LIBRARY: ISLE 0x4061b0
+// _MemPoolInit@4
+
+// LIBRARY: ISLE 0x406520
+// _MemPoolSetPageSize@8
+
+// LIBRARY: ISLE 0x406630
+// _MemPoolSetBlockSizeFS@8
+
+// LIBRARY: ISLE 0x406710
+// @_shi_poolFree@8
+
+// LIBRARY: ISLE 0x4068c0
+// @_shi_invokeErrorHandler1@8
+
+// LIBRARY: ISLE 0x406be0
+// _MemErrorUnwind@0
+
+// LIBRARY: ISLE 0x406c30
+// _MemDefaultErrorHandler@4
+
+// LIBRARY: ISLE 0x406cb0
+// @_shi_taskRemovePool@4
+
+// LIBRARY: ISLE 0x406d50
+// @_shi_getCurrentThreadContext@8
+
+// LIBRARY: ISLE 0x406db0
+// @_shi_deleteThreadContext@8
+
+// LIBRARY: ISLE 0x406e40
+// _calloc
+
+// LIBRARY: ISLE 0x406ea0
+// _realloc
+
+// LIBRARY: ISLE 0x406f10
+// __expand
+
+// LIBRARY: ISLE 0x406f50
+// __heapadd
+
+// LIBRARY: ISLE 0x406f60
+// __heapwalk
+
+// LIBRARY: ISLE 0x406ff0
+// __heapused
+
+// LIBRARY: ISLE 0x407020
+// __heapmin
+
+// LIBRARY: ISLE 0x407040
+// __msize
+
+// LIBRARY: ISLE 0x407050
+// __heapchk
+
+// LIBRARY: ISLE 0x407080
+// __heapset
+
+// LIBRARY: ISLE 0x407090
+// @_shi_sysReportError@16
+
+// LIBRARY: ISLE 0x407110
+// _MemPoolSize@4
+
+// LIBRARY: ISLE 0x4071a0
+// _MemPoolWalk@8
+
+// LIBRARY: ISLE 0x407240
+// @_shi_walkPool@16
+
+// LIBRARY: ISLE 0x407540
+// @shi_isBlockInUseSmall@8
+
+// LIBRARY: ISLE 0x407800
+// @_shi_isBlockInUseFS@12
+
+// LIBRARY: ISLE 0x407880
+// _MemPoolCheck@4
+
+// LIBRARY: ISLE 0x407b20
+// _MemCheckPtr@8
+
+// LIBRARY: ISLE 0x4084e0
+// __except_handler3
+
+// GLOBAL: ISLE 0x40f0a0
+// _szLibName
+
+// GLOBAL: ISLE 0x4102f4
+// ?_new_handler@@3P6AXXZA
+
+// GLOBAL: ISLE 0x4102fc
+// _MemDefaultPool
+
+// GLOBAL: ISLE 0x41031c
+// __shi_compactPoolFn
+
+// GLOBAL: ISLE 0x410320
+// __shi_compactPageFn
+
+// GLOBAL: ISLE 0x410324
+// _MemDefaultPoolFlags
+
+// GLOBAL: ISLE 0x41032c
+// __shi_mutexGlobalInit
+
+// GLOBAL: ISLE 0x410330
+// __shi_mutexMovInit
+
+// GLOBAL: ISLE 0x410334
+// __shi_mutexMovLockCount
+
+// GLOBAL: ISLE 0x410338
+// _shi_initPoolReaders
+
+// GLOBAL: ISLE 0x41033c
+// _shi_eventInitPool
+
+// GLOBAL: ISLE 0x410340
+// _shi_mutexMovShr
+
+// GLOBAL: ISLE 0x410368
+// _shi_deferFreePools
+
+// GLOBAL: ISLE 0x410378
+// __shi_poolTerminating
+
+// GLOBAL: ISLE 0x41037c
+// _MemDefaultPoolBlockSizeFS
+
+// GLOBAL: ISLE 0x410380
+// _MemDefaultPoolPageSize
+
+// GLOBAL: ISLE 0x410384
+// _SmartHeap_malloc
+
+// GLOBAL: ISLE 0x4105b0
+// __shi_TaskRecord
+
+// GLOBAL: ISLE 0x4125f8
+// ?_pnhHeap@@3P6AHI@ZA
+
+// GLOBAL: ISLE 0x412830
+// __shi_mutexMov
+
+// GLOBAL: ISLE 0x412850
+// _shi_mutexPoolSynch
+
+// GLOBAL: ISLE 0x412870
+// __shi_mutexGlobal
+
+#endif
--- a/LEGO1/define.cpp
+++ b/LEGO1/define.cpp
@ -11,19 +11,25 @@ MxS32 g_mxcoreCount[101] = {0,     -6643, -5643, -5058, -4643, -4321, -4058, -38
 							-136,  -120,  -104,  -89,   -74,   -58,   -43,   -29,   -14,   0};

 // GLOBAL: LEGO1 0x10102048
+// STRING: LEGO1 0x10102040
 const char* g_strACTION = "ACTION";

 // GLOBAL: LEGO1 0x1010209c
+// STRING: LEGO1 0x10101f58
 const char* g_strOBJECT = "OBJECT";

 // GLOBAL: LEGO1 0x101020b0
+// STRING: LEGO1 0x10101f20
 const char* g_strSOUND = "SOUND";

 // GLOBAL: LEGO1 0x101020cc
+// STRING: LEGO1 0x100f3808
 const char* g_strVISIBILITY = "VISIBILITY";

 // GLOBAL: LEGO1 0x101020d0
+// STRING: LEGO1 0x10101edc
 const char* g_strWORLD = "WORLD";

 // GLOBAL: LEGO1 0x101020e4
+// STRING: LEGO1 0x10101eac
 const char* g_parseExtraTokens = ":;";
--- a/LEGO1/lego/legoomni/src/common/legobackgroundcolor.cpp
+++ b/LEGO1/lego/legoomni/src/common/legobackgroundcolor.cpp
@ -8,12 +8,15 @@
 DECOMP_SIZE_ASSERT(LegoBackgroundColor, 0x30)

 // GLOBAL: LEGO1 0x100f3fb0
+// STRING: LEGO1 0x100f3a18
 const char* g_delimiter = " \t";

 // GLOBAL: LEGO1 0x100f3fb4
+// STRING: LEGO1 0x100f3bf0
 const char* g_set = "set";

 // GLOBAL: LEGO1 0x100f3fb8
+// STRING: LEGO1 0x100f0cdc
 const char* g_reset = "reset";

 // FUNCTION: LEGO1 0x1003bfb0
--- a/LEGO1/lego/legoomni/src/common/legofullscreenmovie.cpp
+++ b/LEGO1/lego/legoomni/src/common/legofullscreenmovie.cpp
@ -8,9 +8,11 @@
 DECOMP_SIZE_ASSERT(LegoFullScreenMovie, 0x24)

 // GLOBAL: LEGO1 0x100f3fbc
+// STRING: LEGO1 0x100f3be8
 const char* g_strEnable = "enable";

 // GLOBAL: LEGO1 0x100f3fc0
+// STRING: LEGO1 0x100f3bf4
 const char* g_strDisable = "disable";

 // FUNCTION: LEGO1 0x1003c500
--- a/LEGO1/lego/legoomni/src/common/legogamestate.cpp
+++ b/LEGO1/lego/legoomni/src/common/legogamestate.cpp
@ -20,12 +20,15 @@
 DECOMP_SIZE_ASSERT(LegoGameState, 0x430)

 // GLOBAL: LEGO1 0x100f3e40
+// STRING: LEGO1 0x100f3e3c
 const char* g_fileExtensionGS = ".GS";

 // GLOBAL: LEGO1 0x100f3e44
+// STRING: LEGO1 0x100f3e30
 const char* g_playersGSI = "Players.gsi";

 // GLOBAL: LEGO1 0x100f3e48
+// STRING: LEGO1 0x100f3e24
 const char* g_historyGSI = "History.gsi";

 // GLOBAL: LEGO1 0x100f3e58
--- a/LEGO1/lego/legoomni/src/common/legostream.cpp
+++ b/LEGO1/lego/legoomni/src/common/legostream.cpp
@ -10,6 +10,7 @@
 // the text "END_OF_VARIABLES" in it.
 // TODO: make g_endOfVariables reference the actual end of the variable array.
 // GLOBAL: LEGO1 0x100f3e50
+// STRING: LEGO1 0x100f3e00
 const char* g_endOfVariables = "END_OF_VARIABLES";

 // Very likely but not certain sizes.
--- a/LEGO1/lego/legoomni/src/main/legoomni.cpp
+++ b/LEGO1/lego/legoomni/src/main/legoomni.cpp
@ -110,6 +110,7 @@ MxAtomId* g_creditsScript = NULL;
 MxAtomId* g_nocdSourceName = NULL;

 // GLOBAL: LEGO1 0x100f6718
+// STRING: LEGO1 0x100f6710
 const char* g_current = "current";

 // GLOBAL: LEGO1 0x100f4c58
--- a/LEGO1/lego/legoomni/src/video/legometerpresenter.cpp
+++ b/LEGO1/lego/legoomni/src/video/legometerpresenter.cpp
@ -7,24 +7,31 @@
 DECOMP_SIZE_ASSERT(LegoMeterPresenter, 0x94)

 // GLOBAL: LEGO1 0x1010207c
+// STRING: LEGO1 0x10101fb4
 const char* g_filterIndex = "FILTER_INDEX";

 // GLOBAL: LEGO1 0x10102094
+// STRING: LEGO1 0x10101f70
 const char* g_type = "TYPE";

 // GLOBAL: LEGO1 0x10102088
+// STRING: LEGO1 0x10101f94
 const char* g_leftToRight = "LEFT_TO_RIGHT";

 // GLOBAL: LEGO1 0x101020ac
+// STRING: LEGO1 0x10101f28
 const char* g_rightToLeft = "RIGHT_TO_LEFT";

 // GLOBAL: LEGO1 0x1010205c
+// STRING: LEGO1 0x10102000
 const char* g_bottomToTop = "BOTTOM_TO_TOP";

 // GLOBAL: LEGO1 0x101020c0
+// STRING: LEGO1 0x10101f00
 const char* g_topToBottom = "TOP_TO_BOTTOM";

 // GLOBAL: LEGO1 0x101020c8
+// STRING: LEGO1 0x10101ee4
 const char* g_variable = "VARIABLE";

 // FUNCTION: LEGO1 0x10043430
--- a/LEGO1/library_msvc.h
+++ b/LEGO1/library_msvc.h
@ -1,39 +1,32 @@
 #ifdef 0
+// For LEGO1 symbols only

 // aka `operator new`
-// LIBRARY: ISLE 0x402f80
 // LIBRARY: LEGO1 0x10086240
 // ??2@YAPAXI@Z

 // aka `operator delete`
-// LIBRARY: ISLE 0x402fa0
 // LIBRARY: LEGO1 0x10086260
 // ??3@YAXPAX@Z

-// LIBRARY: ISLE 0x406dd0
 // LIBRARY: LEGO1 0x1008a090
 // _malloc

-// LIBRARY: ISLE 0x406f00
 // LIBRARY: LEGO1 0x1008a1c0
 // _free

-// LIBRARY: ISLE 0x407ec0
 // LIBRARY: LEGO1 0x1008b020
 // ___CxxFrameHandler

-// LIBRARY: ISLE 0x408220
 // LIBRARY: LEGO1 0x1008b400
 // _atol

-// LIBRARY: ISLE 0x4082d0
 // LIBRARY: LEGO1 0x1008b4b0
 // _atoi

 // LIBRARY: LEGO1 0x1008b4c0
 // _strtok

-// LIBRARY: ISLE 0x4085c0
 // LIBRARY: LEGO1 0x1008b5a0
 // _sprintf

@ -43,6 +36,9 @@
 // LIBRARY: LEGO1 0x1008b630
 // _srand

+// LIBRARY: LEGO1 0x1008b640
+// _rand
+
 // LIBRARY: LEGO1 0x1008b680
 // _strncmp

@ -91,19 +87,6 @@
 // LIBRARY: LEGO1 0x10097b10
 // _strchr

-// LIBRARY: ISLE 0x4081e0
-// _srand
-
-// LIBRARY: ISLE 0x4081f0
-// LIBRARY: LEGO1 0x1008b640
-// _rand
-
-// LIBRARY: ISLE 0x409110
-// __mtinit
-
-// LIBRARY: ISLE 0x409190
-// __getptd
-
 // LIBRARY: LEGO1 0x100d1ed0
 // _strnicmp

--- a/LEGO1/omni/src/common/mxvariabletable.cpp
+++ b/LEGO1/omni/src/common/mxvariabletable.cpp
@ -50,6 +50,8 @@ void MxVariableTable::SetVariable(MxVariable* p_var)
 // FUNCTION: LEGO1 0x100b78f0
 const char* MxVariableTable::GetVariable(const char* p_key)
 {
+	// STRING: ISLE 0x41008c
+	// STRING: LEGO1 0x100f01d4
 	const char* value = "";
 	MxHashTableCursor<MxVariable*> cursor(this);
 	MxVariable* var = new MxVariable(p_key);
--- a/LEGO1/omni/src/video/mxstillpresenter.cpp
+++ b/LEGO1/omni/src/video/mxstillpresenter.cpp
@ -11,6 +11,7 @@
 DECOMP_SIZE_ASSERT(MxStillPresenter, 0x6c);

 // GLOBAL: LEGO1 0x101020e0
+// STRING: LEGO1 0x10101eb0
 const char* g_strBmpIsmap = "BMP_ISMAP";

 // FUNCTION: LEGO1 0x100b9c70
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@ -85,13 +85,19 @@ def _load_cvdump(self):

            if sym.node_type == SymbolType.STRING:
                string_info = demangle_string_const(sym.decorated_name)
+                if string_info is None:
+                    logger.debug(
+                        "Could not demangle string symbol: %s", sym.decorated_name
+                    )
+                    continue
+
                # TODO: skip unicode for now. will need to handle these differently.
                if string_info.is_utf16:
                    continue

                raw = self.recomp_bin.read(addr, sym.size())
                try:
-                    sym.friendly_name = raw.decode("latin1")
+                    sym.friendly_name = raw.decode("latin1").rstrip("\x00")
                except UnicodeDecodeError:
                    pass

@ -134,6 +140,26 @@ def _load_markers(self):
        for tbl in codebase.iter_vtables():
            self._db.match_vtable(tbl.offset, tbl.name)

+        for string in codebase.iter_strings():
+            # Not that we don't trust you, but we're checking the string
+            # annotation to make sure it is accurate.
+            try:
+                # TODO: would presumably fail for wchar_t strings
+                orig = self.orig_bin.read_string(string.offset).decode("latin1")
+                string_correct = string.name == orig
+            except UnicodeDecodeError:
+                string_correct = False
+
+            if not string_correct:
+                logger.error(
+                    "Data at 0x%x does not match string %s",
+                    string.offset,
+                    repr(string.name),
+                )
+                continue
+
+            self._db.match_string(string.offset, string.name)
+
    def _find_original_strings(self):
        """Go to the original binary and look for the specified string constants
        to find a match. This is a (relatively) expensive operation so we only
--- a/tools/isledecomp/isledecomp/compare/db.py
+++ b/tools/isledecomp/isledecomp/compare/db.py
@ -43,7 +43,8 @@ def match_name(self) -> str:
            return None

        ctype = self.compare_type.name if self.compare_type is not None else "UNK"
-        return f"{self.name} ({ctype})"
+        name = repr(self.name) if ctype == "STRING" else self.name
+        return f"{name} ({ctype})"


 def matchinfo_factory(_, row):
@ -197,3 +198,5 @@ def match_string(self, addr: int, value: str) -> bool:
        if not did_match:
            escaped = repr(value)
            logger.error("Failed to find string: %s", escaped)
+
+        return did_match
--- a/tools/isledecomp/isledecomp/cvdump/analysis.py
+++ b/tools/isledecomp/isledecomp/cvdump/analysis.py
@ -94,7 +94,11 @@ def set_decorated(self, name: str):
    def name(self) -> Optional[str]:
        """Prefer "friendly" name if we have it.
        This is what we have been using to match functions."""
-        return self.friendly_name or self.decorated_name
+        return (
+            self.friendly_name
+            if self.friendly_name is not None
+            else self.decorated_name
+        )

    def size(self) -> Optional[int]:
        if self.confirmed_size is not None:
--- a/tools/isledecomp/isledecomp/cvdump/demangler.py
+++ b/tools/isledecomp/isledecomp/cvdump/demangler.py
@ -4,6 +4,7 @@
 """
 import re
 from collections import namedtuple
+from typing import Optional


 class InvalidEncodedNumberError(Exception):
@ -30,13 +31,12 @@ def parse_encoded_number(string: str) -> int:
 StringConstInfo = namedtuple("StringConstInfo", "len is_utf16")


-def demangle_string_const(symbol: str) -> StringConstInfo:
+def demangle_string_const(symbol: str) -> Optional[StringConstInfo]:
    """Don't bother to decode the string text from the symbol.
    We can just read it from the binary once we have the length."""
    match = string_const_regex.match(symbol)
    if match is None:
-        # See below
-        return StringConstInfo(0, False)
+        return None

    try:
        strlen = (
@ -45,10 +45,7 @@ def demangle_string_const(symbol: str) -> StringConstInfo:
            else int(match.group("len"))
        )
    except (ValueError, InvalidEncodedNumberError):
-        # This would be an annoying error to fail on if we get a bad symbol.
-        # For now, just assume a zero length string because this will probably
-        # raise some eyebrows during the comparison.
-        strlen = 0
+        return None

    is_utf16 = match.group("is_utf16") == "1"
    return StringConstInfo(len=strlen, is_utf16=is_utf16)
--- a/tools/isledecomp/isledecomp/parser/codebase.py
+++ b/tools/isledecomp/isledecomp/parser/codebase.py
@ -6,6 +6,7 @@
    ParserFunction,
    ParserVtable,
    ParserVariable,
+    ParserString,
 )


@ -42,3 +43,6 @@ def iter_vtables(self) -> Iterator[ParserVtable]:

    def iter_variables(self) -> Iterator[ParserVariable]:
        return filter(lambda s: isinstance(s, ParserVariable), self._symbols)
+
+    def iter_strings(self) -> Iterator[ParserString]:
+        return filter(lambda s: isinstance(s, ParserString), self._symbols)
--- a/tools/isledecomp/isledecomp/parser/error.py
+++ b/tools/isledecomp/isledecomp/parser/error.py
@ -70,6 +70,10 @@ class ParserError(Enum):
    # a comment -- i.e. VTABLE or GLOBAL -- could not extract the name
    NO_SUITABLE_NAME = 204

+    # ERROR: Two STRING markers have the same module and offset, but the strings
+    # they annotate are different.
+    WRONG_STRING = 205
+

@dataclass
 class ParserAlert:
--- a/tools/isledecomp/isledecomp/parser/linter.py
+++ b/tools/isledecomp/isledecomp/parser/linter.py
@ -1,7 +1,7 @@
 from typing import List, Optional
 from .parser import DecompParser
 from .error import ParserAlert, ParserError
-from .node import ParserSymbol
+from .node import ParserSymbol, ParserString


 def get_checkorder_filter(module):
@ -19,6 +19,9 @@ def __init__(self) -> None:
        # This is _not_ reset between files and is intended to report offset reuse
        # when scanning the entire directory.
        self._offsets_used = set()
+        # Keep track of strings we have seen. Persists across files.
+        # Module/offset can be repeated for string markers but the strings must match.
+        self._strings = {}

    def reset(self, full_reset: bool = False):
        self.alerts = []
@ -28,6 +31,7 @@ def reset(self, full_reset: bool = False):

        if full_reset:
            self._offsets_used.clear()
+            self._strings = {}

    def file_is_header(self):
        return self._filename.lower().endswith(".h")
@ -36,8 +40,20 @@ def _load_offsets_from_list(self, marker_list: List[ParserSymbol]):
        """Helper for loading (module, offset) tuples while the DecompParser
        has them broken up into three different lists."""
        for marker in marker_list:
+            is_string = isinstance(marker, ParserString)
+
            value = (marker.module, marker.offset)
            if value in self._offsets_used:
+                if is_string:
+                    if self._strings[value] != marker.name:
+                        self.alerts.append(
+                            ParserAlert(
+                                code=ParserError.WRONG_STRING,
+                                line_number=marker.line_number,
+                                line=f"0x{marker.offset:08x}, {repr(self._strings[value])} vs. {repr(marker.name)}",
+                            )
+                        )
+                else:
                    self.alerts.append(
                        ParserAlert(
                            code=ParserError.DUPLICATE_OFFSET,
@ -47,6 +63,8 @@ def _load_offsets_from_list(self, marker_list: List[ParserSymbol]):
                    )
            else:
                self._offsets_used.add(value)
+                if is_string:
+                    self._strings[value] = marker.name

    def _check_function_order(self):
        """Rules:
@ -82,6 +100,7 @@ def _check_offset_uniqueness(self):
        self._load_offsets_from_list(self._parser.functions)
        self._load_offsets_from_list(self._parser.vtables)
        self._load_offsets_from_list(self._parser.variables)
+        self._load_offsets_from_list(self._parser.strings)

    def _check_byname_allowed(self):
        if self.file_is_header():
--- a/tools/isledecomp/isledecomp/parser/marker.py
+++ b/tools/isledecomp/isledecomp/parser/marker.py
@ -3,6 +3,19 @@
 from enum import Enum


+class MarkerCategory(Enum):
+    """For the purposes of grouping multiple different DecompMarkers together,
+    assign a rough "category" for the MarkerType values below.
+    It's really only the function types that have to get folded down, but
+    we'll do that in a structured way to permit future expansion."""
+
+    FUNCTION = 1
+    VARIABLE = 2
+    STRING = 3
+    VTABLE = 4
+    ADDRESS = 100  # i.e. no comparison required or possible
+
+
 class MarkerType(Enum):
    UNKNOWN = -100
    FUNCTION = 1
@ -51,6 +64,23 @@ def module(self) -> str:
    def offset(self) -> int:
        return self._offset

+    @property
+    def category(self) -> MarkerCategory:
+        if self.is_vtable():
+            return MarkerCategory.VTABLE
+
+        if self.is_variable():
+            return MarkerCategory.VARIABLE
+
+        if self.is_string():
+            return MarkerCategory.STRING
+
+        # TODO: worth another look if we add more types, but this covers it
+        if self.is_regular_function() or self.is_explicit_byname():
+            return MarkerCategory.FUNCTION
+
+        return MarkerCategory.ADDRESS
+
    def is_regular_function(self) -> bool:
        """Regular function, meaning: not an explicit byname lookup. FUNCTION
        markers can be _implicit_ byname.
--- a/tools/isledecomp/isledecomp/parser/node.py
+++ b/tools/isledecomp/isledecomp/parser/node.py
@ -55,3 +55,8 @@ class ParserVariable(ParserSymbol):
@dataclass
 class ParserVtable(ParserSymbol):
    pass
+
+
+@dataclass
+class ParserString(ParserSymbol):
+    pass
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@ -3,11 +3,11 @@
 from typing import List, Iterable, Iterator, Optional
 from enum import Enum
 from .util import (
-    is_blank_or_comment,
    get_class_name,
    get_variable_name,
    get_synthetic_name,
    remove_trailing_comment,
+    get_string_contents,
 )
 from .marker import (
    DecompMarker,
@ -19,6 +19,7 @@
    ParserFunction,
    ParserVariable,
    ParserVtable,
+    ParserString,
 )
 from .error import ParserAlert, ParserError

@ -43,17 +44,16 @@ def __init__(self) -> None:

    def insert(self, marker: DecompMarker) -> bool:
        """Return True if this insert would overwrite"""
-        module = marker.module
-        if module in self.markers:
+        key = (marker.category, marker.module)
+        if key in self.markers:
            return True

-        # TODO: type converted back to string version here instead of using enum
-        self.markers[module] = (marker.type.name, marker.offset)
+        self.markers[key] = marker
        return False

    def iter(self) -> Iterator[DecompMarker]:
-        for module, (marker_type, offset) in self.markers.items():
-            yield DecompMarker(marker_type, module, offset)
+        for _, marker in self.markers.items():
+            yield marker

    def empty(self):
        self.markers = {}
@ -111,17 +111,21 @@ def reset(self):
        self.function_sig = ""

    @property
-    def functions(self) -> List[ParserSymbol]:
+    def functions(self) -> List[ParserFunction]:
        return [s for s in self._symbols if isinstance(s, ParserFunction)]

    @property
-    def vtables(self) -> List[ParserSymbol]:
+    def vtables(self) -> List[ParserVtable]:
        return [s for s in self._symbols if isinstance(s, ParserVtable)]

    @property
-    def variables(self) -> List[ParserSymbol]:
+    def variables(self) -> List[ParserVariable]:
        return [s for s in self._symbols if isinstance(s, ParserVariable)]

+    @property
+    def strings(self) -> List[ParserString]:
+        return [s for s in self._symbols if isinstance(s, ParserString)]
+
    def iter_symbols(self, module: Optional[str] = None) -> Iterator[ParserSymbol]:
        for s in self._symbols:
            if module is None or s.module == module:
@ -225,18 +229,32 @@ def _variable_marker(self, marker: DecompMarker):
        else:
            self.state = ReaderState.IN_GLOBAL

-    def _variable_done(self, name: str):
-        if not name.startswith("g_"):
-            self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)
+    def _variable_done(
+        self, variable_name: Optional[str] = None, string_value: Optional[str] = None
+    ):
+        if variable_name is None and string_value is None:
+            self._syntax_error(ParserError.NO_SUITABLE_NAME)
+            return

        for marker in self.var_markers.iter():
+            if marker.is_string():
+                self._symbols.append(
+                    ParserString(
+                        type=marker.type,
+                        line_number=self.line_number,
+                        module=marker.module,
+                        offset=marker.offset,
+                        name=string_value,
+                    )
+                )
+            else:
                self._symbols.append(
                    ParserVariable(
                        type=marker.type,
                        line_number=self.line_number,
                        module=marker.module,
                        offset=marker.offset,
-                    name=name,
+                        name=variable_name,
                        is_static=self.state == ReaderState.IN_FUNC_GLOBAL,
                    )
                )
@ -298,20 +316,8 @@ def _handle_marker(self, marker: DecompMarker):
            else:
                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)

-        elif marker.is_string():
-            # TODO: We are ignoring string markers for the moment.
-            # We already have a lot of them in the codebase, though, so we'll
-            # hang onto them for now in case we can use them later.
-            # To match up string constants, the strategy will be:
-            # 1. Use cvdump to find all string constants in the recomp
-            # 2. In the original binary, look at relocated vaddrs from .rdata
-            # 3. Try to match up string data from #1 with locations in #2
-
-            # Throw the syntax error we would throw if we were parsing these
-            if self.state not in (ReaderState.SEARCH, ReaderState.IN_FUNC):
-                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
-
-        elif marker.is_variable():
+        # Strings and variables are almost the same thing
+        elif marker.is_string() or marker.is_variable():
            if self.state in (
                ReaderState.SEARCH,
                ReaderState.IN_GLOBAL,
@ -418,24 +424,39 @@ def read_line(self, line: str):
            # function we have already parsed if state == IN_FUNC_GLOBAL.
            # However, we are not tolerant of _any_ syntax problems in our
            # CI actions, so the solution is to just fix the invalid marker.
-            if is_blank_or_comment(line):
-                self._syntax_error(ParserError.NO_SUITABLE_NAME)
+            variable_name = None
+
+            global_markers_queued = any(
+                m.is_variable() for m in self.var_markers.iter()
+            )
+
+            if len(line_strip) == 0:
+                self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)
                return

-            # We don't have a foolproof mechanism to tell what is and is not a variable.
-            # If the GLOBAL is being declared on a `return` statement, though, this is
-            # not correct. It is either a string literal (which will be handled differently)
-            # or it is not the variable declaration, which is incorrect decomp syntax.
-            if line.strip().startswith("return"):
+            if global_markers_queued:
+                # Not the greatest solution, but a consequence of combining GLOBAL and
+                # STRING markers together. If the marker precedes a return statement, it is
+                # valid for a STRING marker to be here, but not a GLOBAL. We need to look
+                # ahead and tell whether this *would* fail.
+                if line_strip.startswith("return"):
                    self._syntax_error(ParserError.GLOBAL_NOT_VARIABLE)
                    return
+                if line_strip.startswith("//"):
+                    # If we found a comment, assume an implicit lookup-by-name
+                    # variable and use the comment text as its name. This is not
+                    # a decomp marker because it would have been handled already.
+                    variable_name = get_synthetic_name(line)
+                else:
+                    variable_name = get_variable_name(line)
+                    # This is out of our control for library variables, but all of our
+                    # variables should start with "g_".
+                    if variable_name is not None and not variable_name.startswith("g_"):
+                        self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)

-            name = get_variable_name(line)
-            if name is None:
-                self._syntax_error(ParserError.NO_SUITABLE_NAME)
-                return
+            string_name = get_string_contents(line)

-            self._variable_done(name)
+            self._variable_done(variable_name, string_name)

        elif self.state == ReaderState.IN_VTABLE:
            vtable_class = get_class_name(line)
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@ -1,6 +1,7 @@
 # C++ Parser utility functions and data structures
 import re
 from typing import Optional
+from ast import literal_eval

 # The goal here is to just read whatever is on the next line, so some
 # flexibility in the formatting seems OK
@ -12,6 +13,10 @@
 trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")


+# Get string contents, ignore escape characters that might interfere
+doubleQuoteRegex = re.compile(r"(\"(?:[^\"\\]|\\.)*\")")
+
+
 def get_synthetic_name(line: str) -> Optional[str]:
    """Synthetic names appear on a single line comment on the line after the marker.
    If that's not what we have, return None"""
@ -86,3 +91,20 @@ def get_variable_name(line: str) -> Optional[str]:
        return match.group("name")

    return None
+
+
+def get_string_contents(line: str) -> Optional[str]:
+    """Return the first C string seen on this line.
+    We have to unescape the string, and a simple way to do that is to use
+    python's ast.literal_eval. I'm sure there are many pitfalls to doing
+    it this way, but hopefully the regex will ensure reasonably sane input."""
+
+    try:
+        if (match := doubleQuoteRegex.search(line)) is not None:
+            return literal_eval(match.group(1))
+    # pylint: disable=broad-exception-caught
+    # No way to predict what kind of exception could occur.
+    except Exception:
+        pass
+
+    return None
--- a/tools/isledecomp/tests/test_demangler.py
+++ b/tools/isledecomp/tests/test_demangler.py
@ -14,6 +14,7 @@
        14,
        True,
    ),
+    ("??_C@_00A@?$AA@", 0, False),
 ]


--- a/tools/isledecomp/tests/test_linter.py
+++ b/tools/isledecomp/tests/test_linter.py
@ -112,3 +112,33 @@ def test_duplicate_offsets(linter):
    # Full reset will forget seen offsets.
    linter.reset(True)
    assert linter.check_lines(lines, "test.h", "TEST") is True
+
+
+def test_duplicate_strings(linter):
+    """A STRING address may be repeated, but only for the identical string."""
+    string_lines = [
+        "// STRING: TEST 0x1000",
+        'return "hello world";',
+    ]
+
+    # Linting the same marker and string twice raises no alert.
+    assert linter.check_lines(string_lines, "test.h", "TEST") is True
+    assert linter.check_lines(string_lines, "test.h", "TEST") is True
+
+    different_string = [
+        "// STRING: TEST 0x1000",
+        'return "hi there";',
+    ]
+
+    # Same address, different text: exactly one WRONG_STRING alert.
+    assert linter.check_lines(different_string, "greeting.h", "TEST") is False
+    assert len(linter.alerts) == 1
+    assert linter.alerts[0].code == ParserError.WRONG_STRING
+
+    same_addr_reused = [
+        "// GLOBAL: TEST 0x1000",
+        "int g_test = 123;",
+    ]
+
+    # A non-string marker at that address in the same module is offset reuse.
+    assert linter.check_lines(same_addr_reused, "other.h", "TEST") is False
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@ -442,3 +442,82 @@ def test_static_variable(parser):
    )
    assert len(parser.variables) == 2
    assert parser.variables[1].is_static is True
+
+
+def test_reject_global_return(parser):
+    """A GLOBAL marker above a returned string literal must be rejected.
+    Strings like this used to be annotated with GLOBAL; STRING is the marker
+    they should carry now."""
+
+    source = [
+        "// FUNCTION: TEST 0x5555",
+        "void test_function() {",
+        "  // GLOBAL: TEST 0x8888",
+        '  return "test";',
+        "}",
+    ]
+    parser.read_lines(source)
+
+    assert len(parser.variables) == 0
+    alert_codes = [alert.code for alert in parser.alerts]
+    assert alert_codes == [ParserError.GLOBAL_NOT_VARIABLE]
+
+
+def test_global_string(parser):
+    """Stacked GLOBAL and STRING markers may annotate a single declaration.
+    The variable is named by the declaration, the string by its literal."""
+
+    declaration = [
+        "// GLOBAL: TEST 0x1234",
+        "// STRING: TEXT 0x5555",
+        'char* g_test = "hello";',
+    ]
+    parser.read_lines(declaration)
+
+    variable_names = [variable.name for variable in parser.variables]
+    string_names = [string.name for string in parser.strings]
+    assert variable_names == ["g_test"]
+    assert string_names == ["hello"]
+    assert len(parser.alerts) == 0
+
+
+def test_comment_variables(parser):
+    """A GLOBAL marker can take its variable name from a comment line, which
+    is how variables hidden inside libraries are matched."""
+
+    marker_and_name = ["// GLOBAL: TEST 0x1234", "// g_test"]
+    parser.read_lines(marker_and_name)
+
+    # Exactly one variable is recorded, named by the comment text.
+    names = [variable.name for variable in parser.variables]
+    assert len(names) == 1
+    assert names[0] == "g_test"
+
+
+def test_flexible_variable_prefix(parser):
+    """Library variables are not required to carry the g_ prefix.
+    Their names are out of our control, so the parser must not alert."""
+
+    library_global = ["// GLOBAL: TEST 0x1234", "// some_other_variable"]
+    parser.read_lines(library_global)
+
+    # The name is taken from the comment exactly as written, and the missing
+    # prefix does not produce an alert.
+    names = [variable.name for variable in parser.variables]
+    assert len(names) == 1
+    assert len(parser.alerts) == 0
+    assert names[0] == "some_other_variable"
+
+
+def test_string_ignore_g_prefix(parser):
+    """The g_ prefix is only required under GLOBAL markers. A STRING marker
+    above a variable that lacks the prefix must not raise an alert."""
+
+    annotated = ["// STRING: TEST 0x1234", 'const char* value = "";']
+    parser.read_lines(annotated)
+
+    # One string is recorded even though the literal is empty.
+    string_count = len(parser.strings)
+    alert_count = len(parser.alerts)
+    assert string_count == 1
+    assert alert_count == 0
--- a/tools/isledecomp/tests/test_parser_statechange.py
+++ b/tools/isledecomp/tests/test_parser_statechange.py
@ -15,7 +15,7 @@
    (_rs.SEARCH,          "TEMPLATE",   _rs.IN_TEMPLATE,     None),
    (_rs.SEARCH,          "VTABLE",     _rs.IN_VTABLE,       None),
    (_rs.SEARCH,          "LIBRARY",    _rs.IN_LIBRARY,      None),
-    (_rs.SEARCH,          "STRING",     _rs.SEARCH,          None),
+    (_rs.SEARCH,          "STRING",     _rs.IN_GLOBAL,       None),

    (_rs.WANT_SIG,        "FUNCTION",   _rs.WANT_SIG,        None),
    (_rs.WANT_SIG,        "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
@ -33,7 +33,7 @@
    (_rs.IN_FUNC,         "TEMPLATE",   _rs.IN_TEMPLATE,     _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC,         "VTABLE",     _rs.IN_VTABLE,       _pe.MISSED_END_OF_FUNCTION),
    (_rs.IN_FUNC,         "LIBRARY",    _rs.IN_LIBRARY,      _pe.MISSED_END_OF_FUNCTION),
-    (_rs.IN_FUNC,         "STRING",     _rs.IN_FUNC,         None),
+    (_rs.IN_FUNC,         "STRING",     _rs.IN_FUNC_GLOBAL,  None),

    (_rs.IN_TEMPLATE,     "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_TEMPLATE,     "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
@ -60,7 +60,7 @@
    (_rs.IN_GLOBAL,       "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL,       "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_GLOBAL,       "LIBRARY",    _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
-    (_rs.IN_GLOBAL,       "STRING",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "STRING",     _rs.IN_GLOBAL,       None),

    (_rs.IN_FUNC_GLOBAL,  "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL,  "GLOBAL",     _rs.IN_FUNC_GLOBAL,  None),
@ -69,7 +69,7 @@
    (_rs.IN_FUNC_GLOBAL,  "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL,  "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_FUNC_GLOBAL,  "LIBRARY",    _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
-    (_rs.IN_FUNC_GLOBAL,  "STRING",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "STRING",     _rs.IN_FUNC_GLOBAL,  None),

    (_rs.IN_VTABLE,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
    (_rs.IN_VTABLE,       "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@ -10,6 +10,7 @@
    is_blank_or_comment,
    get_class_name,
    get_variable_name,
+    get_string_contents,
 )


@ -158,3 +159,18 @@ def test_get_class_name_none(line: str):
@pytest.mark.parametrize("line,name", variable_name_cases)
 def test_get_variable_name(line: str, name: str):
    assert get_variable_name(line) == name
+
+
+string_match_cases = [
+    ('return "hello world";', "hello world"),  # literal inside a statement
+    ('"hello\\\\"', "hello\\"),  # escaped backslash right before the closing quote
+    ('"hello \\"world\\""', 'hello "world"'),  # escaped quotes stay in the string
+    ('"hello\\nworld"', "hello\nworld"),  # the \n escape decodes to a newline
+    # Only match first string if there are multiple options
+    ('Method("hello", "world");', "hello"),
+]
+
+
+@pytest.mark.parametrize("line, string", string_match_cases)
+def test_get_string_contents(line: str, string: str):
+    assert get_string_contents(line) == string