From 30be1ed4b8e079786386e32ef54e43168fca905f Mon Sep 17 00:00:00 2001
From: MS <disinvite@users.noreply.github.com>
Date: Sun, 1 Sep 2024 16:34:58 -0400
Subject: [PATCH] Brute force string search for BETA10 (#1097)

* Brute force string search for BETA10

* improved string check

* Skip this unless source binary is debug

* remove misplaced comment
---
 tools/isledecomp/isledecomp/bin.py          | 16 +++++++++
 tools/isledecomp/isledecomp/compare/core.py | 39 +++++++++++++++++++--
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/tools/isledecomp/isledecomp/bin.py b/tools/isledecomp/isledecomp/bin.py
index 05ecfa92..2f20224b 100644
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@@ -465,6 +465,22 @@ class Bin:
             for (func_addr, name_addr) in combined
         ]
 
+    def iter_string(self, encoding: str = "ascii") -> Iterator[Tuple[int, str]]:
+        """Yield (addr, string) for each relocated address in .data holding a decodable string."""
+        section = self.get_section_by_name(".data")
+        for addr in self._relocated_addrs:
+            if section.contains_vaddr(addr):  # only consider addresses inside .data
+                raw = self.read_string(addr)
+                if raw is None:
+                    continue
+
+                try:
+                    string = raw.decode(encoding)
+                except UnicodeDecodeError:  # bytes are not valid text in this encoding
+                    continue
+
+                yield (addr, string)
+
     def get_section_by_name(self, name: str) -> Section:
         section = next(
             filter(lambda section: section.match_name(name), self.sections),
diff --git a/tools/isledecomp/isledecomp/compare/core.py b/tools/isledecomp/isledecomp/compare/core.py
index 1ba77f27..c44f3987 100644
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@@ -82,8 +82,9 @@ class Compare:
 
         self._load_cvdump()
         self._load_markers()
-        self._find_original_strings()
+        # Detect floats first to eliminate potential overlap with string data
         self._find_float_const()
+        self._find_original_strings()
         self._match_imports()
         self._match_exports()
         self._match_thunks()
@@ -314,7 +315,7 @@ class Compare:
         """Go to the original binary and look for the specified string constants
         to find a match. This is a (relatively) expensive operation so we only
         look at strings that we have not already matched via a STRING annotation."""
-
+        # Release builds give each de-duped string a symbol so they are easy to find and match.
         for string in self._db.get_unmatched_strings():
             addr = self.orig_bin.find_string(string.encode("latin1"))
             if addr is None:
@@ -324,6 +325,40 @@ class Compare:
 
             self._db.match_string(addr, string)
 
+        def is_real_string(s: str) -> bool:
+            """Heuristic: return False for values that only look like strings.
+            This is mostly about short strings (len <= 4) that could be byte or word values.
+            """
+            # 0x10 is the MSB of the address space for DLLs (LEGO1), so this is likely a pointer
+            if len(s) == 0 or "\x10" in s:
+                return False
+
+            # Single characters are rejected, except "0" because assert(0) is common
+            if len(s) == 1 and s[0] != "0":
+                return False
+
+            # Hack: look for \xNN escapes in repr() because isprintable() rejects newlines and tabs
+            if len(s) <= 4 and "\\x" in repr(s):
+                return False
+
+            return True
+
+        # Debug builds do not de-dupe the strings, so we need to find them via brute force scan.
+        # We could try matching addrs for strings that occur only once in both orig and recomp.
+        # When we sanitize the asm, the result is the same regardless.
+        if self.orig_bin.is_debug:
+            for addr, string in self.orig_bin.iter_string("latin1"):
+                if is_real_string(string):
+                    self._db.set_orig_symbol(
+                        addr, SymbolType.STRING, string, len(string)
+                    )
+
+            for addr, string in self.recomp_bin.iter_string("latin1"):
+                if is_real_string(string):
+                    self._db.set_recomp_symbol(
+                        addr, SymbolType.STRING, string, None, len(string)
+                    )
+
     def _find_float_const(self):
         """Add floating point constants in each binary to the database.
         We are not matching anything right now because these values are not