parser: Identify namespaces (#499)

2025-04-07 04:04:26 -04:00 · 2024-01-28 09:25:45 -05:00 · 2024-01-28 09:25:45 -05:00 · 4137cd75e6
commit 4137cd75e6
parent d9c4151bb8
6 changed files with 280 additions and 7 deletions
--- a/tools/isledecomp/isledecomp/cvdump/demangler.py
+++ b/tools/isledecomp/isledecomp/cvdump/demangler.py
@ -68,4 +68,9 @@ def demangle_vtable(symbol: str) -> str:

        return f"{class_name}<{generic}>"

+    # If we have two classes listed, it is a namespace hierarchy.
+    # @@6B@ is a common generic suffix for these vtable symbols.
+    if t[1] != "" and t[1] != "6B":
+        return t[1] + "::" + t[0]
+
    return t[0]
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@ -8,6 +8,8 @@ from .util import (
    get_synthetic_name,
    remove_trailing_comment,
    get_string_contents,
+    sanitize_code_line,
+    scopeDetectRegex,
 )
 from .marker import (
    DecompMarker,
@ -59,6 +61,57 @@ class MarkerDict:
        self.markers = {}


+class CurlyManager:
+    """Overly simplified scope manager.
+
+    Tracks the chain of enclosing class/struct/namespace names by reading
+    source code one line at a time. Each stack entry is either a scope name
+    or the placeholder "{" for an opening curly brace; a scope name is
+    pushed just before the brace that opens its body.
+    """
+
+    def __init__(self):
+        self._stack = []
+
+    def reset(self):
+        """Forget every scope and brace seen so far."""
+        self._stack = []
+
+    def _pop(self):
+        """Pop stack safely (a no-op when the stack is already empty)."""
+        if self._stack:
+            self._stack.pop()
+
+    def get_prefix(self, name: Optional[str] = None) -> str:
+        """Return the prefix for where we are.
+
+        Without a name this is the "::"-joined chain of open scope names,
+        or the empty string when no scope is open. When a name is given it
+        is appended to the chain, unless it is already part of the chain,
+        so a scope that is already open is not repeated.
+        """
+
+        scopes = [t for t in self._stack if t != "{"]
+        if not scopes:
+            return name if name is not None else ""
+
+        if name is not None and name not in scopes:
+            scopes.append(name)
+
+        return "::".join(scopes)
+
+    def read_line(self, raw_line: str):
+        """Read a line of code and update the stack.
+
+        The line is sanitized first so that braces inside strings, chars
+        and single-line comments are not counted.
+        """
+        line = sanitize_code_line(raw_line)
+        opened = line.count("{")
+        closed = line.count("}")
+
+        match = scopeDetectRegex.match(line)
+        # A trailing semicolon means a forward declaration or a complete
+        # one-line definition: there is no scope to enter.
+        if match is not None and not line.endswith(";"):
+            # If the body is opened and closed on this same line
+            # (e.g. "namespace X {}") nothing remains open afterwards, so
+            # pushing the name would leave it on the stack forever.
+            if not (opened > 0 and opened == closed):
+                self._stack.append(match.group("name"))
+
+        change = opened - closed
+        if change > 0:
+            self._stack.extend("{" for _ in range(change))
+        elif change < 0:
+            for _ in range(-change):
+                # Remove the brace itself ...
+                self._pop()
+                # ... and the scope name that brace belonged to, if any.
+                # Doing this per brace keeps the stack correct when one
+                # line closes several scopes at once (e.g. "}}").
+                if self._stack and self._stack[-1] != "{":
+                    self._pop()
+
+
 class DecompParser:
    # pylint: disable=too-many-instance-attributes
    # Could combine output lists into a single list to get under the limit,
@ -73,6 +126,8 @@ class DecompParser:

        self.last_line: str = ""

+        self.curly = CurlyManager()
+
        # To allow for multiple markers where code is shared across different
        # modules, save lists of compatible markers that appear in sequence
        self.fun_markers = MarkerDict()
@ -110,6 +165,8 @@ class DecompParser:
        self.function_start = 0
        self.function_sig = ""

+        self.curly.reset()
+
    @property
    def functions(self) -> List[ParserFunction]:
        return [s for s in self._symbols if isinstance(s, ParserFunction)]
@ -213,7 +270,7 @@ class DecompParser:
                    line_number=self.line_number,
                    module=marker.module,
                    offset=marker.offset,
-                    name=class_name,
+                    name=self.curly.get_prefix(class_name),
                )
            )

@ -254,7 +311,7 @@ class DecompParser:
                        line_number=self.line_number,
                        module=marker.module,
                        offset=marker.offset,
-                        name=variable_name,
+                        name=self.curly.get_prefix(variable_name),
                        is_static=self.state == ReaderState.IN_FUNC_GLOBAL,
                    )
                )
@ -353,6 +410,8 @@ class DecompParser:
            self._handle_marker(marker)
            return

+        self.curly.read_line(line)
+
        line_strip = line.strip()
        if self.state in (
            ReaderState.IN_SYNTHETIC,
@ -451,8 +510,11 @@ class DecompParser:
                    variable_name = get_variable_name(line)
                    # This is out of our control for library variables, but all of our
                    # variables should start with "g_".
-                    if variable_name is not None and not variable_name.startswith("g_"):
-                        self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)
+                    if variable_name is not None:
+                        # Before checking for the prefix, remove the
+                        # namespace chain if there is one.
+                        if not variable_name.split("::")[-1].startswith("g_"):
+                            self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)

            string_name = get_string_contents(line)

--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@ -7,15 +7,25 @@ from ast import literal_eval
 # flexibility in the formatting seems OK
 templateCommentRegex = re.compile(r"\s*//\s+(.*)")

-
 # To remove any comment (//) or block comment (/*) and its leading spaces
 # from the end of a code line
 trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")

+# Get char contents, ignore escape characters
+singleQuoteRegex = re.compile(r"('(?:[^\'\\]|\\.)')")
+
+# Match contents of block comment on one line
+blockCommentRegex = re.compile(r"(/\*.*?\*/)")
+
+# Match contents of single comment on one line
+regularCommentRegex = re.compile(r"(//.*)")

 # Get string contents, ignore escape characters that might interfere
 doubleQuoteRegex = re.compile(r"(\"(?:[^\"\\]|\\.)*\")")

+# Detect a line that would cause us to enter a new scope.
+# The `name` group captures the identifier that follows the
+# class/struct/namespace keyword.
+scopeDetectRegex = re.compile(r"(?:class|struct|namespace) (?P<name>\w+).*(?:{)?")
+

 def get_synthetic_name(line: str) -> Optional[str]:
    """Synthetic names appear on a single line comment on the line after the marker.
@ -28,6 +38,20 @@ def get_synthetic_name(line: str) -> Optional[str]:
    return None


+def sanitize_code_line(line: str) -> str:
+    """Helper for scope manager. Strips the parts of a code line that
+    could make us count curly brackets that are not really code:
+    char literals, string literals, one-line block comments and
+    trailing // comments.
+    This is a very naive implementation and fails entirely on multi-line
+    strings or comments."""
+
+    # Order matters: quoted literals are blanked out first so that quote
+    # or comment characters inside them cannot be misread by later steps.
+    substitutions = (
+        (singleQuoteRegex, "''"),
+        (doubleQuoteRegex, '""'),
+        (blockCommentRegex, ""),
+        (regularCommentRegex, ""),
+    )
+
+    cleaned = line
+    for pattern, replacement in substitutions:
+        cleaned = pattern.sub(replacement, cleaned)
+
+    return cleaned.strip()
+
+
 def remove_trailing_comment(line: str) -> str:
    return trailingCommentRegex.sub("", line)

@ -75,8 +99,8 @@ def get_class_name(line: str) -> Optional[str]:
    return None


-global_regex = re.compile(r"(?P<name>g_\w+)")
-less_strict_global_regex = re.compile(r"(?P<name>\w+)(?:\)\(|\[.*|\s*=.*|;)")
+# Both patterns accept an optional chain of "scope::" qualifiers in front of
+# the variable name, so a qualified name is captured in full.
+global_regex = re.compile(r"(?P<name>(?:\w+::)*g_\w+)")
+less_strict_global_regex = re.compile(r"(?P<name>(?:\w+::)*\w+)(?:\)\(|\[.*|\s*=.*|;)")


 def get_variable_name(line: str) -> Optional[str]:
--- a/tools/isledecomp/tests/test_curly.py
+++ b/tools/isledecomp/tests/test_curly.py
@ -0,0 +1,73 @@
+# nyuk nyuk nyuk
+import pytest
+from isledecomp.parser.parser import CurlyManager
+from isledecomp.parser.util import sanitize_code_line
+
+
+@pytest.fixture(name="curly")
+def fixture_curly():
+    """Provide a new CurlyManager to tests that request `curly`."""
+    return CurlyManager()
+
+
+def test_simple(curly):
+    """Opening a namespace sets the prefix; its closing brace clears it."""
+    curly.read_line("namespace Test {")
+    assert curly.get_prefix() == "Test"
+    curly.read_line("}")
+    assert curly.get_prefix() == ""
+
+
+def test_oneliner(curly):
+    """Should not go down into a scope for a class forward declaration
+    (a class line that ends with a semicolon)."""
+    curly.read_line("class LegoEntity;")
+    assert curly.get_prefix() == ""
+    # Now make sure that we still would not consider that class name
+    # even after reading the opening curly brace
+    curly.read_line("if (true) {")
+    assert curly.get_prefix() == ""
+
+
+def test_ignore_comments(curly):
+    """A closing brace inside a // comment must not close the scope."""
+    curly.read_line("namespace Test {")
+    curly.read_line("// }")
+    assert curly.get_prefix() == "Test"
+
+
+@pytest.mark.xfail(reason="todo: need a real lexer")
+def test_ignore_multiline_comments(curly):
+    """A brace inside a block comment that spans several lines should be
+    ignored. Marked xfail: lines are read one at a time, so the manager
+    cannot tell that the lone "}" is inside a comment."""
+    curly.read_line("namespace Test {")
+    curly.read_line("/*")
+    curly.read_line("}")
+    curly.read_line("*/")
+    assert curly.get_prefix() == "Test"
+    curly.read_line("}")
+    assert curly.get_prefix() == ""
+
+
+def test_nested(curly):
+    """Nested namespaces are joined with "::" and a closing brace only
+    leaves the innermost one."""
+    curly.read_line("namespace Test {")
+    curly.read_line("namespace Foo {")
+    assert curly.get_prefix() == "Test::Foo"
+    curly.read_line("}")
+    assert curly.get_prefix() == "Test"
+
+
+# Each entry is (input line, expected output of sanitize_code_line).
+sanitize_cases = [
+    ("", ""),
+    ("   ", ""),
+    ("{", "{"),
+    ("// comments {", ""),
+    ("{ // why comment here", "{"),
+    ("/* comments */ {", "{"),
+    ('"curly in a string {"', '""'),
+    ('if (!strcmp("hello { there }", g_test)) {', 'if (!strcmp("", g_test)) {'),
+    ("'{'", "''"),
+    ("weird_function('\"', hello, '\"')", "weird_function('', hello, '')"),
+]
+
+
+@pytest.mark.parametrize("start, end", sanitize_cases)
+def test_sanitize(start: str, end: str):
+    """Make sure that we can remove curly braces in places where they should
+    not be considered as part of the semantic structure of the file,
+    i.e. inside strings or chars, and inside comments.
+    Runs once for each (start, end) pair in sanitize_cases."""
+    assert sanitize_code_line(start) == end
--- a/tools/isledecomp/tests/test_demangler.py
+++ b/tools/isledecomp/tests/test_demangler.py
@ -48,6 +48,7 @@ vtable_cases = [
    ("??_7LegoCarBuildAnimPresenter@@6B@", "LegoCarBuildAnimPresenter"),
    ("??_7?$MxCollection@PAVLegoWorld@@@@6B@", "MxCollection<LegoWorld *>"),
    ("??_7?$MxPtrList@VLegoPathController@@@@6B@", "MxPtrList<LegoPathController>"),
+    ("??_7Renderer@Tgl@@6B@", "Tgl::Renderer"),
 ]


--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@ -521,3 +521,111 @@ def test_string_ignore_g_prefix(parser):
    )
    assert len(parser.strings) == 1
    assert len(parser.alerts) == 0
+
+
+def test_class_variable(parser):
+    """We should accurately name static variables that are class members."""
+
+    parser.read_lines(
+        [
+            "class Test {",
+            "protected:",
+            "  // GLOBAL: TEST 0x1234",
+            "  static int g_test;",
+            "};",
+        ]
+    )
+
+    assert len(parser.variables) == 1
+    # The name of the enclosing class is prepended to the variable name.
+    assert parser.variables[0].name == "Test::g_test"
+
+
+def test_namespace_variable(parser):
+    """We should identify a namespace surrounding any global variables"""
+
+    parser.read_lines(
+        [
+            "namespace Test {",
+            "// GLOBAL: TEST 0x1234",
+            "int g_test = 1234;",
+            "}",
+            "// GLOBAL: TEST 0x5555",
+            "int g_second = 2;",
+        ]
+    )
+
+    assert len(parser.variables) == 2
+    assert parser.variables[0].name == "Test::g_test"
+    # g_second comes after the namespace is closed, so it gets no prefix.
+    assert parser.variables[1].name == "g_second"
+
+
+def test_namespace_vtable(parser):
+    """A vtable for a class inside a namespace should be named with the
+    namespace prefix; a class outside of it should not."""
+    parser.read_lines(
+        [
+            "namespace Tgl {",
+            "// VTABLE: TEST 0x1234",
+            "class Renderer {",
+            "};",
+            "}",
+            "// VTABLE: TEST 0x5555",
+            "class Hello { };",
+        ]
+    )
+
+    assert len(parser.vtables) == 2
+    assert parser.vtables[0].name == "Tgl::Renderer"
+    assert parser.vtables[1].name == "Hello"
+
+
+def test_global_prefix_namespace(parser):
+    """Should correctly identify namespaces before checking for the g_ prefix"""
+
+    parser.read_lines(
+        [
+            "class Test {",
+            "  // GLOBAL: TEST 0x1234",
+            "  static int g_count = 0;",
+            "  // GLOBAL: TEST 0x5555",
+            "  static int count = 0;",
+            "};",
+        ]
+    )
+
+    assert len(parser.variables) == 2
+    assert parser.variables[0].name == "Test::g_count"
+    assert parser.variables[1].name == "Test::count"
+
+    # Only one alert is expected: "count" lacks the g_ prefix, while
+    # "g_count" has it.
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.GLOBAL_MISSING_PREFIX
+
+
+def test_nested_namespace(parser):
+    """A variable in a class that is itself inside a namespace should be
+    named with both scopes, outermost first."""
+    parser.read_lines(
+        [
+            "namespace Tgl {",
+            "class Renderer {",
+            "  // GLOBAL: TEST 0x1234",
+            "  static int g_count = 0;",
+            "};",
+            "};",
+        ]
+    )
+
+    assert len(parser.variables) == 1
+    assert parser.variables[0].name == "Tgl::Renderer::g_count"
+
+
+def test_match_qualified_variable(parser):
+    """If a variable belongs to a scope and we use a fully qualified reference
+    below a GLOBAL marker, make sure we capture the full name."""
+
+    parser.read_lines(
+        [
+            "// GLOBAL: TEST 0x1234",
+            "int MxTest::g_count = 0;",
+        ]
+    )
+
+    assert len(parser.variables) == 1
+    assert parser.variables[0].name == "MxTest::g_count"
+    # No alert expected: the qualified name still ends in a g_ variable.
+    assert len(parser.alerts) == 0