parser: Identify namespaces (#499)

This commit is contained in:
MS 2024-01-28 09:25:45 -05:00 committed by GitHub
parent d9c4151bb8
commit 4137cd75e6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 280 additions and 7 deletions

View file

@ -68,4 +68,9 @@ def demangle_vtable(symbol: str) -> str:
return f"{class_name}<{generic}>"
# If we have two classes listed, it is a namespace hierarchy.
# @@6B@ is a common generic suffix for these vtable symbols.
if t[1] != "" and t[1] != "6B":
return t[1] + "::" + t[0]
return t[0]

View file

@ -8,6 +8,8 @@
get_synthetic_name,
remove_trailing_comment,
get_string_contents,
sanitize_code_line,
scopeDetectRegex,
)
from .marker import (
DecompMarker,
@ -59,6 +61,57 @@ def empty(self):
self.markers = {}
class CurlyManager:
    """Overly simplified scope manager.

    Follows the curly brackets of C++ source, one line at a time, so the
    parser can tell which named scopes (class, struct or namespace) surround
    the current line. The stack holds scope names and the literal string "{"
    for every open bracket; a scope name sits directly below the bracket
    that opened it.
    """

    def __init__(self):
        self._stack = []

    def reset(self):
        """Forget every scope and bracket seen so far."""
        self._stack = []

    def _pop(self):
        """Pop stack safely"""
        try:
            self._stack.pop()
        except IndexError:
            pass

    def _close_brace(self):
        """Unwind the stack for a single closing curly bracket."""
        # Remove entries down to and including the nearest "{". Anything
        # stacked above it is a scope name whose own bracket never appeared.
        while self._stack:
            if self._stack.pop() == "{":
                break

        # A name directly below that bracket is the scope the bracket
        # opened, so that scope ends here as well.
        if self._stack and self._stack[-1] != "{":
            self._pop()

    def get_prefix(self, name: Optional[str] = None) -> str:
        """Return the prefix for where we are.

        With no open named scope the result is `name` itself, or "" when no
        name is given. Otherwise the open scope names are joined with "::",
        and `name` is added at the end unless it is already one of them
        (as happens for a class whose own declaration was just read)."""
        scopes = [t for t in self._stack if t != "{"]

        if not scopes:
            return name if name is not None else ""

        if name is not None and name not in scopes:
            scopes.append(name)

        return "::".join(scopes)

    def read_line(self, raw_line: str):
        """Read a line of code and update the stack."""
        line = sanitize_code_line(raw_line)

        # A scope keyword on a line that ends with a semicolon is a forward
        # reference or a complete one-line declaration: no scope stays open.
        match = scopeDetectRegex.match(line)
        if match is not None and not line.endswith(";"):
            self._stack.append(match.group("name"))

        # Handle brackets in the order they appear rather than by their net
        # count. A scope opened and closed on the same line is then removed
        # again, and several closing brackets on one line each take their
        # own scope name with them.
        for char in line:
            if char == "{":
                self._stack.append("{")
            elif char == "}":
                self._close_brace()
class DecompParser:
# pylint: disable=too-many-instance-attributes
# Could combine output lists into a single list to get under the limit,
@ -73,6 +126,8 @@ def __init__(self) -> None:
self.last_line: str = ""
self.curly = CurlyManager()
# To allow for multiple markers where code is shared across different
# modules, save lists of compatible markers that appear in sequence
self.fun_markers = MarkerDict()
@ -110,6 +165,8 @@ def reset(self):
self.function_start = 0
self.function_sig = ""
self.curly.reset()
@property
def functions(self) -> List[ParserFunction]:
return [s for s in self._symbols if isinstance(s, ParserFunction)]
@ -213,7 +270,7 @@ def _vtable_done(self, class_name: str = None):
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
name=class_name,
name=self.curly.get_prefix(class_name),
)
)
@ -254,7 +311,7 @@ def _variable_done(
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
name=variable_name,
name=self.curly.get_prefix(variable_name),
is_static=self.state == ReaderState.IN_FUNC_GLOBAL,
)
)
@ -353,6 +410,8 @@ def read_line(self, line: str):
self._handle_marker(marker)
return
self.curly.read_line(line)
line_strip = line.strip()
if self.state in (
ReaderState.IN_SYNTHETIC,
@ -451,8 +510,11 @@ def read_line(self, line: str):
variable_name = get_variable_name(line)
# This is out of our control for library variables, but all of our
# variables should start with "g_".
if variable_name is not None and not variable_name.startswith("g_"):
self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)
if variable_name is not None:
# Before checking for the prefix, remove the
# namespace chain if there is one.
if not variable_name.split("::")[-1].startswith("g_"):
self._syntax_warning(ParserError.GLOBAL_MISSING_PREFIX)
string_name = get_string_contents(line)

View file

@ -7,15 +7,25 @@
# flexibility in the formatting seems OK
templateCommentRegex = re.compile(r"\s*//\s+(.*)")

# To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line
trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")

# A single-quoted char literal holding either one plain character or one
# escape sequence (backslash plus any character)
singleQuoteRegex = re.compile(r"('(?:[^\'\\]|\\.)')")

# A complete block comment, delimiters included, that opens and closes
# on the same line
blockCommentRegex = re.compile(r"(/\*.*?\*/)")

# A single-line comment: from the // to the end of the line
regularCommentRegex = re.compile(r"(//.*)")

# A double-quoted string literal; a backslash escape counts as one unit so
# an escaped quote does not end the string early
doubleQuoteRegex = re.compile(r"(\"(?:[^\"\\]|\\.)*\")")

# Detect a line that would cause us to enter a new scope.
# The keyword and the scope name may be separated by any run of whitespace.
# Nothing after the name is matched because it cannot affect the result.
scopeDetectRegex = re.compile(r"(?:class|struct|namespace)\s+(?P<name>\w+)")
def get_synthetic_name(line: str) -> Optional[str]:
"""Synthetic names appear on a single line comment on the line after the marker.
@ -28,6 +38,20 @@ def get_synthetic_name(line: str) -> Optional[str]:
return None
def sanitize_code_line(line: str) -> str:
    """Strip the parts of a code line where a curly bracket carries no
    structural meaning, for the benefit of the scope manager.

    Char and string literals are emptied (their quotes are kept) and
    comments are removed; the result is returned without surrounding
    whitespace. Each line is handled on its own, so strings or comments
    that span several lines are not recognized."""
    # Literals are emptied before comments are removed, so comment markers
    # inside a quoted string are gone by the time the comment patterns run.
    substitutions = (
        (singleQuoteRegex, "''"),
        (doubleQuoteRegex, '""'),
        (blockCommentRegex, ""),
        (regularCommentRegex, ""),
    )

    cleaned = line
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    return cleaned.strip()
def remove_trailing_comment(line: str) -> str:
return trailingCommentRegex.sub("", line)
@ -75,8 +99,8 @@ def get_class_name(line: str) -> Optional[str]:
return None
global_regex = re.compile(r"(?P<name>g_\w+)")
less_strict_global_regex = re.compile(r"(?P<name>\w+)(?:\)\(|\[.*|\s*=.*|;)")
global_regex = re.compile(r"(?P<name>(?:\w+::)*g_\w+)")
less_strict_global_regex = re.compile(r"(?P<name>(?:\w+::)*\w+)(?:\)\(|\[.*|\s*=.*|;)")
def get_variable_name(line: str) -> Optional[str]:

View file

@ -0,0 +1,73 @@
# nyuk nyuk nyuk -- tests for CurlyManager scope tracking and sanitize_code_line
import pytest
from isledecomp.parser.parser import CurlyManager
from isledecomp.parser.util import sanitize_code_line
@pytest.fixture(name="curly")
def fixture_curly():
    """Give each test its own, empty CurlyManager."""
    manager = CurlyManager()
    return manager
def test_simple(curly):
    """Opening a namespace sets the prefix; closing it clears the prefix."""
    steps = (
        ("namespace Test {", "Test"),
        ("}", ""),
    )
    for code_line, expected_prefix in steps:
        curly.read_line(code_line)
        assert curly.get_prefix() == expected_prefix
def test_oneliner(curly):
    """Should not go down into a scope for a class forward reference"""
    # The second line checks that the class name from the forward reference
    # is still not picked up once an unrelated opening bracket is read.
    for code_line in ("class LegoEntity;", "if (true) {"):
        curly.read_line(code_line)
        assert curly.get_prefix() == ""
def test_ignore_comments(curly):
    """A closing bracket inside a // comment must not end the scope."""
    for code_line in ("namespace Test {", "// }"):
        curly.read_line(code_line)

    assert curly.get_prefix() == "Test"
@pytest.mark.xfail(reason="todo: need a real lexer")
def test_ignore_multiline_comments(curly):
    """A closing bracket inside a block comment spanning several lines
    should be ignored. Marked as an expected failure."""
    commented_out_bracket = ["namespace Test {", "/*", "}", "*/"]
    for code_line in commented_out_bracket:
        curly.read_line(code_line)
    assert curly.get_prefix() == "Test"

    # The bracket outside the comment is the one that ends the namespace.
    curly.read_line("}")
    assert curly.get_prefix() == ""
def test_nested(curly):
    """Nested namespaces are joined with "::" and unwind one level at a time."""
    outer, inner = "namespace Test {", "namespace Foo {"

    curly.read_line(outer)
    curly.read_line(inner)
    assert curly.get_prefix() == "Test::Foo"

    curly.read_line("}")
    assert curly.get_prefix() == "Test"
# Pairs of (raw code line, expected sanitized line) for test_sanitize.
sanitize_cases = [
    # Empty and whitespace-only lines
    ("", ""),
    (" ", ""),
    # A plain bracket is left alone
    ("{", "{"),
    # Comments are removed, code around them is kept
    ("// comments {", ""),
    ("{ // why comment here", "{"),
    ("/* comments */ {", "{"),
    # String contents are emptied, the quotes remain
    ('"curly in a string {"', '""'),
    (
        'if (!strcmp("hello { there }", g_test)) {',
        'if (!strcmp("", g_test)) {',
    ),
    # Char literals are emptied too, including a quoted double quote
    ("'{'", "''"),
    (
        "weird_function('\"', hello, '\"')",
        "weird_function('', hello, '')",
    ),
]
@pytest.mark.parametrize("start, end", sanitize_cases)
def test_sanitize(start: str, end: str):
    """Curly brackets that are not part of the semantic structure of the
    file (those inside strings, chars or comments) must be gone after
    sanitizing, while real code is kept."""
    sanitized = sanitize_code_line(start)
    assert sanitized == end

View file

@ -48,6 +48,7 @@ def test_invalid_encoded_number():
("??_7LegoCarBuildAnimPresenter@@6B@", "LegoCarBuildAnimPresenter"),
("??_7?$MxCollection@PAVLegoWorld@@@@6B@", "MxCollection<LegoWorld *>"),
("??_7?$MxPtrList@VLegoPathController@@@@6B@", "MxPtrList<LegoPathController>"),
("??_7Renderer@Tgl@@6B@", "Tgl::Renderer"),
]

View file

@ -521,3 +521,111 @@ def test_string_ignore_g_prefix(parser):
)
assert len(parser.strings) == 1
assert len(parser.alerts) == 0
def test_class_variable(parser):
    """A static member variable is named with its class as the prefix."""
    source = [
        "class Test {",
        "protected:",
        " // GLOBAL: TEST 0x1234",
        " static int g_test;",
        "};",
    ]
    parser.read_lines(source)

    found = parser.variables
    assert len(found) == 1
    assert found[0].name == "Test::g_test"
def test_namespace_variable(parser):
    """A global inside a namespace gets the namespace prefix; a global that
    follows the closing bracket does not."""
    source = [
        "namespace Test {",
        "// GLOBAL: TEST 0x1234",
        "int g_test = 1234;",
        "}",
        "// GLOBAL: TEST 0x5555",
        "int g_second = 2;",
    ]
    parser.read_lines(source)

    found = parser.variables
    assert len(found) == 2

    inside, outside = found[0], found[1]
    assert inside.name == "Test::g_test"
    assert outside.name == "g_second"
def test_namespace_vtable(parser):
    """A vtable for a class inside a namespace is named with the namespace
    prefix; a class declared after the namespace closes is not."""
    source = [
        "namespace Tgl {",
        "// VTABLE: TEST 0x1234",
        "class Renderer {",
        "};",
        "}",
        "// VTABLE: TEST 0x5555",
        "class Hello { };",
    ]
    parser.read_lines(source)

    found = parser.vtables
    assert len(found) == 2

    namespaced, plain = found[0], found[1]
    assert namespaced.name == "Tgl::Renderer"
    assert plain.name == "Hello"
def test_global_prefix_namespace(parser):
    """The g_ prefix check applies to the variable's own name, not to the
    scope in front of it: only the member without g_ raises an alert."""
    source = [
        "class Test {",
        " // GLOBAL: TEST 0x1234",
        " static int g_count = 0;",
        " // GLOBAL: TEST 0x5555",
        " static int count = 0;",
        "};",
    ]
    parser.read_lines(source)

    found = parser.variables
    assert len(found) == 2
    assert found[0].name == "Test::g_count"
    assert found[1].name == "Test::count"

    raised = parser.alerts
    assert len(raised) == 1
    assert raised[0].code == ParserError.GLOBAL_MISSING_PREFIX
def test_nested_namespace(parser):
    """A static member of a class inside a namespace carries both scope
    names, outermost first."""
    source = [
        "namespace Tgl {",
        "class Renderer {",
        " // GLOBAL: TEST 0x1234",
        " static int g_count = 0;",
        "};",
        "};",
    ]
    parser.read_lines(source)

    found = parser.variables
    assert len(found) == 1
    assert found[0].name == "Tgl::Renderer::g_count"
def test_match_qualified_variable(parser):
    """When the line below a GLOBAL marker uses a fully qualified name, the
    whole name is captured and no alert is raised."""
    source = [
        "// GLOBAL: TEST 0x1234",
        "int MxTest::g_count = 0;",
    ]
    parser.read_lines(source)

    found = parser.variables
    assert len(found) == 1
    assert found[0].name == "MxTest::g_count"
    assert len(parser.alerts) == 0