reccmp: Unique addresses and stub reporting (#554)

This commit is contained in:
MS 2024-02-13 20:25:51 -05:00 committed by GitHub
parent eb3b339454
commit 1b696e4bd8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 326 additions and 99 deletions

View file

@ -156,7 +156,7 @@ class LegoWorld : public LegoEntity {
// _Tree<MxCore *,MxCore *,set<MxCore *,CoreSetCompare,allocator<MxCore *> >::_Kfn,CoreSetCompare,allocator<MxCore *> >::find
// TEMPLATE: LEGO1 0x10022360
// _Construct
// ?_Construct@@YAXPAPAVMxCore@@ABQAV1@@Z
// GLOBAL: LEGO1 0x100f11a0
// _Tree<MxCore *,MxCore *,set<MxCore *,CoreSetCompare,allocator<MxCore *> >::_Kfn,CoreSetCompare,allocator<MxCore *> >::_Nil

View file

@ -20,6 +20,7 @@
@dataclass
class DiffReport:
# pylint: disable=too-many-instance-attributes
match_type: SymbolType
orig_addr: int
recomp_addr: int
@ -27,6 +28,7 @@ class DiffReport:
udiff: Optional[List[str]] = None
ratio: float = 0.0
is_effective_match: bool = False
is_stub: bool = False
@property
def effective_ratio(self) -> float:
@ -130,7 +132,23 @@ def _load_cvdump(self):
raw = self.recomp_bin.read(addr, sym.size())
try:
sym.friendly_name = raw.decode("latin1").rstrip("\x00")
# We use the string length reported in the mangled symbol as the
# data size, but this is not always accurate with respect to the
# null terminator.
# e.g. ??_C@_0BA@EFDM@MxObjectFactory?$AA@
# reported length: 16 (includes null terminator)
# c.f. ??_C@_03DPKJ@enz?$AA@
# reported length: 3 (does NOT include terminator)
# This will handle the case where the entire string contains "\x00"
# because those are distinct from the empty string of length 0.
decoded_string = raw.decode("latin1")
rstrip_string = decoded_string.rstrip("\x00")
if decoded_string != "" and rstrip_string != "":
sym.friendly_name = rstrip_string
else:
sym.friendly_name = decoded_string
except UnicodeDecodeError:
pass
@ -162,12 +180,12 @@ def _load_markers(self):
if recomp_addr is not None:
self._db.set_function_pair(fun.offset, recomp_addr)
if fun.should_skip():
self._db.skip_compare(fun.offset)
self._db.mark_stub(fun.offset)
for fun in codebase.iter_name_functions():
self._db.match_function(fun.offset, fun.name)
if fun.should_skip():
self._db.skip_compare(fun.offset)
self._db.mark_stub(fun.offset)
for var in codebase.iter_variables():
if var.is_static and var.parent_function is not None:
@ -255,15 +273,6 @@ def _match_thunks(self):
self._db.skip_compare(thunk_from_orig)
def _compare_function(self, match: MatchInfo) -> DiffReport:
if match.size == 0:
# Report a failed match to make the user aware of the empty function.
return DiffReport(
match_type=SymbolType.FUNCTION,
orig_addr=match.orig_addr,
recomp_addr=match.recomp_addr,
name=match.name,
)
orig_raw = self.orig_bin.read(match.orig_addr, match.size)
recomp_raw = self.recomp_bin.read(match.recomp_addr, match.size)
@ -406,6 +415,23 @@ def match_text(
def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
"""Router for comparison type"""
if match.size == 0:
return None
options = self._db.get_match_options(match.orig_addr)
if options.get("skip", False):
return None
if options.get("stub", False):
return DiffReport(
match_type=match.compare_type,
orig_addr=match.orig_addr,
recomp_addr=match.recomp_addr,
name=match.name,
is_stub=True,
)
if match.compare_type == SymbolType.FUNCTION:
return self._compare_function(match)
@ -440,7 +466,9 @@ def compare_all(self) -> Iterable[DiffReport]:
def compare_functions(self) -> Iterable[DiffReport]:
for match in self.get_functions():
yield self._compare_match(match)
diff = self._compare_match(match)
if diff is not None:
yield diff
def compare_variables(self):
pass
@ -453,4 +481,6 @@ def compare_strings(self):
def compare_vtables(self) -> Iterable[DiffReport]:
for match in self.get_vtables():
yield self._compare_match(match)
diff = self._compare_match(match)
if diff is not None:
yield self._compare_match(match)

View file

@ -7,15 +7,30 @@
_SETUP_SQL = """
DROP TABLE IF EXISTS `symbols`;
DROP TABLE IF EXISTS `match_options`;
CREATE TABLE `symbols` (
compare_type int,
orig_addr int,
recomp_addr int,
name text,
decorated_name text,
size int,
should_skip int default(FALSE)
size int
);
CREATE TABLE `match_options` (
addr int not null,
name text not null,
value text,
primary key (addr, name)
) without rowid;
CREATE VIEW IF NOT EXISTS `match_info`
(compare_type, orig_addr, recomp_addr, name, size) AS
SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
ORDER BY orig_addr NULLS LAST;
CREATE INDEX `symbols_or` ON `symbols` (orig_addr);
CREATE INDEX `symbols_re` ON `symbols` (recomp_addr);
CREATE INDEX `symbols_na` ON `symbols` (name);
@ -69,6 +84,11 @@ def set_recomp_symbol(
decorated_name: Optional[str],
size: Optional[int],
):
# Ignore collisions here. The same recomp address can have
# multiple names (e.g. _strlwr and __strlwr)
if self.recomp_used(addr):
return
compare_value = compare_type.value if compare_type is not None else None
self._db.execute(
"INSERT INTO `symbols` (recomp_addr, compare_type, name, decorated_name, size) VALUES (?,?,?,?,?)",
@ -86,24 +106,16 @@ def get_unmatched_strings(self) -> List[str]:
return [string for (string,) in cur.fetchall()]
def get_all(self) -> List[MatchInfo]:
cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
ORDER BY orig_addr NULLS LAST
""",
)
cur = self._db.execute("SELECT * FROM `match_info`")
cur.row_factory = matchinfo_factory
return cur.fetchall()
def get_matches(self) -> Optional[MatchInfo]:
cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
"""SELECT * FROM `match_info`
WHERE orig_addr IS NOT NULL
AND recomp_addr IS NOT NULL
AND should_skip IS FALSE
ORDER BY orig_addr
""",
)
cur.row_factory = matchinfo_factory
@ -112,11 +124,9 @@ def get_matches(self) -> Optional[MatchInfo]:
def get_one_match(self, addr: int) -> Optional[MatchInfo]:
cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
"""SELECT * FROM `match_info`
WHERE orig_addr = ?
AND recomp_addr IS NOT NULL
AND should_skip IS FALSE
""",
(addr,),
)
@ -125,8 +135,7 @@ def get_one_match(self, addr: int) -> Optional[MatchInfo]:
def get_by_orig(self, addr: int) -> Optional[MatchInfo]:
cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
"""SELECT * FROM `match_info`
WHERE orig_addr = ?
""",
(addr,),
@ -136,8 +145,7 @@ def get_by_orig(self, addr: int) -> Optional[MatchInfo]:
def get_by_recomp(self, addr: int) -> Optional[MatchInfo]:
cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
"""SELECT * FROM `match_info`
WHERE recomp_addr = ?
""",
(addr,),
@ -147,13 +155,10 @@ def get_by_recomp(self, addr: int) -> Optional[MatchInfo]:
def get_matches_by_type(self, compare_type: SymbolType) -> List[MatchInfo]:
cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
"""SELECT * FROM `match_info`
WHERE compare_type = ?
AND orig_addr IS NOT NULL
AND recomp_addr IS NOT NULL
AND should_skip IS FALSE
ORDER BY orig_addr
""",
(compare_type.value,),
)
@ -161,9 +166,21 @@ def get_matches_by_type(self, compare_type: SymbolType) -> List[MatchInfo]:
return cur.fetchall()
def orig_used(self, addr: int) -> bool:
cur = self._db.execute("SELECT 1 FROM symbols WHERE orig_addr = ?", (addr,))
return cur.fetchone() is not None
def recomp_used(self, addr: int) -> bool:
cur = self._db.execute("SELECT 1 FROM symbols WHERE recomp_addr = ?", (addr,))
return cur.fetchone() is not None
def set_pair(
self, orig: int, recomp: int, compare_type: Optional[SymbolType] = None
) -> bool:
if self.orig_used(orig):
logger.error("Original address %s not unique!", hex(orig))
return False
compare_value = compare_type.value if compare_type is not None else None
cur = self._db.execute(
"UPDATE `symbols` SET orig_addr = ?, compare_type = ? WHERE recomp_addr = ?",
@ -174,14 +191,90 @@ def set_pair(
def set_function_pair(self, orig: int, recomp: int) -> bool:
"""For lineref match or _entry"""
self.set_pair(orig, recomp, SymbolType.FUNCTION)
# TODO: Both ways required?
return self.set_pair(orig, recomp, SymbolType.FUNCTION)
def _set_opt_bool(self, addr: int, option: str, enabled: bool = True):
if enabled:
self._db.execute(
"""INSERT OR IGNORE INTO `match_options`
(addr, name)
VALUES (?, ?)""",
(addr, option),
)
else:
self._db.execute(
"""DELETE FROM `match_options` WHERE addr = ? AND name = ?""",
(addr, option),
)
def mark_stub(self, orig: int):
self._set_opt_bool(orig, "stub")
def skip_compare(self, orig: int):
self._db.execute(
"UPDATE `symbols` SET should_skip = TRUE WHERE orig_addr = ?", (orig,)
self._set_opt_bool(orig, "skip")
def get_match_options(self, addr: int) -> Optional[dict]:
cur = self._db.execute(
"""SELECT name, value FROM `match_options` WHERE addr = ?""", (addr,)
)
return {
option: value if value is not None else True
for (option, value) in cur.fetchall()
}
def _find_potential_match(
self, name: str, compare_type: SymbolType
) -> Optional[int]:
"""Name lookup"""
match_decorate = compare_type != SymbolType.STRING and name.startswith("?")
if match_decorate:
sql = """
SELECT recomp_addr
FROM `symbols`
WHERE orig_addr IS NULL
AND decorated_name = ?
AND (compare_type IS NULL OR compare_type = ?)
LIMIT 1
"""
else:
sql = """
SELECT recomp_addr
FROM `symbols`
WHERE orig_addr IS NULL
AND name = ?
AND (compare_type IS NULL OR compare_type = ?)
LIMIT 1
"""
row = self._db.execute(sql, (name, compare_type.value)).fetchone()
return row[0] if row is not None else None
def _find_static_variable(
self, variable_name: str, function_sym: str
) -> Optional[int]:
"""Get the recomp address of a static function variable.
Matches using a LIKE clause on the combination of:
1. The variable name read from decomp marker.
2. The decorated name of the enclosing function.
For example, the variable "g_startupDelay" from function "IsleApp::Tick"
has symbol: `?g_startupDelay@?1??Tick@IsleApp@@QAEXH@Z@4HA`
The function's decorated name is: `?Tick@IsleApp@@QAEXH@Z`"""
row = self._db.execute(
"""SELECT recomp_addr FROM `symbols`
WHERE decorated_name LIKE '%' || ? || '%' || ? || '%'
AND orig_addr IS NULL
AND (compare_type = ? OR compare_type = ? OR compare_type IS NULL)""",
(
variable_name,
function_sym,
SymbolType.DATA.value,
SymbolType.POINTER.value,
),
).fetchone()
return row[0] if row is not None else None
def _match_on(self, compare_type: SymbolType, addr: int, name: str) -> bool:
# Update the compare_type here too since the marker tells us what we should do
@ -191,16 +284,11 @@ def _match_on(self, compare_type: SymbolType, addr: int, name: str) -> bool:
name = name[:255]
logger.debug("Looking for %s %s", compare_type.name.lower(), name)
cur = self._db.execute(
"""UPDATE `symbols`
SET orig_addr = ?, compare_type = ?
WHERE name = ?
AND orig_addr IS NULL
AND (compare_type = ? OR compare_type IS NULL)""",
(addr, compare_type.value, name, compare_type.value),
)
recomp_addr = self._find_potential_match(name, compare_type)
if recomp_addr is None:
return False
return cur.rowcount > 0
return self.set_pair(addr, recomp_addr, compare_type)
def match_function(self, addr: int, name: str) -> bool:
did_match = self._match_on(SymbolType.FUNCTION, addr, name)
@ -234,37 +322,20 @@ def match_static_variable(self, addr: int, name: str, function_addr: int) -> boo
# Get the friendly name for the "failed to match" error message
(function_name, decorated_name) = result
# Now we have to combine the variable name (read from the marker)
# and the decorated name of the enclosing function (the above variable)
# into a LIKE clause and try to match.
# For example, the variable "g_startupDelay" from function "IsleApp::Tick"
# has symbol: "?g_startupDelay@?1??Tick@IsleApp@@QAEXH@Z@4HA"
# The function's decorated name is: "?Tick@IsleApp@@QAEXH@Z"
cur = self._db.execute(
"""UPDATE `symbols`
SET orig_addr = ?
WHERE name LIKE '%' || ? || '%' || ? || '%'
AND orig_addr IS NULL
AND (compare_type = ? OR compare_type = ? OR compare_type IS NULL)""",
(
addr,
name,
decorated_name,
SymbolType.DATA.value,
SymbolType.POINTER.value,
),
recomp_addr = self._find_static_variable(name, decorated_name)
if recomp_addr is not None:
# TODO: This variable could be a pointer, but I don't think we
# have a way to tell that right now.
if self.set_pair(addr, recomp_addr, SymbolType.DATA):
return True
logger.error(
"Failed to match static variable %s from function %s",
name,
function_name,
)
did_match = cur.rowcount > 0
if not did_match:
logger.error(
"Failed to match static variable %s from function %s",
name,
function_name,
)
return did_match
return False
def match_variable(self, addr: int, name: str) -> bool:
did_match = self._match_on(SymbolType.DATA, addr, name) or self._match_on(

View file

@ -4,6 +4,9 @@
def print_diff(udiff, plain):
if udiff is None:
return False
has_diff = False
for line in udiff:
has_diff = True

View file

@ -0,0 +1,82 @@
"""Testing compare database behavior, particularly matching"""
import pytest
from isledecomp.compare.db import CompareDb
@pytest.fixture(name="db")
def fixture_db():
return CompareDb()
def test_ignore_recomp_collision(db):
"""Duplicate recomp addresses are ignored"""
db.set_recomp_symbol(0x1234, None, "hello", None, 100)
db.set_recomp_symbol(0x1234, None, "alias_for_hello", None, 100)
syms = db.get_all()
assert len(syms) == 1
def test_orig_collision(db):
"""Don't match if the original address is not unique"""
db.set_recomp_symbol(0x1234, None, "hello", None, 100)
assert db.match_function(0x5555, "hello") is True
# Second run on same address fails
assert db.match_function(0x5555, "hello") is False
# Call set_pair directly without wrapper
assert db.set_pair(0x5555, 0x1234) is False
def test_name_match(db):
db.set_recomp_symbol(0x1234, None, "hello", None, 100)
assert db.match_function(0x5555, "hello") is True
match = db.get_by_orig(0x5555)
assert match.name == "hello"
assert match.recomp_addr == 0x1234
def test_match_decorated(db):
"""Should match using decorated name even though regular name is null"""
db.set_recomp_symbol(0x1234, None, None, "?_hello", 100)
assert db.match_function(0x5555, "?_hello") is True
match = db.get_by_orig(0x5555)
assert match is not None
def test_duplicate_name(db):
"""If recomp name is not unique, match only one row"""
db.set_recomp_symbol(0x100, None, "_Construct", None, 100)
db.set_recomp_symbol(0x200, None, "_Construct", None, 100)
db.set_recomp_symbol(0x300, None, "_Construct", None, 100)
db.match_function(0x5555, "_Construct")
matches = db.get_matches()
# We aren't testing _which_ one would be matched, just that only one _was_ matched
assert len(matches) == 1
def test_static_variable_match(db):
"""Set up a situation where we can match a static function variable, then match it."""
# We need a matched function to start with.
db.set_recomp_symbol(0x1234, None, "Isle::Tick", "?Tick@IsleApp@@QAEXH@Z", 100)
db.match_function(0x5555, "Isle::Tick")
# Decorated variable name from PDB.
db.set_recomp_symbol(
0x2000, None, None, "?g_startupDelay@?1??Tick@IsleApp@@QAEXH@Z@4HA", 4
)
# Provide variable name and orig function address from decomp markers
assert db.match_static_variable(0xBEEF, "g_startupDelay", 0x5555) is True
def test_match_options_bool(db):
"""Test handling of boolean match options"""
# You don't actually need an existing orig addr for this.
assert db.get_match_options(0x1234) == {}
db.mark_stub(0x1234)
assert "stub" in db.get_match_options(0x1234)

View file

@ -15,6 +15,7 @@
True,
),
("??_C@_00A@?$AA@", 0, False),
("??_C@_01A@?$AA?$AA@", 1, False),
]

View file

@ -95,6 +95,10 @@ def print_match_verbose(match, show_both_addrs: bool = False, is_plain: bool = F
else:
addrs = hex(match.orig_addr)
if match.is_stub:
print(f"{addrs}: {match.name} is a stub. No diff.")
return
if match.effective_ratio == 1.0:
ok_text = (
"OK!"
@ -125,7 +129,10 @@ def print_match_oneline(match, show_both_addrs: bool = False, is_plain: bool = F
else:
addrs = hex(match.orig_addr)
print(f" {match.name} ({addrs}) is {percenttext} similar to the original")
if match.is_stub:
print(f" {match.name} ({addrs}) is a stub.")
else:
print(f" {match.name} ({addrs}) is {percenttext} similar to the original")
def parse_args() -> argparse.Namespace:
@ -180,6 +187,11 @@ def virtual_address(value) -> int:
action="store_true",
help="Print addresses of recompiled functions too",
)
parser.add_argument(
"--silent",
action="store_true",
help="Don't display text summary of matches",
)
parser.set_defaults(loglevel=logging.INFO)
parser.add_argument(
@ -244,25 +256,31 @@ def main():
htmlinsert = []
for match in isle_compare.compare_all():
print_match_oneline(
match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
)
if not args.silent:
print_match_oneline(
match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
)
if match.match_type == SymbolType.FUNCTION:
if match.match_type == SymbolType.FUNCTION and not match.is_stub:
function_count += 1
total_accuracy += match.ratio
total_effective_accuracy += match.effective_ratio
# If html, record the diffs to an HTML file
if args.html is not None:
htmlinsert.append(
{
"address": f"0x{match.orig_addr:x}",
"name": match.name,
"matching": match.effective_ratio,
"diff": "\n".join(match.udiff),
}
)
html_obj = {
"address": f"0x{match.orig_addr:x}",
"name": match.name,
"matching": match.effective_ratio,
}
if match.udiff is not None:
html_obj["diff"] = "\n".join(match.udiff)
if match.is_stub:
html_obj["stub"] = True
htmlinsert.append(html_obj)
## Generate files and show summary.

View file

@ -110,7 +110,14 @@
var decCel = row.appendChild(document.createElement('td'));
decCel.colSpan = 3;
var diff = data[this.dataset.index].diff;
if (diff == '') {
const stub = "stub" in data[this.dataset.index];
if (stub) {
diff = document.createElement('div');
diff.className = 'identical';
diff.innerText = 'Stub. No diff.';
decCel.appendChild(diff);
} else if (diff == '') {
diff = document.createElement('div');
diff.className = 'identical';
diff.innerText = 'Identical function - no diff';
@ -132,7 +139,7 @@
}
}
const filterOptions = { text: '', hidePerfect: false };
const filterOptions = { text: '', hidePerfect: false, hideStub: false };
function filter() {
closeAllDiffs();
@ -144,13 +151,15 @@
for (var ele of collection) {
var eledata = data[ele.dataset.index];
const stubOk = (!filterOptions.hideStub || !("stub" in eledata));
const textOk = (ltext == ''
|| eledata.address.toLowerCase().includes(ltext)
|| eledata.name.toLowerCase().includes(ltext));
const perfOk = (!filterOptions.hidePerfect || (eledata.matching < 1));
if (textOk && perfOk) {
if (stubOk && textOk && perfOk) {
ele.style.display = '';
searchCount++;
} else {
@ -233,9 +242,14 @@
addrCel.innerText = addrCel.dataset.value = element.address;
nameCel.innerText = nameCel.dataset.value = element.name;
var effectiveNote = (element.matching == 1 && element.diff != '') ? '*' : '';
matchCel.innerHTML = (element.matching * 100).toFixed(2) + '%' + effectiveNote;
matchCel.dataset.value = element.matching;
if ("stub" in element) {
matchCel.innerHTML = 'stub'
matchCel.dataset.value = -1;
} else {
var effectiveNote = (element.matching == 1 && element.diff != '') ? '*' : '';
matchCel.innerHTML = (element.matching * 100).toFixed(2) + '%' + effectiveNote;
matchCel.dataset.value = element.matching;
}
row.classList.add('funcrow');
row.addEventListener('click', rowClick);
@ -255,6 +269,12 @@
filter();
})
const cbHideStub = document.querySelector('#cbHideStub');
cbHideStub.addEventListener('change', evt => {
filterOptions.hideStub = evt.target.checked;
filter();
})
sortByColumn(0);
});
</script>
@ -266,6 +286,8 @@ <h1>Decompilation Status</h1>
<div class="filters">
<label for="cbHidePerfect">Hide 100% match</label>
<input type="checkbox" id="cbHidePerfect" />
<label for="cbHideStub">Hide stubs</label>
<input type="checkbox" id="cbHideStub" />
</div>
<table id="listing">
<tr id='listingheader'><th style='width: 20%'>Address</th><th style="width:60%">Name</th><th style='width: 20%'>Matching</th></tr>