From 6fda6ca92bdcbedffb54b1e3e351090e5d457112 Mon Sep 17 00:00:00 2001 From: MS Date: Fri, 29 Mar 2024 13:29:44 -0400 Subject: [PATCH] reccmp: Don't use placeholder for address comparison (#751) --- .../isledecomp/compare/asm/parse.py | 26 +++++++++++--- tools/isledecomp/tests/test_sanitize.py | 35 +++++++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/tools/isledecomp/isledecomp/compare/asm/parse.py b/tools/isledecomp/isledecomp/compare/asm/parse.py index 459cbd3d..d69fe76b 100644 --- a/tools/isledecomp/isledecomp/compare/asm/parse.py +++ b/tools/isledecomp/isledecomp/compare/asm/parse.py @@ -69,14 +69,16 @@ def float_replace(self, addr: int, data_size: int) -> Optional[str]: return None - def lookup(self, addr: int) -> Optional[str]: + def lookup(self, addr: int, use_cache: bool = True) -> Optional[str]: """Return a replacement name for this address if we find one.""" - if (cached := self.replacements.get(addr, None)) is not None: + if use_cache and (cached := self.replacements.get(addr, None)) is not None: return cached if callable(self.name_lookup): if (name := self.name_lookup(addr)) is not None: - self.replacements[addr] = name + if use_cache: + self.replacements[addr] = name + return name return None @@ -110,6 +112,16 @@ def hex_replace_relocated(self, match: re.Match) -> str: return match.group(0) + def hex_replace_annotated(self, match: re.Match) -> str: + """For replacing immediate value operands. Here we replace the value + only if the name lookup returns something. Do not use a placeholder.""" + value = int(match.group(1), 16) + placeholder = self.lookup(value, use_cache=False) + if placeholder is not None: + return match.group(0).replace(match.group(1), placeholder) + + return match.group(0) + def hex_replace_float(self, match: re.Match) -> str: """Special case for replacements on float instructions. If the pointer is a float constant, read it from the binary.""" @@ -178,7 +190,13 @@ def sanitize(self, inst: DisasmLiteInst) -> Tuple[str, str]: # vtable call, or this->member access. op_str = displace_replace_regex.sub(self.hex_replace_relocated, op_str) - op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str) + # In the event of pointer comparison, only replace the immediate value + # if it is a known address. + if inst.mnemonic == "cmp": + op_str = immediate_replace_regex.sub(self.hex_replace_annotated, op_str) + else: + op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str) + return (inst.mnemonic, op_str) def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]: diff --git a/tools/isledecomp/tests/test_sanitize.py b/tools/isledecomp/tests/test_sanitize.py index 200cbf0a..d13fae56 100644 --- a/tools/isledecomp/tests/test_sanitize.py +++ b/tools/isledecomp/tests/test_sanitize.py @@ -221,3 +221,38 @@ def substitute_float(_: int, __: int) -> str: inst = DisasmLiteInst(0x1000, 6, "fld", "dword ptr [0x1234]") (_, op_str) = p.sanitize(inst) assert op_str == "dword ptr [g_myFloatVariable]" + + +def test_pointer_compare(): + """A loop on an array could get optimized into comparing on the address + that immediately follows the array. This may or may not be a valid address + and it may or may not be annotated. To avoid a situation where an + erroneous address value would get replaced with a placeholder and silently + pass the comparison check, we will only replace an immediate value on the + CMP instruction if it is a known address.""" + + # 0x1234 and 0x5555 are relocated and so are considered to be addresses. + def relocate_lookup(addr: int) -> bool: + return addr in (0x1234, 0x5555) + + # Only 0x5555 is a "known" address + def name_lookup(addr: int) -> Optional[str]: + return "hello" if addr == 0x5555 else None + + p = ParseAsm(relocate_lookup=relocate_lookup, name_lookup=name_lookup) + + # Will always replace on MOV instruction + (_, op_str) = p.sanitize(mock_inst("mov", "eax, 0x1234")) + assert op_str == "eax, " + (_, op_str) = p.sanitize(mock_inst("mov", "eax, 0x5555")) + assert op_str == "eax, hello" + + # n.b. We have already cached the replacement for 0x1234, but the + # special handling for CMP should skip the cache and not use it. + + # Do not replace here + (_, op_str) = p.sanitize(mock_inst("cmp", "eax, 0x1234")) + assert op_str == "eax, 0x1234" + # Should replace here + (_, op_str) = p.sanitize(mock_inst("cmp", "eax, 0x5555")) + assert op_str == "eax, hello"