mirror of
https://github.com/isledecomp/isle-portable.git
synced 2024-11-29 10:55:42 -05:00
reccmp.py improvements (#82)
* Rather than using <OFFSET> as a replacement for all offsets in a function, label the offsets as <OFFSET1>, <OFFSET2>, etc. Doing this avoids false-positive 100% matches that result from the same function being called twice where a different one should have been called, or vice versa. The same applies to globals. I already encountered one case of this in the wild. * When a 100% match initially fails, try to make the functions match by swapping register allocations. This makes it possible to get a 100% match where the generated machine code differs only in register allocation. * Only apply the above when it is possible to reach a 100% match in that way. Otherwise show the developer the unadulterated diff to avoid complicating decompilation. * In the result listing, show the functions which are "effective matches" in this way as "100%*" instead of "100%".
This commit is contained in:
parent
f8b1995a83
commit
f247e10b7e
2 changed files with 113 additions and 15 deletions
|
@ -10,6 +10,7 @@
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import colorama
|
import colorama
|
||||||
|
import re
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(allow_abbrev=False,
|
parser = argparse.ArgumentParser(allow_abbrev=False,
|
||||||
description='Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.')
|
description='Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.')
|
||||||
|
@ -248,9 +249,21 @@ def get_recompiled_address(self, filename, line):
|
||||||
|
|
||||||
md = Cs(CS_ARCH_X86, CS_MODE_32)
|
md = Cs(CS_ARCH_X86, CS_MODE_32)
|
||||||
|
|
||||||
def sanitize(file, mnemonic, op_str):
|
class OffsetPlaceholderGenerator:
    """Hands out numbered '<OFFSETn>' labels, one per distinct address.

    The first address seen gets '<OFFSET1>', the next new address gets
    '<OFFSET2>', and so on. Asking for an address that was already seen
    returns the label it was given the first time.
    """

    def __init__(self):
        # Number of distinct addresses labelled so far.
        self.counter = 0
        # Maps each address already seen to the label assigned to it.
        self.replacements = {}

    def get(self, addr):
        """Return the placeholder label for addr, creating one if needed."""
        known = self.replacements.get(addr)
        if known is not None:
            return known

        self.counter += 1
        label = '<OFFSET%d>' % self.counter
        self.replacements[addr] = label
        return label
|
||||||
|
|
||||||
|
def sanitize(file, placeholderGenerator, mnemonic, op_str):
|
||||||
op_str_is_number = False
|
op_str_is_number = False
|
||||||
try:
|
try:
|
||||||
int(op_str, 16)
|
int(op_str, 16)
|
||||||
|
@ -262,7 +275,7 @@ def sanitize(file, mnemonic, op_str):
|
||||||
# Filter out "calls" because the offsets we're not currently trying to
|
# Filter out "calls" because the offsets we're not currently trying to
|
||||||
# match offsets. As long as there's a call in the right place, it's
|
# match offsets. As long as there's a call in the right place, it's
|
||||||
# probably accurate.
|
# probably accurate.
|
||||||
op_str = offsetplaceholder
|
op_str = placeholderGenerator.get(int(op_str, 16))
|
||||||
else:
|
else:
|
||||||
def filter_out_ptr(ptype, op_str):
|
def filter_out_ptr(ptype, op_str):
|
||||||
try:
|
try:
|
||||||
|
@ -273,7 +286,7 @@ def filter_out_ptr(ptype, op_str):
|
||||||
# This will throw ValueError if not hex
|
# This will throw ValueError if not hex
|
||||||
inttest = int(op_str[start:end], 16)
|
inttest = int(op_str[start:end], 16)
|
||||||
|
|
||||||
return op_str[0:start] + offsetplaceholder + op_str[end:]
|
return op_str[0:start] + placeholderGenerator.get(inttest) + op_str[end:]
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return op_str
|
return op_str
|
||||||
|
|
||||||
|
@ -288,7 +301,7 @@ def filter_out_ptr(ptype, op_str):
|
||||||
try:
|
try:
|
||||||
inttest = int(word, 16)
|
inttest = int(word, 16)
|
||||||
if inttest >= file.imagebase + file.textvirt:
|
if inttest >= file.imagebase + file.textvirt:
|
||||||
words[i] = offsetplaceholder
|
words[i] = placeholderGenerator.get(inttest)
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
op_str = ' '.join(words)
|
op_str = ' '.join(words)
|
||||||
|
@ -298,18 +311,76 @@ def filter_out_ptr(ptype, op_str):
|
||||||
def parse_asm(file, addr, size):
    """Disassemble a byte range into a list of sanitized instruction strings.

    Reads size bytes at addr through file.read(), disassembles them with the
    module-level disassembler md, and passes every instruction through
    sanitize(). One placeholder generator is shared by the whole range, so a
    repeated offset gets the same label each time it appears.

    Returns a list with one string per instruction: the bare mnemonic when
    sanitize() yields no operand string, otherwise "mnemonic operands".
    """
    data = file.read(addr, size)
    placeholderGenerator = OffsetPlaceholderGenerator()

    asm = []
    for instruction in md.disasm(data, 0):
        # Use heuristics to disregard some differences that aren't
        # representative of the accuracy of a function (e.g. global offsets)
        mnemonic, op_str = sanitize(
            file, placeholderGenerator, instruction.mnemonic, instruction.op_str)
        asm.append(mnemonic if op_str is None else "%s %s" % (mnemonic, op_str))
    return asm
|
||||||
|
|
||||||
|
# Register names the register-swap check recognizes: the 32-bit x86
# general-purpose registers and their 16-bit counterparts.
REGISTER_LIST = {
    'eax', 'ebx', 'ecx', 'edx', 'edi', 'esi', 'ebp', 'esp',
    'ax', 'bx', 'cx', 'dx', 'di', 'si', 'bp', 'sp',
}

# Matches each maximal run of word characters in a disassembly line.
WORDS = re.compile(r'\w+')
|
||||||
|
|
||||||
|
def get_registers(line: str):
    """Locate every register name in a disassembly line.

    Returns a list of (register, start_index) tuples in left-to-right order,
    one for each word in line that is a member of REGISTER_LIST. start_index
    is the character offset of that word within line.
    """
    return [
        (word.group(0), word.start())
        for word in WORDS.finditer(line)
        if word.group(0) in REGISTER_LIST
    ]
|
||||||
|
|
||||||
|
def replace_register(lines: list[str], start_line: int, reg: str, replacement: str):
    """Rename a register token in lines[start_line:], modifying lines in place.

    Only whole tokens are renamed: an occurrence of reg counts only when it is
    not directly preceded or followed by a word character. A plain substring
    replace would also rewrite longer names that merely contain reg, e.g.
    renaming 'di' would corrupt 'edi', 'div' and 'idiv'.

    lines        -- list of disassembly lines; entries are overwritten.
    start_line   -- index of the first line to rewrite; earlier lines are kept.
    reg          -- token to look for (matched literally).
    replacement  -- text inserted in its place (inserted literally).
    """
    # Lookarounds are used instead of \b so that tokens made purely of
    # non-word characters (such as a '&&&' temporary used while swapping two
    # registers) are still found.
    token = re.compile(r'(?<!\w)' + re.escape(reg) + r'(?!\w)')
    for i in range(start_line, len(lines)):
        # A callable replacement keeps backslashes in `replacement` literal.
        lines[i] = token.sub(lambda _match: replacement, lines[i])
|
||||||
|
|
||||||
|
# Is it possible to make new_asm the same as original_asm by swapping registers?
|
||||||
|
def can_resolve_register_differences(original_asm, new_asm):
    """Tell whether new_asm equals original_asm up to a renaming of registers.

    Walks both listings in parallel. On each line that differs, every register
    found in the original line is compared with the text at the same character
    position in the new line. If that text is a different register, the two
    registers are swapped in the new listing from that line to the end.

    original_asm -- list of sanitized instruction strings (never modified).
    new_asm      -- list of sanitized instruction strings (a copy is modified).

    Returns True when the listings are identical after all swaps. Returns
    False when the lengths differ, when a differing line has something other
    than a register where the original has one, or when differences remain.
    """
    # Swapping can't help if the instruction counts differ.
    if len(original_asm) != len(new_asm):
        return False

    # Work on a copy so the caller's listing is left untouched.
    new_asm = new_asm.copy()

    for i, original_line in enumerate(original_asm):
        if new_asm[i] == original_line:
            continue

        for reg, reg_index in get_registers(original_line):
            # Re-read the line for every register: a swap triggered by an
            # earlier register on this same line has already rewritten it.
            # Comparing against a stale copy would swap the pair a second
            # time and undo the fix (e.g. 'xor eax, eax' vs 'xor ebx, ebx').
            replacing_reg = new_asm[i][reg_index:reg_index + len(reg)]

            if replacing_reg not in REGISTER_LIST:
                # No replacement to do, different code, bail out
                return False

            if replacing_reg != reg:
                # Three-way swap through a temporary token, applied to this
                # line and all the subsequent ones.
                temp_reg = "&" * len(reg)
                replace_register(new_asm, i, replacing_reg, temp_reg)
                replace_register(new_asm, i, reg, replacing_reg)
                replace_register(new_asm, i, temp_reg, reg)

    # The swaps only count if every line now matches.
    return all(new == old for new, old in zip(new_asm, original_asm))
|
||||||
|
|
||||||
function_count = 0
|
function_count = 0
|
||||||
total_accuracy = 0
|
total_accuracy = 0
|
||||||
|
total_effective_accuracy = 0
|
||||||
htmlinsert = []
|
htmlinsert = []
|
||||||
|
|
||||||
# Generate basename of original file, used in locating OFFSET lines
|
# Generate basename of original file, used in locating OFFSET lines
|
||||||
|
@ -356,24 +427,41 @@ def parse_asm(file, addr, size):
|
||||||
if not recinfo:
|
if not recinfo:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# effective_ratio is the match ratio when differences in register
|
||||||
|
# allocation are ignored; ratio is the true, unadjusted match ratio.
|
||||||
|
ratio = 0.0
|
||||||
|
effective_ratio = 0.0
|
||||||
if recinfo.size:
|
if recinfo.size:
|
||||||
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
|
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
|
||||||
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
|
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
|
||||||
|
|
||||||
diff = difflib.SequenceMatcher(None, origasm, recompasm)
|
diff = difflib.SequenceMatcher(None, origasm, recompasm)
|
||||||
ratio = diff.ratio()
|
ratio = diff.ratio()
|
||||||
|
effective_ratio = ratio
|
||||||
|
|
||||||
|
if ratio != 1.0:
|
||||||
|
# Check whether we can resolve register swaps which are actually
|
||||||
|
# perfect matches modulo compiler entropy.
|
||||||
|
if can_resolve_register_differences(origasm, recompasm):
|
||||||
|
effective_ratio = 1.0
|
||||||
else:
|
else:
|
||||||
ratio = 0
|
ratio = 0
|
||||||
|
|
||||||
percenttext = "%.2f%%" % (ratio * 100)
|
percenttext = "%.2f%%" % (effective_ratio * 100)
|
||||||
if not plain:
|
if not plain:
|
||||||
if ratio == 1.0:
|
if effective_ratio == 1.0:
|
||||||
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
|
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
|
||||||
elif ratio > 0.8:
|
elif effective_ratio > 0.8:
|
||||||
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
|
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
|
||||||
else:
|
else:
|
||||||
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
|
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
|
||||||
|
|
||||||
|
if effective_ratio == 1.0 and ratio != 1.0:
|
||||||
|
if plain:
|
||||||
|
percenttext += "*"
|
||||||
|
else:
|
||||||
|
percenttext += colorama.Fore.RED + "*" + colorama.Style.RESET_ALL
|
||||||
|
|
||||||
if not verbose:
|
if not verbose:
|
||||||
if args.print_rec_addr:
|
if args.print_rec_addr:
|
||||||
addrs = '%s / %s' % (hex(addr), hex(recinfo.addr))
|
addrs = '%s / %s' % (hex(addr), hex(recinfo.addr))
|
||||||
|
@ -383,14 +471,21 @@ def parse_asm(file, addr, size):
|
||||||
|
|
||||||
function_count += 1
|
function_count += 1
|
||||||
total_accuracy += ratio
|
total_accuracy += ratio
|
||||||
|
total_effective_accuracy += effective_ratio
|
||||||
|
|
||||||
if recinfo.size:
|
if recinfo.size:
|
||||||
udiff = difflib.unified_diff(origasm, recompasm, n=10)
|
udiff = difflib.unified_diff(origasm, recompasm, n=10)
|
||||||
|
|
||||||
# If verbose, print the diff for that function to the output
|
# If verbose, print the diff for that function to the output
|
||||||
if verbose:
|
if verbose:
|
||||||
|
if effective_ratio == 1.0:
|
||||||
|
ok_text = "OK!" if plain else (colorama.Fore.GREEN + "✨ OK! ✨" + colorama.Style.RESET_ALL)
|
||||||
if ratio == 1.0:
|
if ratio == 1.0:
|
||||||
print("%s: %s 100%% match.\n\nOK!" % (hex(addr), recinfo.name))
|
print("%s: %s 100%% match.\n\n%s\n\n" %
|
||||||
|
(hex(addr), recinfo.name, ok_text))
|
||||||
|
else:
|
||||||
|
print("%s: %s Effective 100%% match. (Differs in register allocation only)\n\n%s (still differs in register allocation)\n\n" %
|
||||||
|
(hex(addr), recinfo.name, ok_text))
|
||||||
else:
|
else:
|
||||||
for line in udiff:
|
for line in udiff:
|
||||||
if line.startswith("++") or line.startswith("@@") or line.startswith("--"):
|
if line.startswith("++") or line.startswith("@@") or line.startswith("--"):
|
||||||
|
@ -416,7 +511,7 @@ def parse_asm(file, addr, size):
|
||||||
# If html, record the diffs to an HTML file
|
# If html, record the diffs to an HTML file
|
||||||
if html:
|
if html:
|
||||||
escaped = '\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n').replace('<', '<').replace('>', '>')
|
escaped = '\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n').replace('<', '<').replace('>', '>')
|
||||||
htmlinsert.append('{address: "%s", name: "%s", matching: %s, diff: "%s"}' % (hex(addr), recinfo.name, str(ratio), escaped))
|
htmlinsert.append('{address: "%s", name: "%s", matching: %s, diff: "%s"}' % (hex(addr), recinfo.name, str(effective_ratio), escaped))
|
||||||
|
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
break
|
break
|
||||||
|
@ -496,7 +591,8 @@ def gen_svg(svg, name, icon, implemented_funcs, total_funcs, raw_accuracy):
|
||||||
function_count = int(args.total)
|
function_count = int(args.total)
|
||||||
|
|
||||||
if function_count > 0:
|
if function_count > 0:
|
||||||
print('\nTotal accuracy %.2f%% across %i functions' % (total_accuracy / function_count * 100, function_count))
|
print('\nTotal effective accuracy %.2f%% across %i functions (%.2f%% actual accuracy)' %
|
||||||
|
(total_effective_accuracy / function_count * 100, function_count, total_accuracy / function_count * 100))
|
||||||
|
|
||||||
if svg:
|
if svg:
|
||||||
gen_svg(svg, os.path.basename(original), args.svg_icon, implemented_funcs, function_count, total_accuracy)
|
gen_svg(svg, os.path.basename(original), args.svg_icon, implemented_funcs, function_count, total_effective_accuracy)
|
||||||
|
|
|
@ -221,7 +221,9 @@
|
||||||
|
|
||||||
addrCel.innerHTML = addrCel.dataset.value = element.address;
|
addrCel.innerHTML = addrCel.dataset.value = element.address;
|
||||||
nameCel.innerHTML = nameCel.dataset.value = element.name;
|
nameCel.innerHTML = nameCel.dataset.value = element.name;
|
||||||
matchCel.innerHTML = (element.matching * 100).toFixed(2) + '%';
|
|
||||||
|
var effectiveNote = (element.matching == 1 && element.diff != '') ? '*' : '';
|
||||||
|
matchCel.innerHTML = (element.matching * 100).toFixed(2) + '%' + effectiveNote;
|
||||||
matchCel.dataset.value = element.matching;
|
matchCel.dataset.value = element.matching;
|
||||||
|
|
||||||
row.classList.add('funcrow');
|
row.classList.add('funcrow');
|
||||||
|
|
Loading…
Reference in a new issue