improved reccomp reliability even further, added html summary generator

Will probably host the summary somewhere for easy access
This commit is contained in:
itsmattkc 2023-06-19 12:52:21 -07:00
parent ff85548c85
commit 66dd2cdeb9
2 changed files with 306 additions and 19 deletions

View file

@ -10,11 +10,13 @@
def print_usage(): def print_usage():
print('Usage: %s [options] <original-binary> <recompiled-binary> <recompiled-pdb> <decomp-dir>\n' % sys.argv[0]) print('Usage: %s [options] <original-binary> <recompiled-binary> <recompiled-pdb> <decomp-dir>\n' % sys.argv[0])
print('\t-v, --verbose <offset>\t\t\tPrint assembly diff for specific function (original file\'s offset)') print('\t-v, --verbose <offset>\t\t\tPrint assembly diff for specific function (original file\'s offset)')
print('\t-h, --html <output-file>\t\t\tGenerate searchable HTML summary of status and diffs')
sys.exit(1) sys.exit(1)
positional_args = [] positional_args = []
verbose = None verbose = None
skip = False skip = False
html = None
for i, arg in enumerate(sys.argv): for i, arg in enumerate(sys.argv):
if skip: if skip:
@ -28,6 +30,9 @@ def print_usage():
if flag == 'v' or flag == '-verbose': if flag == 'v' or flag == '-verbose':
verbose = int(sys.argv[i + 1], 16) verbose = int(sys.argv[i + 1], 16)
skip = True skip = True
elif flag == 'h' or flag == '-html':
html = sys.argv[i + 1]
skip = True
else: else:
print('Unknown flag: %s' % arg) print('Unknown flag: %s' % arg)
print_usage() print_usage()
@ -100,13 +105,16 @@ def get_wine_path(fn):
def get_unix_path(fn): def get_unix_path(fn):
return subprocess.check_output(['winepath', fn]).decode('utf-8').strip() return subprocess.check_output(['winepath', fn]).decode('utf-8').strip()
def get_file_in_script_dir(fn):
    """Return the path of `fn` located next to the running script.

    The script location is taken from sys.argv[0], made absolute, and its
    directory is joined with `fn`.
    """
    script_path = os.path.abspath(sys.argv[0])
    script_dir = os.path.dirname(script_path)
    return os.path.join(script_dir, fn)
# Declare a class that parses the output of cvdump for fast access later # Declare a class that parses the output of cvdump for fast access later
class SymInfo: class SymInfo:
funcs = {} funcs = {}
lines = {} lines = {}
def __init__(self, pdb, file): def __init__(self, pdb, file):
call = [os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), 'cvdump.exe'), '-l', '-s'] call = [get_file_in_script_dir('cvdump.exe'), '-l', '-s']
if os.name != 'nt': if os.name != 'nt':
# Run cvdump through wine and convert path to Windows-friendly wine path # Run cvdump through wine and convert path to Windows-friendly wine path
@ -192,23 +200,31 @@ def get_recompiled_address(self, filename, line):
md = Cs(CS_ARCH_X86, CS_MODE_32) md = Cs(CS_ARCH_X86, CS_MODE_32)
def sanitize(file, mnemonic, op_str): def sanitize(file, mnemonic, op_str):
offsetplaceholder = '<OFFSET>'
if mnemonic == 'call' or mnemonic == 'jmp': if mnemonic == 'call' or mnemonic == 'jmp':
# Filter out "calls" because the offsets we're not currently trying to # Filter out "calls" because the offsets we're not currently trying to
# match offsets. As long as there's a call in the right place, it's # match offsets. As long as there's a call in the right place, it's
# probably accurate. # probably accurate.
op_str = '' op_str = offsetplaceholder
else: else:
# Filter out dword ptrs where the pointer is to an offset def filter_out_ptr(ptype, op_str):
try: try:
start = op_str.index('dword ptr [') + 11 ptrstr = ptype + ' ptr ['
start = op_str.index(ptrstr) + len(ptrstr)
end = op_str.index(']', start) end = op_str.index(']', start)
# This will throw ValueError if not hex # This will throw ValueError if not hex
inttest = int(op_str[start:end], 16) inttest = int(op_str[start:end], 16)
op_str = op_str[0:start] + op_str[end:] return op_str[0:start] + offsetplaceholder + op_str[end:]
except ValueError: except ValueError:
pass return op_str
# Filter out dword ptrs where the pointer is to an offset
op_str = filter_out_ptr('dword', op_str)
op_str = filter_out_ptr('word', op_str)
op_str = filter_out_ptr('byte', op_str)
# Use heuristics to filter out any args that look like offsets # Use heuristics to filter out any args that look like offsets
words = op_str.split(' ') words = op_str.split(' ')
@ -216,7 +232,7 @@ def sanitize(file, mnemonic, op_str):
try: try:
inttest = int(word, 16) inttest = int(word, 16)
if inttest >= file.imagebase + file.textvirt: if inttest >= file.imagebase + file.textvirt:
words[i] = '' words[i] = offsetplaceholder
except ValueError: except ValueError:
pass pass
op_str = ' '.join(words) op_str = ' '.join(words)
@ -230,11 +246,15 @@ def parse_asm(file, addr, size):
# Use heuristics to disregard some differences that aren't representative # Use heuristics to disregard some differences that aren't representative
# of the accuracy of a function (e.g. global offsets) # of the accuracy of a function (e.g. global offsets)
mnemonic, op_str = sanitize(file, i.mnemonic, i.op_str) mnemonic, op_str = sanitize(file, i.mnemonic, i.op_str)
if op_str is None:
asm.append(mnemonic)
else:
asm.append("%s %s" % (mnemonic, op_str)) asm.append("%s %s" % (mnemonic, op_str))
return asm return asm
function_count = 0 function_count = 0
total_accuracy = 0 total_accuracy = 0
htmlinsert = []
for subdir, dirs, files in os.walk(source): for subdir, dirs, files in os.walk(source):
for file in files: for file in files:
@ -274,15 +294,42 @@ def parse_asm(file, addr, size):
function_count += 1 function_count += 1
total_accuracy += ratio total_accuracy += ratio
if verbose == addr: if verbose == addr or html:
udiff = difflib.unified_diff(origasm, recompasm) udiff = difflib.unified_diff(origasm, recompasm)
if verbose == addr:
for line in udiff: for line in udiff:
print(line) print(line)
print() print()
print() print()
if html:
htmlinsert.append('{address: "%s", name: "%s", matching: %s, diff: "%s"}' % (hex(addr), recinfo.name, str(ratio), '\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n')))
except UnicodeDecodeError: except UnicodeDecodeError:
break break
def gen_html(html, data):
    """Generate the HTML summary file from the bundled template.

    Reads 'template.html' from the script directory, replaces the first
    occurrence of the '/* INSERT DATA HERE */' marker with the entries of
    `data` joined by commas, and writes the result to `html`.

    Args:
        html: Path of the HTML file to write.
        data: Iterable of strings to splice into the template's data array.

    Returns:
        None. On failure to read the template or write the output, a message
        is printed and the function returns without raising.
    """
    # open() never returns a falsy value; it raises OSError on failure, so
    # failures have to be caught rather than tested with `if not f`.
    try:
        with open(get_file_in_script_dir('template.html'), 'r') as templatefile:
            templatedata = templatefile.read()
    except OSError:
        print('Failed to find HTML template file, can\'t generate HTML summary')
        return

    # Only the first marker is replaced so that any later occurrence of the
    # same text (e.g. inside the inserted data) is left untouched.
    templatedata = templatedata.replace('/* INSERT DATA HERE */', ','.join(data), 1)

    try:
        with open(html, 'w') as htmlfile:
            htmlfile.write(templatedata)
    except OSError:
        print('Failed to write to HTML file %s' % html)
if html:
gen_html(html, htmlinsert)
if function_count > 0: if function_count > 0:
print('\nTotal accuracy %.2f%% across %i functions' % (total_accuracy / function_count * 100, function_count)) print('\nTotal accuracy %.2f%% across %i functions' % (total_accuracy / function_count * 100, function_count))

240
tools/reccomp/template.html Normal file
View file

@ -0,0 +1,240 @@
<!DOCTYPE html>
<html>
<head>
<title>Decompilation Status</title>
<style>
/* Page-wide dark theme. */
body {
background: #202020;
color: #f0f0f0;
font-family: sans-serif;
}
h1 {
text-align: center;
}
/* Centered content column: 800px wide, shrinking on narrow viewports. */
.main {
width: 800px;
max-width: 100%;
margin: auto;
}
/* Search box; border-box sizing keeps padding and border inside the 100% width. */
#search {
width: 100%;
box-sizing: border-box;
background: #303030;
color: #f0f0f0;
border: 1px #f0f0f0 solid;
padding: 0.5em;
border-radius: 0.5em;
}
#search::placeholder {
color: #b0b0b0;
}
/* Function table. */
#listing {
width: 100%;
border-collapse: collapse;
font-family: monospace;
}
/* Hover highlight; !important is needed because the nth-child rules below
   have the same specificity and come later in the sheet. */
.funcrow:hover {
background: #404040 !important;
}
/* Alternating row colors; the header shares the odd-row color. */
.funcrow:nth-child(odd), #listing th {
background: #282828;
}
.funcrow:nth-child(even) {
background: #383838;
}
#listing td, #listing th {
border: 1px #f0f0f0 solid;
padding: 0.5em;
}
/* Diff lines starting with '-' (red) and '+' (green), as tagged by formatAsm(). */
.diffneg {
color: #FF8080;
}
.diffpos {
color: #80FF80;
}
/* Sort direction arrow that sortByColumn() places inside a header cell. */
#sortind {
margin: 0 0.5em;
}
</style>
<script>
var data = [/* INSERT DATA HERE */];
/**
 * Convert a newline-separated diff string into HTML for display.
 *
 * Each line is HTML-escaped, lines starting with '-' are wrapped in a
 * "diffneg" span, lines starting with '+' in a "diffpos" span, and all
 * lines are joined with <br>.
 *
 * @param {string} asm Diff text with lines separated by '\n'.
 * @returns {string} HTML markup safe to assign to innerHTML.
 */
function formatAsm(asm) {
  // The diff text contains literal '<' and '>' (e.g. the generator's
  // '<OFFSET>' placeholder). Without escaping, the browser would parse
  // those as tags and silently drop them from the displayed diff.
  var escapeHtml = function (text) {
    return text
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  };

  return asm.split('\n').map(function (line) {
    var safe = escapeHtml(line);
    if (line[0] == '-') {
      return '<span class="diffneg">' + safe + '</span>';
    }
    if (line[0] == '+') {
      return '<span class="diffpos">' + safe + '</span>';
    }
    // Context lines and empty lines are emitted without a wrapper.
    return safe;
  }).join('<br>');
}
/**
 * Click handler for a function row (`this` is the clicked row).
 *
 * If the row is marked expanded, the row following it is removed and the
 * flag is cleared. Otherwise a new "diff" row spanning three columns is
 * inserted directly after it, filled with the formatted diff for the row's
 * data entry (or a notice when the diff is empty), and the flag is set.
 */
function rowClick() {
  var alreadyOpen = this.dataset.expanded === 'true';
  if (alreadyOpen) {
    // Collapse: the diff row is the one immediately after this row.
    this.nextSibling.remove();
    this.dataset.expanded = false;
    return;
  }

  // Expand: create the diff row and place it right below this row.
  var diffRow = document.createElement('tr');
  this.parentNode.insertBefore(diffRow, this.nextSibling);
  diffRow.classList.add('diff');

  var cell = document.createElement('td');
  diffRow.appendChild(cell);
  cell.colSpan = 3;

  var diffText = data[this.dataset.index].diff;
  cell.innerHTML = (diffText == '')
    ? '<center><i>Identical function - no diff</i></center>'
    : formatAsm(diffText);

  this.dataset.expanded = true;
}
/**
 * Remove every expanded diff row from the document and mark the function
 * row that owned it as collapsed.
 */
function closeAllDiffs() {
  // getElementsByClassName returns a live collection: removing an element
  // while iterating it shifts the remaining items and skips every other
  // one. Take a static snapshot first.
  var openDiffs = Array.from(document.getElementsByClassName("diff"));

  openDiffs.forEach(function (diffRow) {
    // The diff row is inserted directly after the row that was clicked.
    // Reset that row's flag; otherwise its next click would take the
    // "collapse" path and remove whatever row happens to follow it.
    var owner = diffRow.previousElementSibling;
    if (owner && owner.classList.contains('funcrow')) {
      owner.dataset.expanded = false;
    }
    diffRow.remove();
  });
}
/**
 * Show only the function rows whose address or name contains `text`
 * (case-insensitive). An empty `text` shows every row. Any open diff rows
 * are closed first.
 *
 * @param {string} text Search string typed by the user.
 */
function filter(text) {
  closeAllDiffs();

  var needle = text.toLowerCase();
  const rows = document.getElementsByClassName("funcrow");
  for (var row of rows) {
    var entry = data[row.dataset.index];
    var visible = text == ''
      || entry.address.toLowerCase().includes(needle)
      || entry.name.toLowerCase().includes(needle);
    // An empty display value restores the row's default rendering.
    row.style.display = visible ? '' : 'none';
  }
}
// Sort state shared across calls to sortByColumn():
// index of the column most recently sorted (-1 until the first sort) ...
var lastSortedCol = -1;
// ... and the current sort direction.
var ascending = true;
/**
 * Reorder the function rows by the data-value of the cell in `column`.
 * Sorting the same column twice in a row flips the direction. Open diff
 * rows are closed first, and the sort indicator arrow is moved into the
 * header cell of the sorted column.
 *
 * @param column Index of the column to sort by; callers in this file pass
 *               either a number or a data-attribute string.
 */
function sortByColumn(column) {
closeAllDiffs();
// Loose equality is intentional here: header clicks pass the column as a
// string taken from a data attribute.
if (column == lastSortedCol) {
ascending = !ascending;
}
lastSortedCol = column;
const collection = document.getElementsByClassName("funcrow");
// Insertion sort into newOrder: each row is placed before the first
// already-placed row that should come after it in the current direction.
var newOrder = [];
for (var ele of collection) {
var inserted = false;
for (var i = 0; i < newOrder.length; i++) {
var cmpEle = newOrder[i];
var ourCol = ele.childNodes[column];
var cmpCol = cmpEle.childNodes[column];
// NOTE(review): data-attribute values are strings, so this comparison is
// lexicographic; confirm that is acceptable for the address and matching
// columns (values of differing length may not sort numerically).
if ((cmpCol.dataset.value > ourCol.dataset.value) == ascending) {
newOrder.splice(i, 0, ele);
inserted = true;
break;
}
}
if (!inserted) {
newOrder.push(ele);
}
}
// Apply the new order to the DOM by chaining each row after its predecessor.
for (var i = 1; i < newOrder.length; i++) {
newOrder[i - 1].after(newOrder[i]);
}
// Reuse the existing indicator element if there is one; appendChild below
// moves it to the newly sorted column's header.
var sortIndicator = document.getElementById('sortind');
if (!sortIndicator) {
sortIndicator = document.createElement('span');
sortIndicator.id = 'sortind';
}
sortIndicator.innerHTML = ascending ? '&#9650;' : '&#9660;';
var th = document.getElementById('listingheader').childNodes[column];
th.appendChild(sortIndicator);
}
// Page setup, run once the DOM is ready: make the header cells sortable,
// create one table row per entry in `data`, hook up the search box, and
// apply an initial sort on the first column.
document.addEventListener("DOMContentLoaded", () => {
  var listing = document.getElementById('listing');

  // Tag each header cell with its column index and sort on click.
  const headers = listing.getElementsByTagName('th');
  var headerCount = 0;
  for (const header of headers) {
    header.dataset.column = headerCount;
    header.addEventListener('click', function () {
      sortByColumn(this.dataset.column);
    });
    headerCount++;
  }

  data.forEach((element, index) => {
    var row = listing.appendChild(document.createElement('tr'));
    var addrCel = row.appendChild(document.createElement('td'));
    var nameCel = row.appendChild(document.createElement('td'));
    var matchCel = row.appendChild(document.createElement('td'));

    // textContent rather than innerHTML: function names may contain
    // characters such as '<', '>' or '&' that must be shown literally
    // instead of being parsed as markup.
    addrCel.textContent = addrCel.dataset.value = element.address;
    nameCel.textContent = nameCel.dataset.value = element.name;
    matchCel.textContent = (element.matching * 100).toFixed(2) + '%';
    // Keep the raw ratio as the sort key; the cell text is the percentage.
    matchCel.dataset.value = element.matching;

    row.classList.add('funcrow');
    row.addEventListener('click', rowClick);
    // Index back into `data`, used by rowClick() and filter().
    row.dataset.index = index;
    row.dataset.expanded = false;
  });

  var search = document.getElementById('search');
  search.addEventListener('input', function (evt) {
    filter(search.value);
  });

  sortByColumn(0);
});
</script>
</head>
<body>
<div class="main">
<h1>Decompilation Status</h1>
<input id="search" type="search" placeholder="Search for offset or function name...">
<br>
<br>
<table id="listing">
<tr id='listingheader'><th style='width: 20%'>Address</th><th style="width:60%">Name</th><th style='width: 20%'>Matching</th></tr>
</table>
</div>
</body>
</html>