improved reccomp reliability even further, added html summary generator

Will probably host the summary somewhere for easy access
2024-11-22 15:37:55 -05:00 · 2023-06-19 12:52:21 -07:00 · 2023-06-19 12:52:21 -07:00 · 66dd2cdeb9
commit 66dd2cdeb9
parent ff85548c85
2 changed files with 306 additions and 19 deletions
--- a/tools/reccomp/reccomp.py
+++ b/tools/reccomp/reccomp.py
@ -10,11 +10,13 @@
 def print_usage():
  print('Usage: %s [options] <original-binary> <recompiled-binary> <recompiled-pdb> <decomp-dir>\n' % sys.argv[0])
  print('\t-v, --verbose <offset>\t\t\tPrint assembly diff for specific function (original file\'s offset)')
+  print('\t-h, --html <output-file>\t\t\tGenerate searchable HTML summary of status and diffs')
  sys.exit(1)

 positional_args = []
 verbose = None
 skip = False
+html = None

 for i, arg in enumerate(sys.argv):
  if skip:
@ -28,6 +30,9 @@ def print_usage():
    if flag == 'v' or flag == '-verbose':
      verbose = int(sys.argv[i + 1], 16)
      skip = True
+    elif flag == 'h' or flag == '-html':
+      html = sys.argv[i + 1]
+      skip = True
    else:
      print('Unknown flag: %s' % arg)
      print_usage()
@ -100,13 +105,16 @@ def get_wine_path(fn):
 def get_unix_path(fn):
  return subprocess.check_output(['winepath', fn]).decode('utf-8').strip()

+def get_file_in_script_dir(fn):
+  # Resolve `fn` against the directory that contains the running script
+  # (taken from sys.argv[0]) rather than the current working directory.
+  script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
+  return os.path.join(script_dir, fn)
+
 # Declare a class that parses the output of cvdump for fast access later
 class SymInfo:
  funcs = {}
  lines = {}

  def __init__(self, pdb, file):
-    call = [os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), 'cvdump.exe'), '-l', '-s']
+    call = [get_file_in_script_dir('cvdump.exe'), '-l', '-s']

    if os.name != 'nt':
      # Run cvdump through wine and convert path to Windows-friendly wine path
@ -192,23 +200,31 @@ def get_recompiled_address(self, filename, line):
 md = Cs(CS_ARCH_X86, CS_MODE_32)

 def sanitize(file, mnemonic, op_str):
+  offsetplaceholder = '<OFFSET>'
+
  if mnemonic == 'call' or mnemonic == 'jmp':
    # Filter out "call" and "jmp" targets because we're not currently trying
    # to match offsets. As long as there's a call in the right place, it's
    # probably accurate.
-    op_str = ''
+    op_str = offsetplaceholder
  else:
-    # Filter out dword ptrs where the pointer is to an offset
+    def filter_out_ptr(ptype, op_str):
      try:
-      start = op_str.index('dword ptr [') + 11
+        ptrstr = ptype + ' ptr ['
+        start = op_str.index(ptrstr) + len(ptrstr)
        end = op_str.index(']', start)

        # This will throw ValueError if not hex
        inttest = int(op_str[start:end], 16)

-      op_str = op_str[0:start] + op_str[end:]
+        return op_str[0:start] + offsetplaceholder + op_str[end:]
      except ValueError:
-      pass
+        return op_str
+
+    # Filter out dword/word/byte ptrs where the pointer is to an offset
+    op_str = filter_out_ptr('dword', op_str)
+    op_str = filter_out_ptr('word', op_str)
+    op_str = filter_out_ptr('byte', op_str)

    # Use heuristics to filter out any args that look like offsets
    words = op_str.split(' ')
@ -216,7 +232,7 @@ def sanitize(file, mnemonic, op_str):
      try:
        inttest = int(word, 16)
        if inttest >= file.imagebase + file.textvirt:
-          words[i] = ''
+          words[i] = offsetplaceholder
      except ValueError:
        pass
    op_str = ' '.join(words)
@ -230,11 +246,15 @@ def parse_asm(file, addr, size):
    # Use heuristics to disregard some differences that aren't representative
    # of the accuracy of a function (e.g. global offsets)
    mnemonic, op_str = sanitize(file, i.mnemonic, i.op_str)
+    if op_str is None:
+      asm.append(mnemonic)
+    else:
      asm.append("%s %s" % (mnemonic, op_str))
  return asm

 function_count = 0
 total_accuracy = 0
+htmlinsert = []

 for subdir, dirs, files in os.walk(source):
  for file in files:
@ -274,15 +294,42 @@ def parse_asm(file, addr, size):
          function_count += 1
          total_accuracy += ratio

-          if verbose == addr:
+          if verbose == addr or html:
            udiff = difflib.unified_diff(origasm, recompasm)
+
+            if verbose == addr:
              for line in udiff:
                print(line)
              print()
              print()

+            if html:
+              htmlinsert.append('{address: "%s", name: "%s", matching: %s, diff: "%s"}' % (hex(addr), recinfo.name, str(ratio), '\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n')))
+
      except UnicodeDecodeError:
        break

+def gen_html(html, data):
+  """Write the HTML summary to `html` by filling in the template.
+
+  Reads 'template.html' from the script's directory, replaces the first
+  '/* INSERT DATA HERE */' marker with the comma-joined entries of `data`
+  and writes the result to the path given by `html`.
+
+  On failure to read the template or write the output, a message is
+  printed and the function returns without raising.
+  """
+  # open() raises OSError on failure rather than returning a falsy value, so
+  # errors have to be caught; `with` also guarantees the files are closed.
+  try:
+    with open(get_file_in_script_dir('template.html'), 'r') as templatefile:
+      templatedata = templatefile.read()
+  except OSError:
+    print('Failed to find HTML template file, can\'t generate HTML summary')
+    return
+
+  templatedata = templatedata.replace('/* INSERT DATA HERE */', ','.join(data), 1)
+
+  try:
+    with open(html, 'w') as htmlfile:
+      htmlfile.write(templatedata)
+  except OSError:
+    print('Failed to write to HTML file %s' % html)
+
+if html:
+  gen_html(html, htmlinsert)
+
 if function_count > 0:
  print('\nTotal accuracy %.2f%% across %i functions' % (total_accuracy / function_count * 100, function_count))
--- a/tools/reccomp/template.html
+++ b/tools/reccomp/template.html
@ -0,0 +1,240 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <title>Decompilation Status</title>
+    <style>
+      body {
+        background: #202020;
+        color: #f0f0f0;
+        font-family: sans-serif;
+      }
+
+      h1 {
+        text-align: center;
+      }
+
+      .main {
+        width: 800px;
+        max-width: 100%;
+        margin: auto;
+      }
+
+      #search {
+        width: 100%;
+        box-sizing: border-box;
+        background: #303030;
+        color: #f0f0f0;
+        border: 1px #f0f0f0 solid;
+        padding: 0.5em;
+        border-radius: 0.5em;
+      }
+
+      #search::placeholder {
+        color: #b0b0b0;
+      }
+
+      #listing {
+        width: 100%;
+        border-collapse: collapse;
+        font-family: monospace;
+      }
+
+      .funcrow:hover {
+        background: #404040 !important;
+      }
+
+      .funcrow:nth-child(odd), #listing th {
+        background: #282828;
+      }
+
+      .funcrow:nth-child(even) {
+        background: #383838;
+      }
+
+      #listing td, #listing th {
+        border: 1px #f0f0f0 solid;
+        padding: 0.5em;
+      }
+
+      .diffneg {
+        color: #FF8080;
+      }
+
+      .diffpos {
+        color: #80FF80;
+      }
+
+      #sortind {
+        margin: 0 0.5em;
+      }
+    </style>
+    <script>
+      var data = [/* INSERT DATA HERE */];
+
+      // Convert a unified-diff string into HTML: every line is HTML-escaped,
+      // removed ('-') and added ('+') lines are wrapped in colored spans, and
+      // the lines are joined with <br>.
+      function formatAsm(asm) {
+        var lines = asm.split('\n');
+
+        for (var i = 0; i < lines.length; i++) {
+          // Escape markup characters first: the result is assigned to
+          // innerHTML, and operands such as the "<OFFSET>" placeholder would
+          // otherwise be parsed as (invisible) HTML tags.
+          var l = lines[i]
+            .replace(/&/g, '&amp;')
+            .replace(/</g, '&lt;')
+            .replace(/>/g, '&gt;');
+
+          if (l[0] == '-') {
+            l = '<span class="diffneg">' + l + '</span>';
+          } else if (l[0] == '+') {
+            l = '<span class="diffpos">' + l + '</span>';
+          }
+
+          lines[i] = l;
+        }
+
+        return lines.join('<br>');
+      }
+
+      // Click handler for a function row: toggles the diff row directly
+      // beneath it. `this` is the clicked function row.
+      function rowClick() {
+        // Decide from the DOM, not only from dataset.expanded: diff rows are
+        // also removed by other code (closeAllDiffs on search/sort), and
+        // blindly removing nextSibling could then delete a function row.
+        var next = this.nextSibling;
+        var diffRow = (next && next.classList && next.classList.contains('diff')) ? next : null;
+
+        if (diffRow) {
+          diffRow.remove();
+          this.dataset.expanded = false;
+          return;
+        }
+
+        var row = this.parentNode.insertBefore(document.createElement('tr'), this.nextSibling);
+        row.classList.add('diff');
+        var decCel = row.appendChild(document.createElement('td'));
+        decCel.colSpan = 3;
+
+        var diff = data[this.dataset.index].diff;
+        if (diff == '') {
+          diff = '<center><i>Identical function - no diff</i></center>';
+        } else {
+          diff = formatAsm(diff);
+        }
+        decCel.innerHTML = diff;
+        this.dataset.expanded = true;
+      }
+
+      // Remove every expanded diff row and mark all function rows collapsed.
+      function closeAllDiffs() {
+        // getElementsByClassName returns a live collection; removing elements
+        // while iterating it skips entries, so iterate over a snapshot.
+        const diffRows = Array.from(document.getElementsByClassName("diff"));
+        for (var ele of diffRows) {
+          ele.remove();
+        }
+
+        // Keep the per-row state in sync with the DOM so a later click on a
+        // previously expanded row expands it again.
+        for (var row of document.getElementsByClassName("funcrow")) {
+          row.dataset.expanded = false;
+        }
+      }
+
+      // Show only the function rows whose address or name contains `text`
+      // (case-insensitive); an empty search string shows every row.
+      function filter(text) {
+        closeAllDiffs();
+
+        var needle = text.toLowerCase();
+
+        for (var row of document.getElementsByClassName("funcrow")) {
+          var entry = data[row.dataset.index];
+          var matches = needle === ''
+            || entry.address.toLowerCase().includes(needle)
+            || entry.name.toLowerCase().includes(needle);
+
+          // An empty display value falls back to the stylesheet default.
+          row.style.display = matches ? '' : 'none';
+        }
+      }
+
+      var lastSortedCol = -1;
+      var ascending = true;
+
+      // Sort the function rows by the given column index (0 = address,
+      // 1 = name, 2 = matching). Sorting the same column twice in a row
+      // flips the direction. Updates the sort indicator in the header.
+      function sortByColumn(column) {
+        closeAllDiffs();
+
+        if (column == lastSortedCol) {
+          ascending = !ascending;
+        }
+        lastSortedCol = column;
+
+        // Snapshot the live collection before reordering the DOM.
+        var rows = Array.from(document.getElementsByClassName("funcrow"));
+
+        var valueOf = function (row) {
+          return row.childNodes[column].dataset.value;
+        };
+
+        // dataset values are always strings. Addresses ("0x1000") and match
+        // ratios ("0.95") must be compared as numbers, otherwise e.g.
+        // "0x9000" would sort after "0x10000". Treat the column as numeric
+        // only if every value parses as a finite number.
+        var numeric = rows.every(function (row) {
+          var v = valueOf(row);
+          return v !== '' && isFinite(Number(v));
+        });
+
+        rows.sort(function (a, b) {
+          var aVal = valueOf(a);
+          var bVal = valueOf(b);
+
+          var cmp;
+          if (numeric) {
+            cmp = Number(aVal) - Number(bVal);
+          } else {
+            cmp = aVal < bVal ? -1 : (aVal > bVal ? 1 : 0);
+          }
+
+          return ascending ? cmp : -cmp;
+        });
+
+        // Re-attach the rows in sorted order, each one after its predecessor.
+        for (var i = 1; i < rows.length; i++) {
+          rows[i - 1].after(rows[i]);
+        }
+
+        // Create the indicator on first use, then move it to the sorted
+        // column's header cell.
+        var sortIndicator = document.getElementById('sortind');
+        if (!sortIndicator) {
+          sortIndicator = document.createElement('span');
+          sortIndicator.id = 'sortind';
+        }
+        sortIndicator.innerHTML = ascending ? '&#9650;' : '&#9660;';
+
+        var th = document.getElementById('listingheader').childNodes[column];
+        th.appendChild(sortIndicator);
+      }
+
+      // Once the DOM is ready: wire up header sorting, build one table row
+      // per entry in `data`, hook up the search box and apply the initial
+      // sort by the first column.
+      document.addEventListener("DOMContentLoaded", () => {
+        var listing = document.getElementById('listing');
+
+        // Clicking a header cell sorts by that column; each header stores
+        // its own column index in its dataset.
+        const headers = listing.getElementsByTagName('th');
+        var headerCount = 0;
+        for (const header of headers) {
+          header.addEventListener('click', function(){
+            sortByColumn(this.dataset.column);
+          });
+
+          header.dataset.column = headerCount;
+          headerCount++;
+        }
+
+        data.forEach((element, index) => {
+          var row = listing.appendChild(document.createElement('tr'));
+          var addrCel = row.appendChild(document.createElement('td'));
+          var nameCel = row.appendChild(document.createElement('td'));
+          var matchCel = row.appendChild(document.createElement('td'));
+
+          // Use textContent rather than innerHTML: these are plain strings,
+          // and a name containing '<' or '&' would otherwise be parsed as
+          // markup instead of being displayed.
+          addrCel.textContent = addrCel.dataset.value = element.address;
+          nameCel.textContent = nameCel.dataset.value = element.name;
+          matchCel.textContent = (element.matching * 100).toFixed(2) + '%';
+          matchCel.dataset.value = element.matching;
+
+          row.classList.add('funcrow');
+          row.addEventListener('click', rowClick);
+          row.dataset.index = index;
+          row.dataset.expanded = false;
+        });
+
+        var search = document.getElementById('search');
+        search.addEventListener('input', function (evt) {
+          filter(search.value);
+        });
+
+        sortByColumn(0);
+      });
+    </script>
+  </head>
+  <body>
+    <div class="main">
+      <h1>Decompilation Status</h1>
+      <input id="search" type="search" placeholder="Search for offset or function name...">
+      <br>
+      <br>
+      <table id="listing">
+        <tr id='listingheader'><th style='width: 20%'>Address</th><th style="width:60%">Name</th><th style='width: 20%'>Matching</th></tr>
+      </table>
+    </div>
+  </body>
+</html>