reccmp: New diff option (#563)

2024-11-26 01:28:30 -05:00 · 2024-02-15 03:33:40 -05:00 · 2024-02-15 03:33:40 -05:00 · 8aa9d9a8b3
commit 8aa9d9a8b3
parent 271df035fd
2 changed files with 283 additions and 46 deletions
--- a/tools/isledecomp/isledecomp/utils.py
+++ b/tools/isledecomp/isledecomp/utils.py
@ -1,5 +1,7 @@
 import os
 import sys
+from datetime import datetime
+import logging
 import colorama


@ -27,5 +29,217 @@ def print_diff(udiff, plain):
    return has_diff


+def get_percent_color(value: float) -> str:
+    """Pick the colorama foreground color used to print a match ratio.
+
+    A ratio of exactly 1.0 is green, anything above 0.8 is yellow and
+    everything else is red."""
+    if value == 1.0:
+        color = colorama.Fore.GREEN
+    elif value > 0.8:
+        color = colorama.Fore.YELLOW
+    else:
+        color = colorama.Fore.RED
+
+    return color
+
+
+def percent_string(
+    ratio: float, is_effective: bool = False, is_plain: bool = False
+) -> str:
+    """Format a match ratio as a percentage with two decimal places.
+
+    An effective match (is_effective) gets a trailing asterisk.
+    With is_plain the text is returned without any colorama ANSI codes;
+    otherwise the percentage is colored by get_percent_color, the asterisk
+    is colored red, and the style is reset at the end."""
+
+    pct = f"{(ratio * 100):.2f}%"
+    star = "*" if is_effective else ""
+
+    if is_plain:
+        return pct + star
+
+    # Only switch to red when there is an asterisk to print.
+    star_color = colorama.Fore.RED if is_effective else ""
+
+    return (
+        get_percent_color(ratio)
+        + pct
+        + star_color
+        + star
+        + colorama.Style.RESET_ALL
+    )
+
+
+def diff_json_display(show_both_addrs: bool = False, is_plain: bool = False):
+    """Build the line formatter used when printing a diff report.
+
+    The returned callable takes (orig_addr, saved, new), where saved and new
+    are the summary dicts for one function (or None when the function is
+    absent on that side), and returns one display line.
+    show_both_addrs adds the recomp address to the line; is_plain is passed
+    to percent_string to leave out colorama ANSI codes."""
+
+    def describe(entry, placeholder: str) -> str:
+        # Text for one side of the "old -> new" arrow.
+        if entry is None:
+            return placeholder
+        if entry.get("stub", False):
+            return "stub"
+        return percent_string(
+            entry["matching"], entry.get("effective", False), is_plain
+        )
+
+    def formatter(orig_addr, saved, new) -> str:
+        # The current entry is evaluated first, then the saved one.
+        new_pct = describe(new, "gone")
+        old_pct = describe(saved, "new")
+
+        # The original address is the key, so a rename is not of interest:
+        # show the current name and fall back to the saved name only when
+        # there is no current one.
+        name = "" if new is None else new.get("name", "")
+        if name == "" and saved is not None:
+            name = saved.get("name", "")
+
+        if show_both_addrs:
+            recomp_addr = "n/a" if new is None else new.get("recomp", "n/a")
+            addr_string = f"{orig_addr} / {recomp_addr:10}"
+        else:
+            addr_string = orig_addr
+
+        # colorama's ANSI codes count towards string length, so the fields
+        # are not padded into spreadsheet-like columns here.
+        return f"{addr_string} - {name} ({old_pct} -> {new_pct})"
+
+    return formatter
+
+
+def diff_json(
+    saved_data,
+    new_data,
+    orig_file: str,
+    show_both_addrs: bool = False,
+    is_plain: bool = False,
+):
+    """Print a report of how match percentages changed since a saved summary.
+
+    saved_data is the parsed JSON summary: a dict with "file", "data" and
+    optionally "timestamp". new_data is the current list of per-function
+    dicts. Entries on both sides are paired up by their "address" value.
+
+    If the saved summary was generated for a file whose name differs from
+    the lowercased basename of orig_file, an error is logged and nothing is
+    printed. show_both_addrs and is_plain are passed to diff_json_display."""
+
+    def addr_sort_key(addr):
+        # Addresses are hex strings such as "0x401000". Sorting the raw
+        # strings would put "0x10000000" before "0x401000", so order by
+        # numeric value. Anything that does not parse as hex is kept,
+        # sorted as text, after the parseable addresses.
+        try:
+            return (0, int(addr, 16), "")
+        except (TypeError, ValueError):
+            return (1, 0, str(addr))
+
+    # Don't try to diff a report generated for a different binary file
+    base_file = os.path.basename(orig_file).lower()
+
+    if saved_data.get("file") != base_file:
+        logging.getLogger().error(
+            "Diff report for '%s' does not match current file '%s'",
+            saved_data.get("file"),
+            base_file,
+        )
+        return
+
+    if "timestamp" in saved_data:
+        # Drop microseconds so the elapsed time prints as whole seconds.
+        now = datetime.now().replace(microsecond=0)
+        then = datetime.fromtimestamp(saved_data["timestamp"]).replace(microsecond=0)
+
+        print(
+            " ".join(
+                [
+                    "Saved diff report generated",
+                    then.strftime("%B %d %Y, %H:%M:%S"),
+                    f"({str(now - then)} ago)",
+                ]
+            )
+        )
+
+        print()
+
+    # Convert to dict, using orig_addr as key
+    saved_invert = {obj["address"]: obj for obj in saved_data["data"]}
+    new_invert = {obj["address"]: obj for obj in new_data}
+
+    all_addrs = saved_invert.keys() | new_invert.keys()
+
+    # Put all the information in one place so we can decide how each item changed.
+    # Every value is a (saved, new) pair; at most one side can be None.
+    combined = {
+        addr: (
+            saved_invert.get(addr),
+            new_invert.get(addr),
+        )
+        for addr in sorted(all_addrs, key=addr_sort_key)
+    }
+
+    # The criteria for diff judgement is in these dict comprehensions:
+    # Any function not in the saved file
+    new_functions = {
+        key: (saved, new) for key, (saved, new) in combined.items() if saved is None
+    }
+
+    # Any function in the saved file that is missing from the current data
+    # or a non-stub -> stub conversion
+    dropped_functions = {
+        key: (saved, new)
+        for key, (saved, new) in combined.items()
+        if new is None
+        or (
+            saved is not None
+            and new.get("stub", False)
+            and not saved.get("stub", False)
+        )
+    }
+
+    # TODO: move these two into functions if the assessment gets more complex
+    # Any function with increased match percentage
+    # or stub -> non-stub conversion
+    improved_functions = {
+        key: (saved, new)
+        for key, (saved, new) in combined.items()
+        if saved is not None
+        and new is not None
+        and (
+            new["matching"] > saved["matching"]
+            or (not new.get("stub", False) and saved.get("stub", False))
+        )
+    }
+
+    # Any non-stub function with decreased match percentage
+    degraded_functions = {
+        key: (saved, new)
+        for key, (saved, new) in combined.items()
+        if saved is not None
+        and new is not None
+        and new["matching"] < saved["matching"]
+        and not saved.get("stub")
+        and not new.get("stub")
+    }
+
+    # Any function that is a 100% match on both sides but whose
+    # "effective" flag differs between the saved and current data
+    entropy_functions = {
+        key: (saved, new)
+        for key, (saved, new) in combined.items()
+        if saved is not None
+        and new is not None
+        and new["matching"] == 1.0
+        and saved["matching"] == 1.0
+        and new.get("effective", False) != saved.get("effective", False)
+    }
+
+    get_diff_str = diff_json_display(show_both_addrs, is_plain)
+
+    for diff_name, diff_dict in [
+        ("New", new_functions),
+        ("Increased", improved_functions),
+        ("Decreased", degraded_functions),
+        ("Dropped", dropped_functions),
+        ("Compiler entropy", entropy_functions),
+    ]:
+        # Sections without entries are left out of the report entirely.
+        if not diff_dict:
+            continue
+
+        print(f"{diff_name} ({len(diff_dict)}):")
+
+        for addr, (saved, new) in diff_dict.items():
+            print(get_diff_str(addr, saved, new))
+
+        print()
+
+
 def get_file_in_script_dir(fn):
+    """Return fn joined onto the directory of the absolute path of sys.argv[0]."""
    return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
--- a/tools/reccmp/reccmp.py
+++ b/tools/reccmp/reccmp.py
@ -5,11 +5,14 @@
 import json
 import logging
 import os
+from datetime import datetime

 from isledecomp import (
    Bin,
    get_file_in_script_dir,
    print_diff,
+    diff_json,
+    percent_string,
 )
 from isledecomp.compare import Compare as IsleCompare
 from isledecomp.types import SymbolType
@ -19,6 +22,31 @@
 colorama.init()


+def gen_json(json_file: str, orig_file: str, data):
+    """Write the comparison summary for orig_file to json_file as JSON.
+
+    The output object holds the lowercased basename of orig_file, a format
+    version number, the current timestamp, and the entries of data with
+    their "diff" field left out."""
+
+    # Version marker for the file layout. If the structure of the JSON file
+    # ever changes, a reader of an older file (e.g. the CI action) has
+    # something to check against.
+    json_format_version = 1
+
+    # Copy each entry without its "diff" field; all other fields are kept.
+    reduced_data = []
+    for obj in data:
+        entry = dict(obj.items())
+        entry.pop("diff", None)
+        reduced_data.append(entry)
+
+    with open(json_file, "w", encoding="utf-8") as f:
+        summary = {
+            "file": os.path.basename(orig_file).lower(),
+            "format": json_format_version,
+            "timestamp": datetime.now().timestamp(),
+            "data": reduced_data,
+        }
+        json.dump(summary, f)
+
+
 def gen_html(html_file, data):
    output_data = Renderer().render_path(
        get_file_in_script_dir("template.html"), {"data": data}
@ -51,40 +79,6 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
        svgfile.write(output_data)


-def get_percent_color(value: float) -> str:
-    """Return colorama ANSI escape character for the given decimal value."""
-    if value == 1.0:
-        return colorama.Fore.GREEN
-    if value > 0.8:
-        return colorama.Fore.YELLOW
-
-    return colorama.Fore.RED
-
-
-def percent_string(
-    ratio: float, is_effective: bool = False, is_plain: bool = False
-) -> str:
-    """Helper to construct a percentage string from the given ratio.
-    If is_effective (i.e. effective match), indicate that with the asterisk.
-    If is_plain, don't use colorama ANSI codes."""
-
-    percenttext = f"{(ratio * 100):.2f}%"
-    effective_star = "*" if is_effective else ""
-
-    if is_plain:
-        return percenttext + effective_star
-
-    return "".join(
-        [
-            get_percent_color(ratio),
-            percenttext,
-            colorama.Fore.RED if is_effective else "",
-            effective_star,
-            colorama.Style.RESET_ALL,
-        ]
-    )
-
-
 def print_match_verbose(match, show_both_addrs: bool = False, is_plain: bool = False):
    percenttext = percent_string(
        match.effective_ratio, match.is_effective_match, is_plain
@ -169,6 +163,16 @@ def virtual_address(value) -> int:
        type=virtual_address,
        help="Print assembly diff for specific function (original file's offset)",
    )
+    parser.add_argument(
+        "--json",
+        metavar="<file>",
+        help="Generate JSON file with match summary",
+    )
+    parser.add_argument(
+        "--diff",
+        metavar="<file>",
+        help="Diff against summary in JSON file",
+    )
    parser.add_argument(
        "--html",
        "-H",
@ -256,7 +260,7 @@ def main():
        htmlinsert = []

        for match in isle_compare.compare_all():
-            if not args.silent:
+            if not args.silent and args.diff is None:
                print_match_oneline(
                    match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
                )
@ -267,13 +271,16 @@ def main():
                total_effective_accuracy += match.effective_ratio

            # If html, record the diffs to an HTML file
-            if args.html is not None:
            html_obj = {
                "address": f"0x{match.orig_addr:x}",
+                "recomp": f"0x{match.recomp_addr:x}",
                "name": match.name,
                "matching": match.effective_ratio,
            }

+            if match.is_effective_match:
+                html_obj["effective"] = True
+
            if match.udiff is not None:
                html_obj["diff"] = "\n".join(match.udiff)

@ -282,8 +289,24 @@ def main():

            htmlinsert.append(html_obj)

+        # Compare with saved diff report.
+        if args.diff is not None:
+            with open(args.diff, "r", encoding="utf-8") as f:
+                saved_data = json.load(f)
+
+                diff_json(
+                    saved_data,
+                    htmlinsert,
+                    args.original,
+                    show_both_addrs=args.print_rec_addr,
+                    is_plain=args.no_color,
+                )
+
        ## Generate files and show summary.

+        if args.json is not None:
+            gen_json(args.json, args.original, htmlinsert)
+
        if args.html is not None:
            gen_html(args.html, json.dumps(htmlinsert))