reccmp: New diff option (#563)

2024-12-01 20:07:20 -05:00 · 2024-02-15 03:33:40 -05:00 · 2024-02-15 03:33:40 -05:00 · 8aa9d9a8b3
commit 8aa9d9a8b3
parent 271df035fd
2 changed files with 283 additions and 46 deletions
--- a/tools/isledecomp/isledecomp/utils.py
+++ b/tools/isledecomp/isledecomp/utils.py
@ -1,5 +1,7 @@
 import os
 import sys
 from datetime import datetime
 import logging
 import colorama
@ -27,5 +29,217 @@ def print_diff(udiff, plain):
    return has_diff
 def get_percent_color(value: float) -> str:
    """Pick the colorama foreground color code used to display a match ratio.
    A ratio of exactly 1.0 is green, anything above 0.8 is yellow and
    everything else is red."""
    if value == 1.0:
        color = colorama.Fore.GREEN
    elif value > 0.8:
        color = colorama.Fore.YELLOW
    else:
        color = colorama.Fore.RED
    return color
 def percent_string(
    ratio: float, is_effective: bool = False, is_plain: bool = False
 ) -> str:
    """Format a ratio as a percentage with two decimal places.
    An effective match gets a trailing asterisk. Unless is_plain is set,
    the text is wrapped in colorama ANSI codes: the percentage takes the
    color chosen by get_percent_color, the asterisk is red, and the style
    is reset at the end."""
    star = "*" if is_effective else ""
    pct = f"{(ratio * 100):.2f}%"
    if is_plain:
        # Plain output carries no escape sequences at all.
        return pct + star
    pieces = [get_percent_color(ratio), pct]
    if is_effective:
        # Switch to red just for the asterisk.
        pieces.append(colorama.Fore.RED)
    pieces.append(star)
    pieces.append(colorama.Style.RESET_ALL)
    return "".join(pieces)
 def diff_json_display(show_both_addrs: bool = False, is_plain: bool = False):
    """Build the function that renders one line of the diff report,
    honoring the reccmp display preferences given here."""
    def describe(entry, missing_label) -> str:
        # Text for one side of the comparison: the placeholder when the
        # entry does not exist, "stub" for stubs, otherwise the percentage.
        if entry is None:
            return missing_label
        if entry.get("stub", False):
            return "stub"
        return percent_string(
            entry["matching"], entry.get("effective", False), is_plain
        )
    def formatter(orig_addr, saved, new) -> str:
        new_pct = describe(new, "gone")
        old_pct = describe(saved, "new")
        # Prefer the current name of the function when available. The
        # original address is the key, so a rename is not of interest here.
        name = "" if new is None else new.get("name", "")
        if saved is not None and name == "":
            name = saved.get("name", "")
        if not show_both_addrs:
            addr_string = orig_addr
        else:
            recomp_addr = "n/a" if new is None else new.get("recomp", "n/a")
            addr_string = f"{orig_addr} / {recomp_addr:10}"
        # colorama's ANSI codes count towards string length, so lining the
        # fields up in columns with f-string padding would take extra work.
        return f"{addr_string} - {name} ({old_pct} -> {new_pct})"
    return formatter
 def _diff_addr_sort_key(addr):
    """Sort key that orders "0x..." address strings by numeric value.
    Plain string sorting would put "0x10000" before "0x2000". Keys that
    cannot be parsed as hexadecimal are ordered as text, after all
    parseable addresses."""
    try:
        return (0, int(addr, 16), "")
    except (TypeError, ValueError):
        return (1, 0, str(addr))
 def diff_json(
    saved_data,
    new_data,
    orig_file: str,
    show_both_addrs: bool = False,
    is_plain: bool = False,
 ):
    """Using a saved copy of the diff summary and the current data, print a
    report showing which functions/symbols have changed match percentage.
    Args:
        saved_data: Parsed saved report. Read here: "file" (lower-case base
            name of the binary it was generated for), optional "timestamp"
            and "data" (list of per-function objects).
        new_data: List of per-function objects for the current run.
        orig_file: Path of the original binary being compared right now.
        show_both_addrs: Passed on to diff_json_display.
        is_plain: Passed on to diff_json_display.
    Each per-function object is read for "address" and "matching" and,
    optionally, "stub" and "effective".
    Returns:
        None. The report is printed to stdout. If the saved report belongs
        to a different binary, an error is logged and nothing is printed."""
    # Don't try to diff a report generated for a different binary file
    base_file = os.path.basename(orig_file).lower()
    if saved_data.get("file") != base_file:
        logging.getLogger().error(
            "Diff report for '%s' does not match current file '%s'",
            saved_data.get("file"),
            base_file,
        )
        return
    if "timestamp" in saved_data:
        # Drop microseconds so the elapsed time prints as H:MM:SS.
        now = datetime.now().replace(microsecond=0)
        then = datetime.fromtimestamp(saved_data["timestamp"]).replace(microsecond=0)
        print(
            " ".join(
                [
                    "Saved diff report generated",
                    then.strftime("%B %d %Y, %H:%M:%S"),
                    f"({str(now - then)} ago)",
                ]
            )
        )
        print()
    # Convert to dict, using orig_addr as key
    saved_invert = {obj["address"]: obj for obj in saved_data["data"]}
    new_invert = {obj["address"]: obj for obj in new_data}
    all_addrs = set(saved_invert).union(new_invert)
    # Put all the information in one place so we can decide how each item
    # changed. Addresses are ordered by numeric value so that the report is
    # listed in address order even when the strings differ in length.
    combined = {
        addr: (
            saved_invert.get(addr),
            new_invert.get(addr),
        )
        for addr in sorted(all_addrs, key=_diff_addr_sort_key)
    }
    # The criteria for diff judgement is in these dict comprehensions:
    # Any function not in the saved file
    new_functions = {
        key: (saved, new) for key, (saved, new) in combined.items() if saved is None
    }
    # Any function present in the saved file but missing from the current data
    # or a non-stub -> stub conversion
    dropped_functions = {
        key: (saved, new)
        for key, (saved, new) in combined.items()
        if new is None
        or (
            saved is not None
            and new.get("stub", False)
            and not saved.get("stub", False)
        )
    }
    # TODO: move these two into functions if the assessment gets more complex
    # Any function with increased match percentage
    # or stub -> non-stub conversion
    improved_functions = {
        key: (saved, new)
        for key, (saved, new) in combined.items()
        if saved is not None
        and new is not None
        and (
            new["matching"] > saved["matching"]
            or (not new.get("stub", False) and saved.get("stub", False))
        )
    }
    # Any non-stub function with decreased match percentage
    degraded_functions = {
        key: (saved, new)
        for key, (saved, new) in combined.items()
        if saved is not None
        and new is not None
        and new["matching"] < saved["matching"]
        and not saved.get("stub")
        and not new.get("stub")
    }
    # Any function that is a 100% match in both reports but whose
    # "effective" flag differs between them
    entropy_functions = {
        key: (saved, new)
        for key, (saved, new) in combined.items()
        if saved is not None
        and new is not None
        and new["matching"] == 1.0
        and saved["matching"] == 1.0
        and new.get("effective", False) != saved.get("effective", False)
    }
    get_diff_str = diff_json_display(show_both_addrs, is_plain)
    for diff_name, diff_dict in [
        ("New", new_functions),
        ("Increased", improved_functions),
        ("Decreased", degraded_functions),
        ("Dropped", dropped_functions),
        ("Compiler entropy", entropy_functions),
    ]:
        # Leave out categories that have nothing to report.
        if not diff_dict:
            continue
        print(f"{diff_name} ({len(diff_dict)}):")
        for addr, (saved, new) in diff_dict.items():
            print(get_diff_str(addr, saved, new))
        print()
 def get_file_in_script_dir(fn):
    """Return the path of fn inside the directory that holds the running
    script, taken from sys.argv[0]."""
    script_path = os.path.abspath(sys.argv[0])
    script_dir = os.path.dirname(script_path)
    return os.path.join(script_dir, fn)
--- a/tools/reccmp/reccmp.py
+++ b/tools/reccmp/reccmp.py
@ -5,11 +5,14 @@
 import json
 import logging
 import os
 from datetime import datetime
 from isledecomp import (
    Bin,
    get_file_in_script_dir,
    print_diff,
    diff_json,
    percent_string,
 )
 from isledecomp.compare import Compare as IsleCompare
 from isledecomp.types import SymbolType
@ -19,6 +22,31 @@
 colorama.init()
 def gen_json(json_file: str, orig_file: str, data):
    """Write the comparison summary to json_file as JSON.
    The output holds the lower-case base name of orig_file, a format
    version, the time of generation and every entry of data without its
    "diff" field."""
    # If the structure of the JSON file ever changes, reading an older
    # format file in the CI action would become a problem. Record which
    # version is being generated so that can be dealt with down the road.
    json_format_version = 1
    # Copy each entry, leaving out the diff text.
    reduced_data = []
    for obj in data:
        trimmed = {}
        for key, value in obj.items():
            if key == "diff":
                continue
            trimmed[key] = value
        reduced_data.append(trimmed)
    with open(json_file, "w", encoding="utf-8") as f:
        summary = {
            "file": os.path.basename(orig_file).lower(),
            "format": json_format_version,
            "timestamp": datetime.now().timestamp(),
            "data": reduced_data,
        }
        json.dump(summary, f)
 def gen_html(html_file, data):
    output_data = Renderer().render_path(
        get_file_in_script_dir("template.html"), {"data": data}
@ -51,40 +79,6 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
        svgfile.write(output_data)
 def get_percent_color(value: float) -> str:
    """Map a match ratio to a colorama foreground color code: green for
    exactly 1.0, yellow for anything above 0.8, red otherwise."""
    if value != 1.0:
        return colorama.Fore.YELLOW if value > 0.8 else colorama.Fore.RED
    return colorama.Fore.GREEN
 def percent_string(
    ratio: float, is_effective: bool = False, is_plain: bool = False
 ) -> str:
    """Render the ratio as a percentage string with two decimals.
    When is_effective is set (an effective match) an asterisk is appended.
    When is_plain is set, no colorama ANSI codes are included."""
    marker = "*" if is_effective else ""
    text = f"{(ratio * 100):.2f}%"
    if is_plain:
        return text + marker
    # Colored form: ratio color, percentage, red for the marker, then reset.
    marker_color = colorama.Fore.RED if is_effective else ""
    parts = (
        get_percent_color(ratio),
        text,
        marker_color,
        marker,
        colorama.Style.RESET_ALL,
    )
    return "".join(parts)
 def print_match_verbose(match, show_both_addrs: bool = False, is_plain: bool = False):
    percenttext = percent_string(
        match.effective_ratio, match.is_effective_match, is_plain
@ -169,6 +163,16 @@ def virtual_address(value) -> int:
        type=virtual_address,
        help="Print assembly diff for specific function (original file's offset)",
    )
    parser.add_argument(
        "--json",
        metavar="<file>",
        help="Generate JSON file with match summary",
    )
    parser.add_argument(
        "--diff",
        metavar="<file>",
        help="Diff against summary in JSON file",
    )
    parser.add_argument(
        "--html",
        "-H",
@ -256,7 +260,7 @@ def main():
        htmlinsert = []
        for match in isle_compare.compare_all():
-            if not args.silent:
+            if not args.silent and args.diff is None:
                print_match_oneline(
                    match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
                )
@ -267,13 +271,16 @@ def main():
                total_effective_accuracy += match.effective_ratio
            # If html, record the diffs to an HTML file
            if args.html is not None:
            html_obj = {
                "address": f"0x{match.orig_addr:x}",
                "recomp": f"0x{match.recomp_addr:x}",
                "name": match.name,
                "matching": match.effective_ratio,
            }
            if match.is_effective_match:
                html_obj["effective"] = True
            if match.udiff is not None:
                html_obj["diff"] = "\n".join(match.udiff)
@ -282,8 +289,24 @@ def main():
            htmlinsert.append(html_obj)
        # Compare with saved diff report.
        if args.diff is not None:
            with open(args.diff, "r", encoding="utf-8") as f:
                saved_data = json.load(f)
                diff_json(
                    saved_data,
                    htmlinsert,
                    args.original,
                    show_both_addrs=args.print_rec_addr,
                    is_plain=args.no_color,
                )
        ## Generate files and show summary.
        if args.json is not None:
            gen_json(args.json, args.original, htmlinsert)
        if args.html is not None:
            gen_html(args.html, json.dumps(htmlinsert))