isle/tools/datacmp.py

372 lines
13 KiB
Python
Raw Normal View History

# (New) Data comparison.
import os
import argparse
import logging
from enum import Enum
from typing import Iterable, List, NamedTuple, Optional, Tuple
from struct import unpack
from isledecomp.compare import Compare as IsleCompare
from isledecomp.compare.db import MatchInfo
from isledecomp.cvdump import Cvdump
from isledecomp.cvdump.types import (
CvdumpKeyError,
CvdumpIntegrityError,
)
from isledecomp.bin import Bin as IsleBin
import colorama
colorama.just_fix_windows_console()
# Ignore all compare-db messages.
logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Comparing data values.")
parser.add_argument(
"original", metavar="original-binary", help="The original binary"
)
parser.add_argument(
"recompiled", metavar="recompiled-binary", help="The recompiled binary"
)
parser.add_argument(
"pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
)
parser.add_argument(
"decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
)
parser.add_argument(
"-v",
"--verbose",
action=argparse.BooleanOptionalAction,
default=False,
help="",
)
parser.add_argument(
"--no-color", "-n", action="store_true", help="Do not color the output"
)
2024-03-29 07:58:59 -04:00
parser.add_argument(
"--all",
"-a",
dest="show_all",
action="store_true",
help="Only show variables with a problem",
)
parser.add_argument(
"--print-rec-addr",
action="store_true",
help="Print addresses of recompiled functions too",
)
(args, _) = parser.parse_known_args()
if not os.path.isfile(args.original):
parser.error(f"Original binary {args.original} does not exist")
if not os.path.isfile(args.recompiled):
parser.error(f"Recompiled binary {args.recompiled} does not exist")
if not os.path.isfile(args.pdb):
parser.error(f"Symbols PDB {args.pdb} does not exist")
if not os.path.isdir(args.decomp_dir):
parser.error(f"Source directory {args.decomp_dir} does not exist")
return args
class CompareResult(Enum):
MATCH = 1
DIFF = 2
ERROR = 3
WARN = 4
class ComparedOffset(NamedTuple):
offset: int
# name is None for scalar types
name: Optional[str]
match: bool
values: Tuple[str, str]
class ComparisonItem(NamedTuple):
"""Each variable that was compared"""
orig_addr: int
recomp_addr: int
name: str
# The list of items that were compared.
# For a complex type, these are the members.
# For a scalar type, this is a list of size one.
# If we could not retrieve type information, this is
# a list of size one but without any specific type.
compared: List[ComparedOffset]
# If present, the error message from the types parser.
error: Optional[str] = None
# If true, there is no type specified for this variable. (i.e. non-public)
# In this case, we can only compare the raw bytes.
# This is different from the situation where a type id _is_ given, but
# we could not retrieve it for some reason. (This is an error.)
raw_only: bool = False
@property
def result(self) -> CompareResult:
if self.error is not None:
return CompareResult.ERROR
if all(c.match for c in self.compared):
return CompareResult.MATCH
# Prefer WARN for a diff without complete type information.
return CompareResult.WARN if self.raw_only else CompareResult.DIFF
def create_comparison_item(
var: MatchInfo,
compared: Optional[List[ComparedOffset]] = None,
error: Optional[str] = None,
raw_only: bool = False,
) -> ComparisonItem:
"""Helper to create the ComparisonItem from the fields in MatchInfo."""
if compared is None:
compared = []
return ComparisonItem(
orig_addr=var.orig_addr,
recomp_addr=var.recomp_addr,
name=var.name,
compared=compared,
error=error,
raw_only=raw_only,
)
def do_the_comparison(args: argparse.Namespace) -> Iterable[ComparisonItem]:
"""Run through each variable in our compare DB, then do the comparison
according to the variable's type. Emit the result."""
with IsleBin(args.original, find_str=True) as origfile, IsleBin(
args.recompiled
) as recompfile:
isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)
# TODO: We don't currently retain the type information of each variable
# in our compare DB. To get those, we build this mini-lookup table that
# maps recomp addresses to their type.
# We still need to build the full compare DB though, because we may
# need the matched symbols to compare pointers (e.g. on strings)
mini_cvdump = Cvdump(args.pdb).globals().types().run()
recomp_type_reference = {
recompfile.get_abs_addr(g.section, g.offset): g.type
for g in mini_cvdump.globals
if recompfile.is_valid_section(g.section)
}
for var in isle_compare.get_variables():
type_name = recomp_type_reference.get(var.recomp_addr)
# Start by assuming we can only compare the raw bytes
data_size = var.size
is_type_aware = type_name is not None
if is_type_aware:
try:
# If we are type-aware, we can get the precise
# data size for the variable.
data_type = mini_cvdump.types.get(type_name)
data_size = data_type.size
except (CvdumpKeyError, CvdumpIntegrityError) as ex:
yield create_comparison_item(var, error=repr(ex))
continue
orig_raw = origfile.read(var.orig_addr, data_size)
recomp_raw = recompfile.read(var.recomp_addr, data_size)
# The IMAGE_SECTION_HEADER defines the SizeOfRawData and VirtualSize for the section.
# If VirtualSize > SizeOfRawData, the section is comprised of the initialized data
# corresponding to bytes in the file, and the rest is padded with zeroes when
# Windows loads the image.
# The linker might place variables initialized to zero on the threshold between
# physical data and the virtual (uninitialized) data.
# If this happens (i.e. we get an incomplete read) we just do the same padding
# to prepare for the comparison.
if orig_raw is not None and len(orig_raw) < data_size:
orig_raw = orig_raw.ljust(data_size, b"\x00")
if recomp_raw is not None and len(recomp_raw) < data_size:
recomp_raw = recomp_raw.ljust(data_size, b"\x00")
# If one or both variables are entirely uninitialized
if orig_raw is None or recomp_raw is None:
# If both variables are uninitialized, we consider them equal.
match = orig_raw is None and recomp_raw is None
# We can match a variable initialized to all zeroes with
# an uninitialized variable, but this may or may not actually
# be correct, so we flag it for the user.
uninit_force_match = not match and (
(orig_raw is None and all(b == 0 for b in recomp_raw))
or (recomp_raw is None and all(b == 0 for b in orig_raw))
)
orig_value = "(uninitialized)" if orig_raw is None else "(initialized)"
recomp_value = (
"(uninitialized)" if recomp_raw is None else "(initialized)"
)
yield create_comparison_item(
var,
compared=[
ComparedOffset(
offset=0,
name=None,
match=match,
values=(orig_value, recomp_value),
)
],
raw_only=uninit_force_match,
)
continue
if not is_type_aware:
# If there is no specific type information available
# (i.e. if this is a static or non-public variable)
# then we can only compare the raw bytes.
yield create_comparison_item(
var,
compared=[
ComparedOffset(
offset=0,
name="(raw)",
match=orig_raw == recomp_raw,
values=(orig_raw, recomp_raw),
)
],
raw_only=True,
)
continue
# If we are here, we can do the type-aware comparison.
compared = []
2024-03-29 07:58:59 -04:00
compare_items = mini_cvdump.types.get_scalars_gapless(type_name)
format_str = mini_cvdump.types.get_format_string(type_name)
orig_data = unpack(format_str, orig_raw)
recomp_data = unpack(format_str, recomp_raw)
def pointer_display(addr: int, is_orig: bool) -> str:
"""Helper to streamline pointer textual display."""
if addr == 0:
return "nullptr"
ptr_match = (
isle_compare.get_by_orig(addr)
if is_orig
else isle_compare.get_by_recomp(addr)
)
if ptr_match is not None:
return f"Pointer to {ptr_match.match_name()}"
# This variable did not match if we do not have
# the pointer target in our DB.
return f"Unknown pointer 0x{addr:x}"
# Could zip here
for i, member in enumerate(compare_items):
if member.is_pointer:
match = isle_compare.is_pointer_match(orig_data[i], recomp_data[i])
value_a = pointer_display(orig_data[i], True)
value_b = pointer_display(recomp_data[i], False)
values = (value_a, value_b)
else:
match = orig_data[i] == recomp_data[i]
values = (orig_data[i], recomp_data[i])
compared.append(
ComparedOffset(
offset=member.offset,
name=member.name,
match=match,
values=values,
)
)
yield create_comparison_item(var, compared=compared)
def value_get(value: Optional[str], default: str):
return value if value is not None else default
def main():
args = parse_args()
def display_match(result: CompareResult) -> str:
"""Helper to return color string or not, depending on user preference"""
if args.no_color:
return result.name
match_color = (
colorama.Fore.GREEN
if result == CompareResult.MATCH
else (
colorama.Fore.YELLOW
if result == CompareResult.WARN
else colorama.Fore.RED
)
)
return f"{match_color}{result.name}{colorama.Style.RESET_ALL}"
2024-03-29 07:58:59 -04:00
var_count = 0
problems = 0
for item in do_the_comparison(args):
2024-03-29 07:58:59 -04:00
var_count += 1
if item.result in (CompareResult.DIFF, CompareResult.ERROR):
problems += 1
if not args.show_all and item.result == CompareResult.MATCH:
continue
address_display = (
f"0x{item.orig_addr:x} / 0x{item.recomp_addr:x}"
if args.print_rec_addr
else f"0x{item.orig_addr:x}"
)
print(f"{item.name[:80]} ({address_display}) ... {display_match(item.result)} ")
if item.error is not None:
print(f" {item.error}")
for c in item.compared:
if not args.verbose and c.match:
continue
(value_a, value_b) = c.values
if c.match:
print(f" {c.offset:5} {value_get(c.name, '(value)'):30} {value_a}")
else:
print(
f" {c.offset:5} {value_get(c.name, '(value)'):30} {value_a} : {value_b}"
)
2024-03-29 07:58:59 -04:00
if args.verbose:
print()
print(
f"{os.path.basename(args.original)} - Variables: {var_count}. Issues: {problems}"
)
return 0 if problems == 0 else 1
if __name__ == "__main__":
2024-03-29 07:58:59 -04:00
raise SystemExit(main())