Roadmap tool to compare binary structure (#479)

This commit is contained in:
MS 2024-01-22 10:15:12 -05:00 committed by GitHub
parent 05bc94f030
commit a65eb9a4e0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 320 additions and 3 deletions

View file

@ -299,8 +299,8 @@ set_property(TARGET lego1 PROPERTY SUFFIX ".DLL")
if (ISLE_BUILD_APP) if (ISLE_BUILD_APP)
add_executable(isle WIN32 add_executable(isle WIN32
ISLE/res/isle.rc ISLE/res/isle.rc
ISLE/isleapp.cpp
ISLE/define.cpp ISLE/define.cpp
ISLE/isleapp.cpp
) )
target_compile_definitions(isle PRIVATE ISLE_APP) target_compile_definitions(isle PRIVATE ISLE_APP)

View file

@ -1,6 +1,6 @@
import logging import logging
import struct import struct
from typing import List, Optional from typing import List, Optional, Tuple
from dataclasses import dataclass from dataclasses import dataclass
from collections import namedtuple from collections import namedtuple
@ -365,6 +365,14 @@ def get_abs_addr(self, section: int, offset: int) -> int:
into an absolute vaddr.""" into an absolute vaddr."""
return self.get_section_offset_by_index(section) + offset return self.get_section_offset_by_index(section) + offset
def get_relative_addr(self, addr: int) -> Tuple[int, int]:
    """Translate an absolute virtual address into a (section, offset) pair.

    Sections are numbered from 1 in the order they appear in self.sections,
    and the offset is relative to that section's virtual_address.
    When no section contains the address, (0, 0) is returned.
    """
    candidates = (
        (number, sect)
        for number, sect in enumerate(self.sections, start=1)
        if sect.contains_vaddr(addr)
    )
    # The generator is lazy, so the search stops at the first matching section.
    hit = next(candidates, None)
    if hit is None:
        return (0, 0)

    number, sect = hit
    return (number, addr - sect.virtual_address)
def get_raw_addr(self, vaddr: int) -> int: def get_raw_addr(self, vaddr: int) -> int:
"""Returns the raw offset in the PE binary for the given virtual address.""" """Returns the raw offset in the PE binary for the given virtual address."""
self._set_section_for_vaddr(vaddr) self._set_section_for_vaddr(vaddr)

View file

@ -409,6 +409,9 @@ def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:
## Public API ## Public API
def get_all(self) -> List[MatchInfo]:
    """Return whatever the compare database's get_all() query yields,
    without any filtering applied here."""
    return self._db.get_all()
def get_functions(self) -> List[MatchInfo]: def get_functions(self) -> List[MatchInfo]:
return self._db.get_matches_by_type(SymbolType.FUNCTION) return self._db.get_matches_by_type(SymbolType.FUNCTION)

View file

@ -82,6 +82,17 @@ def get_unmatched_strings(self) -> List[str]:
return [string for (string,) in cur.fetchall()] return [string for (string,) in cur.fetchall()]
def get_all(self) -> List[MatchInfo]:
    """Return every row of the `symbols` table.

    Rows are ordered by original address; rows whose orig_addr is NULL
    are placed at the end. Each row is built by matchinfo_factory.
    """
    # NOTE(review): the NULLS LAST clause needs SQLite 3.30.0 or newer.
    cur = self._db.execute(
        """SELECT compare_type, orig_addr, recomp_addr, name, size
FROM `symbols`
ORDER BY orig_addr NULLS LAST
""",
    )
    # The factory is applied when the rows are fetched below.
    cur.row_factory = matchinfo_factory
    return cur.fetchall()
def get_matches(self) -> Optional[MatchInfo]: def get_matches(self) -> Optional[MatchInfo]:
cur = self._db.execute( cur = self._db.execute(
"""SELECT compare_type, orig_addr, recomp_addr, name, size """SELECT compare_type, orig_addr, recomp_addr, name, size

View file

@ -39,6 +39,9 @@
r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)" r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
) )
# One entry of the MODULES section: a 4-character id followed by one or two
# quoted paths. With two paths the first is captured as `lib` and the second
# as `obj`; with a single path only `obj` is captured and `lib` is None.
# e.g. 0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"
# e.g. 0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"
_module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")
# User functions only # User functions only
LinesEntry = namedtuple("LinesEntry", "filename line_no section offset") LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
@ -52,13 +55,16 @@
SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name") SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
# (Estimated) size of any symbol # (Estimated) size of any symbol
SizeRefEntry = namedtuple("SizeRefEntry", "section offset size") SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")
# global variables # global variables
GdataEntry = namedtuple("GdataEntry", "section offset type name") GdataEntry = namedtuple("GdataEntry", "section offset type name")
# object file (and the lib file it came from, if any) linked into the binary
ModuleEntry = namedtuple("ModuleEntry", "id lib obj")
class CvdumpParser: class CvdumpParser:
# pylint: disable=too-many-instance-attributes
def __init__(self) -> None: def __init__(self) -> None:
self._section: str = "" self._section: str = ""
self._lines_function: Tuple[str, int] = ("", 0) self._lines_function: Tuple[str, int] = ("", 0)
@ -68,6 +74,7 @@ def __init__(self) -> None:
self.symbols = [] self.symbols = []
self.sizerefs = [] self.sizerefs = []
self.globals = [] self.globals = []
self.modules = []
def _lines_section(self, line: str): def _lines_section(self, line: str):
"""Parsing entries from the LINES section. We only care about the pairs of """Parsing entries from the LINES section. We only care about the pairs of
@ -144,12 +151,26 @@ def _section_contributions(self, line: str):
if (match := _section_contrib_regex.match(line)) is not None: if (match := _section_contrib_regex.match(line)) is not None:
self.sizerefs.append( self.sizerefs.append(
SizeRefEntry( SizeRefEntry(
module=int(match.group("module"), 16),
section=int(match.group("section"), 16), section=int(match.group("section"), 16),
offset=int(match.group("offset"), 16), offset=int(match.group("offset"), 16),
size=int(match.group("size"), 16), size=int(match.group("size"), 16),
) )
) )
def _modules_section(self, line: str):
    """Parse one line of the MODULES section and remember the object file
    (plus the lib file, when there is one) that was linked into the binary.
    The id on each line is the number that SECTION CONTRIBUTIONS entries
    (and perhaps other sections) use to refer back to the module."""
    found = _module_regex.match(line)
    if found is None:
        # Not a module entry; nothing to record.
        return

    # The id is printed in hexadecimal.
    module_id = int(found["id"], 16)
    entry = ModuleEntry(id=module_id, lib=found["lib"], obj=found["obj"])
    self.modules.append(entry)
def read_line(self, line: str): def read_line(self, line: str):
# Blank lines are there to help the reader; they have no context significance # Blank lines are there to help the reader; they have no context significance
if line.strip() == "": if line.strip() == "":
@ -174,6 +195,9 @@ def read_line(self, line: str):
elif self._section == "GLOBALS": elif self._section == "GLOBALS":
self._globals_section(line) self._globals_section(line)
elif self._section == "MODULES":
self._modules_section(line)
def read_lines(self, lines: Iterable[str]): def read_lines(self, lines: Iterable[str]):
for line in lines: for line in lines:
self.read_line(line) self.read_line(line)

View file

@ -13,6 +13,7 @@ class DumpOpt(Enum):
GLOBALS = 2 GLOBALS = 2
PUBLICS = 3 PUBLICS = 3
SECTION_CONTRIB = 4 SECTION_CONTRIB = 4
MODULES = 5
cvdump_opt_map = { cvdump_opt_map = {
@ -21,6 +22,7 @@ class DumpOpt(Enum):
DumpOpt.GLOBALS: "-g", DumpOpt.GLOBALS: "-g",
DumpOpt.PUBLICS: "-p", DumpOpt.PUBLICS: "-p",
DumpOpt.SECTION_CONTRIB: "-seccontrib", DumpOpt.SECTION_CONTRIB: "-seccontrib",
DumpOpt.MODULES: "-m",
} }
@ -49,6 +51,10 @@ def section_contributions(self):
self._options.add(DumpOpt.SECTION_CONTRIB) self._options.add(DumpOpt.SECTION_CONTRIB)
return self return self
def modules(self):
    """Add the MODULES dump option to this run and return self so that
    option calls can be chained."""
    self._options.add(DumpOpt.MODULES)
    return self
def cmd_line(self) -> List[str]: def cmd_line(self) -> List[str]:
cvdump_exe = lib_path_join("cvdump.exe") cvdump_exe = lib_path_join("cvdump.exe")
flags = [cvdump_opt_map[opt] for opt in self._options] flags = [cvdump_opt_map[opt] for opt in self._options]

265
tools/roadmap/roadmap.py Normal file
View file

@ -0,0 +1,265 @@
"""For all addresses matched by code annotations or recomp pdb,
report how "far off" the recomp symbol is from its proper place
in the original binary."""
import argparse
import csv
import logging
import os
from collections import namedtuple
from typing import Iterable, List, Optional, Tuple

from isledecomp import Bin as IsleBin
from isledecomp.cvdump import Cvdump
from isledecomp.compare import Compare as IsleCompare
from isledecomp.types import SymbolType
# Ignore all compare-db messages.
# With a NullHandler attached, records from this logger (and its children)
# always find a handler, so they never fall through to the logging module's
# last-resort stderr output.
# NOTE(review): propagation is left enabled, so a handler installed on an
# ancestor logger would still receive these records.
logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
def or_blank(value) -> str:
    """Render a value for text output: None becomes the empty string and
    anything else is passed through str()."""
    if value is None:
        return ""
    return str(value)
class ModuleMap:
    """Load a subset of sections from the pdb to allow you to look up the
    module (lib file and object file) that contributed a recomp address."""

    def __init__(self, pdb, binfile) -> None:
        """Run cvdump on `pdb` and index its section contributions.

        `binfile` is used to turn each contribution's (section, offset)
        into an absolute address; contributions whose section the binary
        does not consider valid are dropped.
        """
        cvdump = Cvdump(pdb).section_contributions().modules().run()

        # module id -> (lib, obj)
        self.module_lookup = {m.id: (m.lib, m.obj) for m in cvdump.modules}

        # (absolute start address, size in bytes, module id)
        self.section_contrib = [
            (
                binfile.get_abs_addr(sizeref.section, sizeref.offset),
                sizeref.size,
                sizeref.module,
            )
            for sizeref in cvdump.sizerefs
            if binfile.is_valid_section(sizeref.section)
        ]

    def get_module(self, addr: int) -> Optional[Tuple[Optional[str], str]]:
        """Return the (lib, obj) pair of the module whose contribution
        contains `addr`, or None when no known module covers it.

        `lib` is None for object files that were not pulled in from a lib.
        The half-open range [start, start + size) is used for each
        contribution. A contribution whose module id is missing from the
        lookup table is skipped and the scan continues.
        """
        for start, size, module_id in self.section_contrib:
            if start <= addr < start + size:
                if (module := self.module_lookup.get(module_id)) is not None:
                    return module

        return None
def print_sections(sections):
    """Print a table of section name, start address, virtual size and raw
    size (the numbers in hexadecimal), followed by a blank line.

    Each section must provide `name` (bytes, possibly NUL padded),
    `virtual_address`, `virtual_size` and `size_of_raw_data`.
    """
    # Build the header with the same column widths as the data rows so the
    # column separators line up with the rule and the rows underneath.
    titles = ("name", "start", "v.size", "raw size")
    print(" | ".join(f"{title:>8}" for title in titles))
    print("---------|----------|----------|----------")

    for sect in sections:
        # Section names are fixed-width and padded with NUL bytes.
        name = sect.name.decode("ascii").rstrip("\x00")
        print(
            f"{name:>8} | {sect.virtual_address:8x} | {sect.virtual_size:8x} | {sect.size_of_raw_data:8x}"
        )

    print()
def match_type_abbreviation(mtype: Optional[SymbolType]) -> str:
    """Abbreviate a SymbolType to the first three letters of its name, in
    lower case. None yields the empty string."""
    return "" if mtype is None else mtype.name.lower()[:3]
# One line of the report. The field order is also the column order of the
# CSV export.
RoadmapRow = namedtuple(
    "RoadmapRow",
    "orig_sect_ofs recomp_sect_ofs orig_addr recomp_addr "
    "displacement sym_type size name module",
)
def print_text_report(results: List[RoadmapRow]):
    """Print one line per row, including both the original and the recomp
    section:offset columns. Missing values are shown as blanks."""
    for row in results:
        cells = (
            f"{or_blank(row.orig_sect_ofs):14}",
            f"{or_blank(row.recomp_sect_ofs):14}",
            f"{or_blank(row.displacement):>8}",
            f"{row.sym_type:3}",
            f"{or_blank(row.size):6}",
            or_blank(row.name),
        )
        print(" ".join(cells))
def print_diff_report(results: List[RoadmapRow]):
    """Print the rows that have both an original and a recomp address.
    The output is meant to be saved and diffed between runs, so the recomp
    addresses themselves (which change all the time) are left out."""
    for row in results:
        if row.orig_addr is not None and row.recomp_addr is not None:
            cells = (
                f"{or_blank(row.orig_sect_ofs):14}",
                f"{or_blank(row.displacement):>8}",
                f"{row.sym_type:3}",
                f"{or_blank(row.size):6}",
                or_blank(row.name),
            )
            print(" ".join(cells))
def export_to_csv(csv_file: str, results: Iterable[Iterable[object]]):
    """Write the report rows to `csv_file` as UTF-8 CSV with a header line.

    `results` is normally a list of RoadmapRow, but any iterable of
    nine-value rows works. None is written as an empty field and every
    other value as its str() form.

    The csv module is used so that names containing commas, quotes or
    line breaks (C++ signatures, repr()'d strings) are quoted properly
    instead of spilling into neighbouring columns.
    """
    header = [
        "orig_sect_ofs",
        "recomp_sect_ofs",
        "orig_addr",
        "recomp_addr",
        "displacement",
        "row_type",
        "size",
        "name",
        "module",
    ]

    # newline="" hands line-ending control to the csv writer, as the csv
    # documentation requires. os.linesep keeps the platform's native line
    # ending that the previous plain-text writer produced.
    with open(csv_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, lineterminator=os.linesep)
        writer.writerow(header)
        writer.writerows(results)
def parse_args() -> argparse.Namespace:
    """Parse the command line and check that every input path exists.

    Unrecognised arguments are ignored. If one of the paths is missing the
    parser reports the error and exits.
    """
    parser = argparse.ArgumentParser(
        description="Show all addresses from original and recomp."
    )

    # (dest, metavar, help) for each required positional argument, in order.
    positionals = (
        ("original", "original-binary", "The original binary"),
        ("recompiled", "recompiled-binary", "The recompiled binary"),
        ("pdb", "recompiled-pdb", "The PDB of the recompiled binary"),
        ("decomp_dir", "decomp-dir", "The decompiled source tree"),
    )
    for dest, metavar, help_text in positionals:
        parser.add_argument(dest, metavar=metavar, help=help_text)

    parser.add_argument("--csv", metavar="<file>", help="If set, export to CSV")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show recomp addresses in output"
    )

    args, _ = parser.parse_known_args()

    # (existence test, path, label used in the error message)
    required_paths = (
        (os.path.isfile, args.original, "Original binary"),
        (os.path.isfile, args.recompiled, "Recompiled binary"),
        (os.path.isfile, args.pdb, "Symbols PDB"),
        (os.path.isdir, args.decomp_dir, "Source directory"),
    )
    for exists, path, label in required_paths:
        if not exists(path):
            # parser.error() exits, so only the first problem is reported.
            parser.error(f"{label} {path} does not exist")

    return args
def main():
    """Build the roadmap report and print it or export it to CSV.

    Every entry known to the compare engine becomes one RoadmapRow holding
    its position in the original and the recompiled binary and, when both
    positions are in the same named section, the displacement between them.
    With --csv the rows are written to that file. Otherwise they are
    printed: the full text report with --verbose, the diff-friendly report
    without it.
    """
    args = parse_args()

    with IsleBin(args.original, find_str=True) as orig_bin, IsleBin(
        args.recompiled
    ) as recomp_bin:
        engine = IsleCompare(orig_bin, recomp_bin, args.pdb, args.decomp_dir)

        module_map = ModuleMap(args.pdb, recomp_bin)

        def is_same_section(orig: int, recomp: int) -> bool:
            """Compare the section name instead of the index.
            LEGO1.dll adds extra sections for some reason. (Smacker library?)"""
            # Section numbers are 1-based, and an address that could not be
            # placed in any section comes back from get_relative_addr() as
            # section 0. Without this guard, 0 - 1 == -1 would silently pick
            # the *last* section and could report a bogus match.
            if orig < 1 or recomp < 1:
                return False

            try:
                orig_name = orig_bin.sections[orig - 1].name
                recomp_name = recomp_bin.sections[recomp - 1].name
                return orig_name == recomp_name
            except IndexError:
                return False

        def to_roadmap_row(match):
            """Convert one compare-engine entry into a RoadmapRow."""
            orig_sect = None
            orig_ofs = None
            orig_sect_ofs = None
            recomp_sect = None
            recomp_ofs = None
            recomp_sect_ofs = None
            orig_addr = None
            recomp_addr = None
            displacement = None
            module_name = None

            if match.recomp_addr is not None:
                if (module_ref := module_map.get_module(match.recomp_addr)) is not None:
                    # Only the object file name is reported; the lib is dropped.
                    (_, module_name) = module_ref

            row_type = match_type_abbreviation(match.compare_type)
            # repr() keeps string contents on one line with escapes visible.
            name = (
                repr(match.name)
                if match.compare_type == SymbolType.STRING
                else match.name
            )

            if match.orig_addr is not None:
                orig_addr = match.orig_addr
                (orig_sect, orig_ofs) = orig_bin.get_relative_addr(match.orig_addr)
                orig_sect_ofs = f"{orig_sect:04}:{orig_ofs:08x}"

            if match.recomp_addr is not None:
                recomp_addr = match.recomp_addr
                (recomp_sect, recomp_ofs) = recomp_bin.get_relative_addr(
                    match.recomp_addr
                )
                recomp_sect_ofs = f"{recomp_sect:04}:{recomp_ofs:08x}"

            # A displacement only makes sense when both addresses are known
            # and live in the same named section.
            if (
                orig_sect is not None
                and recomp_sect is not None
                and is_same_section(orig_sect, recomp_sect)
            ):
                displacement = recomp_ofs - orig_ofs

            return RoadmapRow(
                orig_sect_ofs,
                recomp_sect_ofs,
                orig_addr,
                recomp_addr,
                displacement,
                row_type,
                match.size,
                name,
                module_name,
            )

        results = list(map(to_roadmap_row, engine.get_all()))

        if args.csv is None:
            if args.verbose:
                print("ORIG sections:")
                print_sections(orig_bin.sections)

                print("RECOMP sections:")
                print_sections(recomp_bin.sections)

                print_text_report(results)
            else:
                print_diff_report(results)

        if args.csv is not None:
            export_to_csv(args.csv, results)
# Run the report only when this file is executed as a script.
if __name__ == "__main__":
    main()