Python Linting and Code Formatting (#298)

* Create common print_diff function

* Add pylint and black

* Fix linting, move classes to utils

* Add black/pylint to github actions

* Fix linting

* Move Bin and SymInfo into their own files

* Split out format

* Tidy up workflows and pip, add readme

* Lint tests, add tests to readme
This commit is contained in:
Thomas Phillips 2023-11-26 07:27:42 +13:00 committed by GitHub
parent fb0d1ccb62
commit b14116cc93
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 1675 additions and 789 deletions

View file

@ -70,15 +70,14 @@ jobs:
path: legobin path: legobin
key: legobin key: legobin
- name: Build isledecomp library - name: Install python packages
shell: bash shell: bash
run: | run: |
pip install tools/isledecomp pip install -r tools/requirements.txt
- name: Summarize Accuracy - name: Summarize Accuracy
shell: bash shell: bash
run: | run: |
pip install -r tools/reccmp/requirements.txt
python3 tools/reccmp/reccmp.py -S ISLEPROGRESS.SVG --svg-icon tools/reccmp/isle.png -H ISLEPROGRESS.HTML legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB . | tee ISLEPROGRESS.TXT python3 tools/reccmp/reccmp.py -S ISLEPROGRESS.SVG --svg-icon tools/reccmp/isle.png -H ISLEPROGRESS.HTML legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB . | tee ISLEPROGRESS.TXT
python3 tools/reccmp/reccmp.py -S LEGO1PROGRESS.SVG -T 1929 --svg-icon tools/reccmp/lego1.png -H LEGO1PROGRESS.HTML legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB . | tee LEGO1PROGRESS.TXT python3 tools/reccmp/reccmp.py -S LEGO1PROGRESS.SVG -T 1929 --svg-icon tools/reccmp/lego1.png -H LEGO1PROGRESS.HTML legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB . | tee LEGO1PROGRESS.TXT

View file

@ -20,3 +20,20 @@ jobs:
LEGO1/realtime/*.cpp LEGO1/realtime/*.h \ LEGO1/realtime/*.cpp LEGO1/realtime/*.h \
LEGO1/tgl/*.h \ LEGO1/tgl/*.h \
LEGO1/viewmanager/*.cpp LEGO1/viewmanager/*.h LEGO1/viewmanager/*.cpp LEGO1/viewmanager/*.h
python-format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install python libraries
shell: bash
run: |
pip install black pylint pytest -r tools/requirements.txt
- name: Run pylint and black
shell: bash
run: |
pylint tools --ignore=build
black --check tools

View file

@ -9,12 +9,11 @@ jobs:
steps: steps:
- uses: actions/checkout@v3 - uses: actions/checkout@v3
- name: Build isledecomp library - name: Install python libraries
run: | run: |
pip install tools/isledecomp pip install -r tools/requirements.txt
- name: Run checkorder.py - name: Run checkorder.py
run: | run: |
pip install -r tools/checkorder/requirements.txt
python3 tools/checkorder/checkorder.py --verbose --enforce ISLE python3 tools/checkorder/checkorder.py --verbose --enforce ISLE
python3 tools/checkorder/checkorder.py --verbose --enforce LEGO1 python3 tools/checkorder/checkorder.py --verbose --enforce LEGO1

4
.gitignore vendored
View file

@ -16,6 +16,6 @@ ISLE.EXE
LEGO1.DLL LEGO1.DLL
build/ build/
*.swp *.swp
LEGO1PROGRESS.HTML LEGO1PROGRESS.*
LEGO1PROGRESS.SVG ISLEPROGRESS.*
*.pyc *.pyc

635
.pylintrc Normal file
View file

@ -0,0 +1,635 @@
[MAIN]
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Clear in-memory caches upon conclusion of linting. Useful if running pylint
# in a server-like mode.
clear-cache-post-run=no
# Load and enable all available extensions. Use --list-extensions to see a list
# all available extensions.
#enable-all-extensions=
# In error mode, messages with a category besides ERROR or FATAL are
# suppressed, and no reports are done by default. Error mode is compatible with
# disabling specific errors.
#errors-only=
# Always return a 0 (non-error) status code, even if lint errors are found.
# This is primarily useful in continuous integration scripts.
#exit-zero=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
# for backward compatibility.)
extension-pkg-whitelist=
# Return non-zero exit code if any of these messages/categories are detected,
# even if score is above --fail-under value. Syntax same as enable. Messages
# specified are enabled, while categories only check already-enabled messages.
fail-on=
# Specify a score threshold under which the program will exit with error.
fail-under=10
# Interpret the stdin as a python script, whose filename needs to be passed as
# the module_or_package argument.
#from-stdin=
# Files or directories to be skipped. They should be base names, not paths.
ignore=CVS
# Add files or directories matching the regular expressions patterns to the
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\\' represents the directory delimiter on Windows systems,
# it can't be used as an escape character.
ignore-paths=
# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
# Emacs file locks
ignore-patterns=^\.#
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use, and will cap the count on Windows to
# avoid hangs.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.11
# Discover python modules and packages in the file system subtree.
recursive=no
# Add paths to the list of the source roots. Supports globbing patterns. The
# source root is an absolute path or a path relative to the current working
# directory used to determine a package namespace for modules located under the
# source root.
source-roots=
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# In verbose mode, extra non-checker-related info will be displayed.
#verbose=
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style. If left empty, argument names will be checked with the set
# naming style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style. If left empty, attribute names will be checked with the set naming
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style. If left empty, class attribute names will be checked
# with the set naming style.
#class-attribute-rgx=
# Naming style matching correct class constant names.
class-const-naming-style=UPPER_CASE
# Regular expression matching correct class constant names. Overrides class-
# const-naming-style. If left empty, class constant names will be checked with
# the set naming style.
#class-const-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style. If left empty, class names will be checked with the set naming style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=snake_case
# Regular expression matching correct constant names. Overrides const-naming-
# style. If left empty, constant names will be checked with the set naming
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style. If left empty, function names will be checked with the set
# naming style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style. If left empty, inline iteration names will be checked
# with the set naming style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style. If left empty, method names will be checked with the set naming style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style. If left empty, module names will be checked with the set naming style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Regular expression matching correct type alias names. If left empty, type
# alias names will be checked with the set naming style.
#typealias-rgx=
# Regular expression matching correct type variable names. If left empty, type
# variable names will be checked with the set naming style.
#typevar-rgx=
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style. If left empty, variable names will be checked with the set
# naming style.
#variable-rgx=
[CLASSES]
# Warn about protected attribute access inside special methods
check-protected-access-in-special-methods=no
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
asyncSetUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[DESIGN]
# List of regular expressions of class ancestor names to ignore when counting
# public methods (see R0903)
exclude-too-few-public-methods=
# List of qualified class names to ignore when counting class parents (see
# R0901)
ignored-parents=
# Maximum number of arguments for function / method.
max-args=6
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=30
# Maximum number of locals for function / method body.
max-locals=30
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=75
# Minimum number of public methods for a class (see R0903).
min-public-methods=0
[EXCEPTIONS]
# Exceptions that will emit a warning when caught.
overgeneral-exceptions=builtins.BaseException,builtins.Exception
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=2
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=200
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow explicit reexports by alias from a package __init__.
allow-reexport-from-package=no
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=
# Output a graph (.gv or any supported image format) of external dependencies
# to the given file (report RP0402 must not be disabled).
ext-import-graph=
# Output a graph (.gv or any supported image format) of all (i.e. internal and
# external) dependencies to the given file (report RP0402 must not be
# disabled).
import-graph=
# Output a graph (.gv or any supported image format) of internal dependencies
# to the given file (report RP0402 must not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
# UNDEFINED.
confidence=HIGH,
CONTROL_FLOW,
INFERENCE,
INFERENCE_FAILURE,
UNDEFINED
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
missing-class-docstring,
missing-function-docstring,
missing-module-docstring,
fixme
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[METHOD_ARGS]
# List of qualified names (i.e., library.method) which require a timeout
# parameter e.g. 'requests.api.get,requests.api.post'
timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take in consideration.
notes-rgx=
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit,argparse.parse_error
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'fatal', 'error', 'warning', 'refactor',
# 'convention', and 'info' which contain the number of messages in each
# category, as well as 'statement' which is the total number of statements
# analyzed. This score is used by the global evaluation report (RP0004).
evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
#output-format=
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[SIMILARITIES]
# Comments are removed from the similarity computation
ignore-comments=yes
# Docstrings are removed from the similarity computation
ignore-docstrings=yes
# Imports are removed from the similarity computation
ignore-imports=yes
# Signatures are removed from the similarity computation
ignore-signatures=yes
# Minimum lines number of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. No available dictionaries : You need to install
# both the python package and the system dependency for enchant to work..
spelling-dict=
# List of comma separated words that should be considered directives if they
# appear at the beginning of a comment and should not be checked.
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of symbolic message names to ignore for Mixin members.
ignored-checks-for-mixins=no-member,
not-async-context-manager,
not-context-manager,
attribute-defined-outside-init
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# Regex pattern to define which classes are considered mixins.
mixin-class-rgx=.*[Mm]ixin
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of names allowed to shadow builtins
allowed-redefined-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io

2
pyproject.toml Normal file
View file

@ -0,0 +1,2 @@
[flake8]
max-line-length = 120

50
tools/README.md Normal file
View file

@ -0,0 +1,50 @@
# LEGO Island Decompilation Tools
These are a set of Python tools for helping with the decomp project
## Installing
Use pip to install the required packages:
```
pip install -r tools/requirements.txt
```
## reccmp
This is a script to compare the original EXE or DLL with a recompiled EXE or DLL, provided a .PDB file
## verexp
This verifies exports by comparing the exports of an original DLL and the recompiled DLL
## checkorder
This checks the order of C++ source and header files to make sure the functions are in order
## isledecomp
This is a library that is used by the above scripts. It has a collection of useful classes and functions
### Testing
`isledecomp` has a small suite of tests. Install pytest and run it, passing in the directory:
```
pip install pytest
pytest tools/isledecomp/tests/
```
## Development
In order to keep the code clean and consistent, we use `pylint` and `black`:
```
pip install black pylint
```
### To run pylint (ignores build and virtualenv):
```
pylint tools/ --ignore=build,bin,lib
```
### To check code formatting without rewriting files:
```
black --check tools/
```
### To apply code formatting:
```
black tools/
```

View file

@ -1,14 +1,9 @@
import os import os
import sys import sys
import argparse import argparse
from isledecomp.dir import ( from isledecomp.dir import walk_source_dir, is_file_cpp
walk_source_dir,
is_file_cpp
)
from isledecomp.parser import find_code_blocks from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import ( from isledecomp.parser.util import is_exact_offset_comment
is_exact_offset_comment
)
def sig_truncate(sig: str) -> str: def sig_truncate(sig: str) -> str:
@ -21,12 +16,14 @@ def check_file(filename: str, verbose: bool = False) -> bool:
"""Open and read the given file, then check whether the code blocks """Open and read the given file, then check whether the code blocks
are in order. If verbose, print each block.""" are in order. If verbose, print each block."""
with open(filename, 'r') as f: with open(filename, "r", encoding="utf-8") as f:
code_blocks = find_code_blocks(f) code_blocks = find_code_blocks(f)
bad_comments = [(block.start_line, block.offset_comment) bad_comments = [
(block.start_line, block.offset_comment)
for block in code_blocks for block in code_blocks
if not is_exact_offset_comment(block.offset_comment)] if not is_exact_offset_comment(block.offset_comment)
]
just_offsets = [block.offset for block in code_blocks] just_offsets = [block.offset for block in code_blocks]
sorted_offsets = sorted(just_offsets) sorted_offsets = sorted(just_offsets)
@ -35,8 +32,7 @@ def check_file(filename: str, verbose: bool = False) -> bool:
# If we detect inexact comments, don't print anything unless we are # If we detect inexact comments, don't print anything unless we are
# in verbose mode. If the file is out of order, we always print the # in verbose mode. If the file is out of order, we always print the
# file name. # file name.
should_report = ((len(bad_comments) > 0 and verbose) should_report = (len(bad_comments) > 0 and verbose) or file_out_of_order
or file_out_of_order)
if not should_report and not file_out_of_order: if not should_report and not file_out_of_order:
return False return False
@ -49,19 +45,21 @@ def check_file(filename: str, verbose: bool = False) -> bool:
prev_offset = 0 prev_offset = 0
for block in code_blocks: for block in code_blocks:
msg = ' '.join([ msg = " ".join(
' ' if block.offset > prev_offset else '!', [
f'{block.offset:08x}', " " if block.offset > prev_offset else "!",
f'{block.end_line - block.start_line:4} lines', f"{block.offset:08x}",
f'{order_lookup[block.offset]:3}', f"{block.end_line - block.start_line:4} lines",
' ', f"{order_lookup[block.offset]:3}",
" ",
sig_truncate(block.signature), sig_truncate(block.signature),
]) ]
)
print(msg) print(msg)
prev_offset = block.offset prev_offset = block.offset
for (line_no, line) in bad_comments: for line_no, line in bad_comments:
print(f'* line {line_no:3} bad offset comment ({line})') print(f"* line {line_no:3} bad offset comment ({line})")
print() print()
@ -69,15 +67,25 @@ def check_file(filename: str, verbose: bool = False) -> bool:
def parse_args(test_args: list | None = None) -> dict: def parse_args(test_args: list | None = None) -> dict:
p = argparse.ArgumentParser() p = argparse.ArgumentParser(
p.add_argument('target', help='The file or directory to check.') description="Checks the source files to make sure the function offset comments are in order",
p.add_argument('--enforce', action=argparse.BooleanOptionalAction, )
p.add_argument("target", help="The file or directory to check.")
p.add_argument(
"--enforce",
action=argparse.BooleanOptionalAction,
default=False, default=False,
help='Fail with error code if target is out of order.') help="Fail with error code if target is out of order.",
p.add_argument('--verbose', action=argparse.BooleanOptionalAction, )
p.add_argument(
"--verbose",
action=argparse.BooleanOptionalAction,
default=False, default=False,
help=('Display each code block in the file and show ' help=(
'where each consecutive run of blocks is broken.')) "Display each code block in the file and show "
"where each consecutive run of blocks is broken."
),
)
if test_args is None: if test_args is None:
args = p.parse_args() args = p.parse_args()
@ -90,31 +98,33 @@ def parse_args(test_args: list | None = None) -> dict:
def main(): def main():
args = parse_args() args = parse_args()
if os.path.isdir(args['target']): if os.path.isdir(args["target"]):
files_to_check = list(walk_source_dir(args['target'])) files_to_check = list(walk_source_dir(args["target"]))
elif os.path.isfile(args['target']) and is_file_cpp(args['target']): elif os.path.isfile(args["target"]) and is_file_cpp(args["target"]):
files_to_check = [args['target']] files_to_check = [args["target"]]
else: else:
sys.exit('Invalid target') sys.exit("Invalid target")
files_out_of_order = 0 files_out_of_order = 0
for file in files_to_check: for file in files_to_check:
is_jumbled = check_file(file, args['verbose']) is_jumbled = check_file(file, args["verbose"])
if is_jumbled: if is_jumbled:
files_out_of_order += 1 files_out_of_order += 1
if files_out_of_order > 0: if files_out_of_order > 0:
error_message = ' '.join([ error_message = " ".join(
[
str(files_out_of_order), str(files_out_of_order),
'files are' if files_out_of_order > 1 else 'file is', "files are" if files_out_of_order > 1 else "file is",
'out of order' "out of order",
]) ]
)
print(error_message) print(error_message)
if files_out_of_order > 0 and args['enforce']: if files_out_of_order > 0 and args["enforce"]:
sys.exit(1) sys.exit(1)
if __name__ == '__main__': if __name__ == "__main__":
main() main()

View file

@ -1 +0,0 @@
isledecomp

View file

@ -0,0 +1,5 @@
from .bin import *
from .dir import *
from .parser import *
from .syminfo import *
from .utils import *

View file

@ -0,0 +1,47 @@
import struct
# Declare a class that can automatically convert virtual executable addresses
# to file addresses
class Bin:
    """Read bytes from an executable image by virtual address.

    Use as a context manager: entering opens ``filename`` in binary mode and
    reads three 32-bit little-endian header fields (image base, ``.text``
    virtual address, ``.text`` raw-data pointer) from fixed file offsets.
    Those values let ``get_addr`` translate a virtual address into a file
    offset. Leaving the context closes the file.

    ``logger`` only needs to provide a ``debug`` method with the standard
    ``logging`` call signature.
    """

    def __init__(self, filename, logger):
        self.logger = logger
        self.logger.debug('Parsing headers of "%s"... ', filename)
        self.filename = filename
        # Populated by __enter__; None while the file is not open.
        self.file = None
        self.imagebase = None
        self.textvirt = None
        self.textraw = None

    def __enter__(self):
        """Open the file and load the header fields.

        Raises:
            OSError: if the file cannot be opened or read.
            struct.error: if the file is too short to hold a header field.
        """
        self.logger.debug("Bin %s Enter", self.filename)
        # The handle has to outlive this method, so it cannot be wrapped in a
        # ``with`` statement here; __exit__ is responsible for closing it.
        self.file = open(self.filename, "rb")

        try:
            # HACK: Strictly, we should be parsing the header, but we know
            # where everything is in these two files so we just jump straight
            # there.
            self.imagebase = self._read_dword(0xB4)  # ImageBase
            self.textvirt = self._read_dword(0x184)  # .text VirtualAddress
            self.textraw = self._read_dword(0x18C)  # .text PointerToRawData
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so release the
            # handle here to avoid leaking it.
            self._close()
            raise

        self.logger.debug("... Parsing finished")
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the file. Exceptions from the ``with`` body are not suppressed."""
        self.logger.debug("Bin %s Exit", self.filename)
        self._close()

    def _read_dword(self, offset):
        """Return the unsigned 32-bit little-endian value at file ``offset``."""
        self.file.seek(offset)
        # Addresses and file pointers are unsigned; a signed read would turn
        # values >= 0x80000000 negative and corrupt get_addr().
        (value,) = struct.unpack("<I", self.file.read(4))
        return value

    def _close(self):
        """Close the underlying file, if open, and forget the handle."""
        if self.file:
            self.file.close()
            self.file = None

    def get_addr(self, virt):
        """Translate virtual address ``virt`` into an offset within the file."""
        return virt - self.imagebase - self.textvirt + self.textraw

    def read(self, offset, size):
        """Return up to ``size`` bytes starting at virtual address ``offset``."""
        self.file.seek(self.get_addr(offset))
        return self.file.read(size)

View file

@ -1,10 +1,48 @@
import os import os
import subprocess
import sys
from typing import Iterator from typing import Iterator
class WinePathConverter:
    """Convert file paths between their Unix form and their Wine (Windows) form.

    Paths under the directory given at construction are converted by string
    manipulation; anything else is delegated to the ``winepath`` program.

    Attributes:
        unix_cwd: the Unix directory passed to the constructor.
        win_cwd: the same directory as reported by ``winepath -w``.
    """

    def __init__(self, unix_cwd):
        """Store ``unix_cwd`` and ask ``winepath`` for its Windows equivalent."""
        self.unix_cwd = unix_cwd
        self.win_cwd = self._call_winepath_unix2win(self.unix_cwd)

    def get_wine_path(self, unix_fn: str) -> str:
        """Return the Windows-style path for the Unix path ``unix_fn``."""
        if unix_fn.startswith("./"):
            return self.win_cwd + "\\" + unix_fn[2:].replace("/", "\\")
        if self._is_inside(unix_fn, self.unix_cwd, "/"):
            relative = (
                unix_fn.removeprefix(self.unix_cwd).replace("/", "\\").lstrip("\\")
            )
            return self.win_cwd + "\\" + relative
        return self._call_winepath_unix2win(unix_fn)

    def get_unix_path(self, win_fn: str) -> str:
        """Return the Unix-style path for the Windows path ``win_fn``."""
        if win_fn.startswith((".\\", "./")):
            return self.unix_cwd + "/" + win_fn[2:].replace("\\", "/")
        if self._is_inside(win_fn, self.win_cwd, "\\"):
            # Strip the separator left over after removing the prefix so the
            # result does not contain "//" (mirrors get_wine_path).
            relative = (
                win_fn.removeprefix(self.win_cwd).replace("\\", "/").lstrip("/")
            )
            return self.unix_cwd + "/" + relative
        return self._call_winepath_win2unix(win_fn)

    @staticmethod
    def _is_inside(path: str, root: str, sep: str) -> bool:
        """Tell whether ``path`` is ``root`` itself or lies below it.

        A plain ``startswith(root)`` would also accept siblings that merely
        share a name prefix (``/proj2`` for root ``/proj``), so the match is
        required to end on a separator.
        """
        if path == root:
            return True
        return path.startswith(root.rstrip(sep) + sep)

    @staticmethod
    def _call_winepath_unix2win(fn: str) -> str:
        """Run ``winepath -w`` on ``fn`` and return its stripped output."""
        return subprocess.check_output(["winepath", "-w", fn], text=True).strip()

    @staticmethod
    def _call_winepath_win2unix(fn: str) -> str:
        """Run ``winepath`` on ``fn`` and return its stripped output."""
        return subprocess.check_output(["winepath", fn], text=True).strip()
def is_file_cpp(filename: str) -> bool: def is_file_cpp(filename: str) -> bool:
(basefile, ext) = os.path.splitext(filename) (_, ext) = os.path.splitext(filename)
return ext.lower() in ('.h', '.cpp') return ext.lower() in (".h", ".cpp")
def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]: def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
@ -12,10 +50,14 @@ def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
any C++ files found.""" any C++ files found."""
source = os.path.abspath(source) source = os.path.abspath(source)
for subdir, dirs, files in os.walk(source): for subdir, _, files in os.walk(source):
for file in files: for file in files:
if is_file_cpp(file): if is_file_cpp(file):
yield os.path.join(subdir, file) yield os.path.join(subdir, file)
if not recursive: if not recursive:
break break
def get_file_in_script_dir(fn):
    """Return the path of ``fn`` inside the directory of the running script.

    The script location is taken from ``sys.argv[0]`` and made absolute
    before its directory part is joined with ``fn``.
    """
    script_path = os.path.abspath(sys.argv[0])
    script_dir = os.path.dirname(script_path)
    return os.path.join(script_dir, fn)

View file

@ -7,7 +7,6 @@
OffsetMatch, OffsetMatch,
is_blank_or_comment, is_blank_or_comment,
match_offset_comment, match_offset_comment,
is_exact_offset_comment,
get_template_function_name, get_template_function_name,
remove_trailing_comment, remove_trailing_comment,
distinct_by_module, distinct_by_module,
@ -51,14 +50,16 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
# Our list of offset marks could have duplicates on # Our list of offset marks could have duplicates on
# module name, so we'll eliminate those now. # module name, so we'll eliminate those now.
for offset_match in distinct_by_module(offset_matches): for offset_match in distinct_by_module(offset_matches):
block = CodeBlock(offset=offset_match.address, block = CodeBlock(
offset=offset_match.address,
signature=function_sig, signature=function_sig,
start_line=start_line, start_line=start_line,
end_line=end_line, end_line=end_line,
offset_comment=offset_match.comment, offset_comment=offset_match.comment,
module=offset_match.module, module=offset_match.module,
is_template=offset_match.is_template, is_template=offset_match.is_template,
is_stub=offset_match.is_stub) is_stub=offset_match.is_stub,
)
blocks.append(block) blocks.append(block)
offset_matches = [] offset_matches = []
state = ReaderState.WANT_OFFSET state = ReaderState.WANT_OFFSET
@ -66,15 +67,18 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
if can_seek: if can_seek:
line_no += 1 line_no += 1
line = stream.readline() line = stream.readline()
if line == '': if line == "":
break break
new_match = match_offset_comment(line) new_match = match_offset_comment(line)
if new_match is not None: if new_match is not None:
# We will allow multiple offsets if we have just begun # We will allow multiple offsets if we have just begun
# the code block, but not after we hit the curly brace. # the code block, but not after we hit the curly brace.
if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE, if state in (
ReaderState.WANT_SIG): ReaderState.WANT_OFFSET,
ReaderState.IN_TEMPLATE,
ReaderState.WANT_SIG,
):
# If we detected an offset marker unexpectedly, # If we detected an offset marker unexpectedly,
# we are handling it here so we can continue seeking. # we are handling it here so we can continue seeking.
can_seek = True can_seek = True
@ -116,11 +120,10 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
# same line. clang-format should prevent this (BraceWrapping) # same line. clang-format should prevent this (BraceWrapping)
# but it is easy to detect. # but it is easy to detect.
# If the entire function is on one line, handle that too. # If the entire function is on one line, handle that too.
if function_sig.endswith('{'): if function_sig.endswith("{"):
start_line = line_no start_line = line_no
state = ReaderState.IN_FUNC state = ReaderState.IN_FUNC
elif (function_sig.endswith('}') or elif function_sig.endswith("}") or function_sig.endswith("};"):
function_sig.endswith('};')):
start_line = line_no start_line = line_no
end_line = line_no end_line = line_no
state = ReaderState.FUNCTION_DONE state = ReaderState.FUNCTION_DONE
@ -128,14 +131,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
state = ReaderState.WANT_CURLY state = ReaderState.WANT_CURLY
elif state == ReaderState.WANT_CURLY: elif state == ReaderState.WANT_CURLY:
if line.strip() == '{': if line.strip() == "{":
start_line = line_no start_line = line_no
state = ReaderState.IN_FUNC state = ReaderState.IN_FUNC
elif state == ReaderState.IN_FUNC: elif state == ReaderState.IN_FUNC:
# Naive but reasonable assumption that functions will end with # Naive but reasonable assumption that functions will end with
# a curly brace on its own line with no prepended spaces. # a curly brace on its own line with no prepended spaces.
if line.startswith('}'): if line.startswith("}"):
end_line = line_no end_line = line_no
state = ReaderState.FUNCTION_DONE state = ReaderState.FUNCTION_DONE

View file

@ -5,34 +5,49 @@
from collections import namedtuple from collections import namedtuple
CodeBlock = namedtuple('CodeBlock', CodeBlock = namedtuple(
['offset', 'signature', 'start_line', 'end_line', "CodeBlock",
'offset_comment', 'module', 'is_template', 'is_stub']) [
"offset",
"signature",
"start_line",
"end_line",
"offset_comment",
"module",
"is_template",
"is_stub",
],
)
OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template', OffsetMatch = namedtuple(
'is_stub', 'comment']) "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
)
# This has not been formally established, but considering that "STUB" # This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last, # is a temporary state for a function, we assume it will appear last,
# after any other modifiers (i.e. TEMPLATE) # after any other modifiers (i.e. TEMPLATE)
# To match a reasonable variance of formatting for the offset comment # To match a reasonable variance of formatting for the offset comment
offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?', # nopep8 offsetCommentRegex = re.compile(
flags=re.I) r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8
flags=re.I,
)
# To match the exact syntax (text upper case, hex lower case, with spaces) # To match the exact syntax (text upper case, hex lower case, with spaces)
# that is used in most places # that is used in most places
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$') # nopep8 offsetCommentExactRegex = re.compile(
r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
) # nopep8
# The goal here is to just read whatever is on the next line, so some # The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK # flexibility in the formatting seems OK
templateCommentRegex = re.compile(r'\s*//\s+(.*)') templateCommentRegex = re.compile(r"\s*//\s+(.*)")
# To remove any comment (//) or block comment (/*) and its leading spaces # To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line # from the end of a code line
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$') trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")
def get_template_function_name(line: str) -> str: def get_template_function_name(line: str) -> str:
@ -47,7 +62,7 @@ def get_template_function_name(line: str) -> str:
def remove_trailing_comment(line: str) -> str: def remove_trailing_comment(line: str) -> str:
return trailingCommentRegex.sub('', line) return trailingCommentRegex.sub("", line)
def is_blank_or_comment(line: str) -> bool: def is_blank_or_comment(line: str) -> bool:
@ -55,10 +70,12 @@ def is_blank_or_comment(line: str) -> bool:
There could be blank lines or other comments before the There could be blank lines or other comments before the
function signature, and we want to skip those.""" function signature, and we want to skip those."""
line_strip = line.strip() line_strip = line.strip()
return (len(line_strip) == 0 return (
or line_strip.startswith('//') len(line_strip) == 0
or line_strip.startswith('/*') or line_strip.startswith("//")
or line_strip.endswith('*/')) or line_strip.startswith("/*")
or line_strip.endswith("*/")
)
def is_exact_offset_comment(line: str) -> bool: def is_exact_offset_comment(line: str) -> bool:
@ -72,11 +89,13 @@ def match_offset_comment(line: str) -> OffsetMatch | None:
if match is None: if match is None:
return None return None
return OffsetMatch(module=match.group(1), return OffsetMatch(
module=match.group(1),
address=int(match.group(2), 16), address=int(match.group(2), 16),
is_template=match.group(3) is not None, is_template=match.group(3) is not None,
is_stub=match.group(4) is not None, is_stub=match.group(4) is not None,
comment=line.strip()) comment=line.strip(),
)
def distinct_by_module(offsets: List) -> List: def distinct_by_module(offsets: List) -> List:

View file

@ -0,0 +1,138 @@
import os
import subprocess
from .utils import get_file_in_script_dir
class RecompiledInfo:
    """Record describing one function symbol of the recompiled binary.

    Instances are created empty; SymInfo fills in the fields while it
    parses the cvdump output.
    """

    # addr:  symbol address with the recompiled file's imagebase and
    #        textvirt added
    # size:  function size parsed from the symbol line
    # name:  function name parsed from the symbol line
    # start: start offset (SymInfo currently stores 0)
    addr = size = name = start = None
# Declare a class that parses the output of cvdump for fast access later
class SymInfo:
    """Function symbols and line tables parsed from cvdump's dump of a PDB.

    Attributes:
        funcs: RecompiledInfo objects keyed by the address field parsed from
            each S_GPROC32 symbol line.
        lines: for every source path of the LINES section, a dict mapping a
            line number to the first address listed for that line.
        names: the same RecompiledInfo objects keyed by function name.
    """

    def __init__(self, pdb, sym_recompfile, sym_logger, sym_wine_path_converter=None):
        """Run cvdump on ``pdb`` and index its SYMBOLS and LINES sections.

        Args:
            pdb: path of the PDB file to dump.
            sym_recompfile: object exposing ``imagebase`` and ``textvirt``;
                both are added to every symbol address.
            sym_logger: logger used for progress and error messages.
            sym_wine_path_converter: optional converter; when given, cvdump
                is started through wine and paths are translated with it.

        Raises:
            subprocess.CalledProcessError: if cvdump exits with an error.
        """
        self.logger = sym_logger
        # These tables must be per-instance: as class attributes they were
        # shared (and accumulated) across every SymInfo ever created.
        self.funcs = {}
        self.lines = {}
        self.names = {}

        call = [get_file_in_script_dir("cvdump.exe"), "-l", "-s"]

        if sym_wine_path_converter:
            # Run cvdump through wine and convert path to Windows-friendly wine path
            call.insert(0, "wine")
            call.append(sym_wine_path_converter.get_wine_path(pdb))
        else:
            call.append(pdb)

        self.logger.info("Parsing %s ...", pdb)
        self.logger.debug("Command = %s", call)
        line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")

        current_section = None

        self.logger.debug("Parsing output of cvdump.exe ...")
        for i, line in enumerate(line_dump):
            if line.startswith("***"):
                current_section = line[4:]

            if current_section == "SYMBOLS" and "S_GPROC32" in line:
                self._add_symbol(line_dump, i, sym_recompfile)
            elif (
                current_section == "LINES"
                # A source-file header row is indented by two spaces but not
                # three; deeper rows are the line/address pairs read below.
                # NOTE(review): confirm the exact widths against cvdump output.
                and line.startswith("  ")
                and not line.startswith("   ")
            ):
                self._add_line_table(line_dump, i, sym_wine_path_converter)

        self.logger.debug("... Parsing output of cvdump.exe finished")

    def _add_symbol(self, line_dump, i, sym_recompfile):
        """Record the S_GPROC32 symbol found on ``line_dump[i]``."""
        line = line_dump[i]
        sym_addr = int(line[26:34], 16)

        info = RecompiledInfo()
        info.addr = sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt

        # Developer toggle: take start/size from the debug offsets two rows
        # below the symbol instead of from the symbol row itself.
        use_dbg_offs = False
        if use_dbg_offs:
            debug_offs = line_dump[i + 2]
            debug_start = int(debug_offs[22:30], 16)
            debug_end = int(debug_offs[43:], 16)

            info.start = debug_start
            info.size = debug_end - debug_start
        else:
            info.start = 0
            info.size = int(line[41:49], 16)

        info.name = line[77:]

        self.names[info.name] = info
        self.funcs[sym_addr] = info

    def _add_line_table(self, line_dump, i, sym_wine_path_converter):
        """Record the line/address pairs listed under the header ``line_dump[i]``."""
        sourcepath = line_dump[i].split()[0]

        if sym_wine_path_converter:
            # Convert filename to Unix path for file compare
            sourcepath = sym_wine_path_converter.get_unix_path(sourcepath)

        table = self.lines.setdefault(sourcepath, {})

        # The pairs start two rows below the header and run until the first
        # blank row (or the end of the dump).
        j = i + 2
        while j < len(line_dump):
            ll = line_dump[j].split()
            if not ll:
                break

            for k in range(0, len(ll), 2):
                linenum = int(ll[k])
                address = int(ll[k + 1], 16)
                # Keep the first address seen for a line number.
                table.setdefault(linenum, address)
            j += 1

    def get_recompiled_address(self, filename, line):
        """Return the RecompiledInfo of the function at ``filename``:``line``.

        Returns None (after logging an error) when the file/line pair is not
        in the line tables or its address has no function symbol.
        """
        recompiled_addr = None

        self.logger.debug("Looking for %s:%s", filename, line)
        filename_basename = os.path.basename(filename).lower()

        for fn in self.lines:
            # Sometimes a PDB is compiled with a relative path while we always have
            # an absolute path. Therefore we must check whether a recorded path
            # with the same base name refers to the same file on disk.
            try:
                if os.path.basename(
                    fn
                ).lower() == filename_basename and os.path.samefile(fn, filename):
                    filename = fn
                    break
            except FileNotFoundError:
                continue

        if filename in self.lines and line in self.lines[filename]:
            recompiled_addr = self.lines[filename][line]

            if recompiled_addr in self.funcs:
                return self.funcs[recompiled_addr]
            self.logger.error(
                "Failed to find function symbol with address: %x", recompiled_addr
            )
            return None
        self.logger.error(
            "Failed to find function symbol with filename and line: %s:%s",
            filename,
            line,
        )
        return None

    def get_recompiled_address_from_name(self, name):
        """Return the RecompiledInfo registered under ``name``, or None."""
        self.logger.debug("Looking for %s", name)

        if name in self.names:
            return self.names[name]
        self.logger.error("Failed to find function symbol with name: %s", name)
        return None

View file

@ -0,0 +1,42 @@
import os
import sys
import colorama
def print_diff(udiff, plain):
    """Print the content lines of a unified diff and report whether it had any lines.

    Lines starting with "++", "@@" or "--" are left out of the brief view.
    Unless ``plain`` is true, added lines are printed green and removed
    lines red, and the colour is reset after every printed line.

    Returns True when ``udiff`` yielded at least one line, printed or not.
    """
    hidden_prefixes = ("++", "@@", "--")
    saw_line = False

    for entry in udiff:
        saw_line = True

        # Skip unneeded parts of the diff for the brief view
        if entry.startswith(hidden_prefixes):
            continue

        if plain:
            print(entry)
            continue

        # Pick the colour for this line, print it, then reset the colour
        if entry.startswith("+"):
            tint = colorama.Fore.GREEN
        elif entry.startswith("-"):
            tint = colorama.Fore.RED
        else:
            tint = ""
        print(tint + entry)
        print(colorama.Style.RESET_ALL, end="")

    return saw_line
def get_file_in_script_dir(fn):
    """Build the path of ``fn`` next to the script named by ``sys.argv[0]``."""
    script_dir, _script_name = os.path.split(os.path.abspath(sys.argv[0]))
    return os.path.join(script_dir, fn)
class OffsetPlaceholderGenerator:
    """Hand out numbered ``<OFFSETn>`` placeholders, one per distinct address.

    Attributes:
        counter: number of placeholders created so far.
        replacements: mapping from address to the placeholder issued for it.
    """

    def __init__(self):
        self.counter = 0
        self.replacements = {}

    def get(self, replace_addr):
        """Return the placeholder for ``replace_addr``, creating it on first use."""
        try:
            return self.replacements[replace_addr]
        except KeyError:
            pass

        # First time this address is seen: issue the next number.
        self.counter += 1
        placeholder = f"<OFFSET{self.counter}>"
        self.replacements[replace_addr] = placeholder
        return placeholder

View file

@ -1,9 +1,9 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
setup( setup(
name='isledecomp', name="isledecomp",
version='0.1.0', version="0.1.0",
description='Python tools for the isledecomp project', description="Python tools for the isledecomp project",
packages=find_packages(), packages=find_packages(),
tests_require=['pytest'], tests_require=["pytest"],
) )

View file

@ -1,17 +1,16 @@
import os import os
import pytest
from typing import List, TextIO from typing import List, TextIO
from isledecomp.parser import find_code_blocks from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock from isledecomp.parser.util import CodeBlock
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples') SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
def sample_file(filename: str) -> TextIO: def sample_file(filename: str) -> TextIO:
"""Wrapper for opening the samples from the directory that does not """Wrapper for opening the samples from the directory that does not
depend on the cwd where we run the test""" depend on the cwd where we run the test"""
full_path = os.path.join(SAMPLE_DIR, filename) full_path = os.path.join(SAMPLE_DIR, filename)
return open(full_path, 'r') return open(full_path, "r", encoding="utf-8")
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool: def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
@ -25,7 +24,7 @@ def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
def test_sanity(): def test_sanity():
"""Read a very basic file""" """Read a very basic file"""
with sample_file('basic_file.cpp') as f: with sample_file("basic_file.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
assert len(blocks) == 3 assert len(blocks) == 3
@ -39,7 +38,7 @@ def test_sanity():
def test_oneline(): def test_oneline():
"""(Assuming clang-format permits this) This sample has a function """(Assuming clang-format permits this) This sample has a function
on a single line. This will test the end-of-function detection""" on a single line. This will test the end-of-function detection"""
with sample_file('oneline_function.cpp') as f: with sample_file("oneline_function.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
assert len(blocks) == 2 assert len(blocks) == 2
@ -49,7 +48,7 @@ def test_oneline():
def test_missing_offset(): def test_missing_offset():
"""What if the function doesn't have an offset comment?""" """What if the function doesn't have an offset comment?"""
with sample_file('missing_offset.cpp') as f: with sample_file("missing_offset.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
# TODO: For now, the function without the offset will just be ignored. # TODO: For now, the function without the offset will just be ignored.
@ -62,7 +61,7 @@ def test_jumbled_case():
"""The parser just reports what it sees. It is the responsibility of """The parser just reports what it sees. It is the responsibility of
the downstream tools to do something about a jumbled file. the downstream tools to do something about a jumbled file.
Just verify that we are reading it correctly.""" Just verify that we are reading it correctly."""
with sample_file('out_of_order.cpp') as f: with sample_file("out_of_order.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
assert len(blocks) == 3 assert len(blocks) == 3
@ -70,7 +69,7 @@ def test_jumbled_case():
def test_bad_file(): def test_bad_file():
with sample_file('poorly_formatted.cpp') as f: with sample_file("poorly_formatted.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
assert len(blocks) == 3 assert len(blocks) == 3
@ -78,7 +77,7 @@ def test_bad_file():
def test_indented(): def test_indented():
"""Offsets for functions inside of a class will probably be indented.""" """Offsets for functions inside of a class will probably be indented."""
with sample_file('basic_class.cpp') as f: with sample_file("basic_class.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
# TODO: We don't properly detect the end of these functions # TODO: We don't properly detect the end of these functions
@ -87,17 +86,17 @@ def test_indented():
# all the functions that are there. # all the functions that are there.
assert len(blocks) == 2 assert len(blocks) == 2
assert blocks[0].offset == int('0x12345678', 16) assert blocks[0].offset == int("0x12345678", 16)
assert blocks[0].start_line == 15 assert blocks[0].start_line == 15
# assert blocks[0].end_line == 18 # assert blocks[0].end_line == 18
assert blocks[1].offset == int('0xdeadbeef', 16) assert blocks[1].offset == int("0xdeadbeef", 16)
assert blocks[1].start_line == 22 assert blocks[1].start_line == 22
# assert blocks[1].end_line == 24 # assert blocks[1].end_line == 24
def test_inline(): def test_inline():
with sample_file('inline.cpp') as f: with sample_file("inline.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
assert len(blocks) == 2 assert len(blocks) == 2
@ -110,19 +109,19 @@ def test_multiple_offsets():
"""If multiple offset marks appear before for a code block, take them """If multiple offset marks appear before for a code block, take them
all but ensure module name (case-insensitive) is distinct. all but ensure module name (case-insensitive) is distinct.
Use first module occurrence in case of duplicates.""" Use first module occurrence in case of duplicates."""
with sample_file('multiple_offsets.cpp') as f: with sample_file("multiple_offsets.cpp") as f:
blocks = find_code_blocks(f) blocks = find_code_blocks(f)
assert len(blocks) == 4 assert len(blocks) == 4
assert blocks[0].module == 'TEST' assert blocks[0].module == "TEST"
assert blocks[0].start_line == 9 assert blocks[0].start_line == 9
assert blocks[1].module == 'HELLO' assert blocks[1].module == "HELLO"
assert blocks[1].start_line == 9 assert blocks[1].start_line == 9
# Duplicate modules are ignored # Duplicate modules are ignored
assert blocks[2].start_line == 16 assert blocks[2].start_line == 16
assert blocks[2].offset == 0x2345 assert blocks[2].offset == 0x2345
assert blocks[3].module == 'TEST' assert blocks[3].module == "TEST"
assert blocks[3].offset == 0x2002 assert blocks[3].offset == 0x2002

View file

@ -1,6 +1,6 @@
import pytest
from collections import namedtuple from collections import namedtuple
from typing import List from typing import List
import pytest
from isledecomp.parser.util import ( from isledecomp.parser.util import (
is_blank_or_comment, is_blank_or_comment,
match_offset_comment, match_offset_comment,
@ -10,21 +10,20 @@
blank_or_comment_param = [ blank_or_comment_param = [
(True, ''), (True, ""),
(True, '\t'), (True, "\t"),
(True, ' '), (True, " "),
(False, '\tint abc=123;'), (False, "\tint abc=123;"),
(True, '// OFFSET: LEGO1 0xdeadbeef'), (True, "// OFFSET: LEGO1 0xdeadbeef"),
(True, ' /* Block comment beginning'), (True, " /* Block comment beginning"),
(True, 'Block comment ending */ '), (True, "Block comment ending */ "),
# TODO: does clang-format have anything to say about these cases? # TODO: does clang-format have anything to say about these cases?
(False, 'x++; // Comment folows'), (False, "x++; // Comment folows"),
(False, 'x++; /* Block comment begins'), (False, "x++; /* Block comment begins"),
] ]
@pytest.mark.parametrize('expected, line', blank_or_comment_param) @pytest.mark.parametrize("expected, line", blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool): def test_is_blank_or_comment(line: str, expected: bool):
assert is_blank_or_comment(line) is expected assert is_blank_or_comment(line) is expected
@ -32,82 +31,73 @@ def test_is_blank_or_comment(line: str, expected: bool):
offset_comment_samples = [ offset_comment_samples = [
# (can_parse: bool, exact_match: bool, line: str) # (can_parse: bool, exact_match: bool, line: str)
# Should match both expected modules with optional STUB marker # Should match both expected modules with optional STUB marker
(True, True, '// OFFSET: LEGO1 0xdeadbeef'), (True, True, "// OFFSET: LEGO1 0xdeadbeef"),
(True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'), (True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"),
(True, True, '// OFFSET: ISLE 0x12345678'), (True, True, "// OFFSET: ISLE 0x12345678"),
(True, True, '// OFFSET: ISLE 0x12345678 STUB'), (True, True, "// OFFSET: ISLE 0x12345678 STUB"),
# No trailing spaces allowed # No trailing spaces allowed
(True, False, '// OFFSET: LEGO1 0xdeadbeef '), (True, False, "// OFFSET: LEGO1 0xdeadbeef "),
(True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '), (True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "),
# Must have exactly one space between elements # Must have exactly one space between elements
(True, False, '//OFFSET: ISLE 0xdeadbeef'), (True, False, "//OFFSET: ISLE 0xdeadbeef"),
(True, False, '// OFFSET:ISLE 0xdeadbeef'), (True, False, "// OFFSET:ISLE 0xdeadbeef"),
(True, False, '// OFFSET: ISLE 0xdeadbeef'), (True, False, "// OFFSET: ISLE 0xdeadbeef"),
(True, False, '// OFFSET: ISLE 0xdeadbeef'), (True, False, "// OFFSET: ISLE 0xdeadbeef"),
(True, False, '// OFFSET: ISLE 0xdeadbeef'), (True, False, "// OFFSET: ISLE 0xdeadbeef"),
(True, False, '// OFFSET: ISLE 0xdeadbeef STUB'), (True, False, "// OFFSET: ISLE 0xdeadbeef STUB"),
# Must have 0x prefix for hex number # Must have 0x prefix for hex number
(True, False, '// OFFSET: ISLE deadbeef'), (True, False, "// OFFSET: ISLE deadbeef"),
# Offset, module name, and STUB must be uppercase # Offset, module name, and STUB must be uppercase
(True, False, '// offset: ISLE 0xdeadbeef'), (True, False, "// offset: ISLE 0xdeadbeef"),
(True, False, '// offset: isle 0xdeadbeef'), (True, False, "// offset: isle 0xdeadbeef"),
(True, False, '// OFFSET: LEGO1 0xdeadbeef stub'), (True, False, "// OFFSET: LEGO1 0xdeadbeef stub"),
# Hex string must be lowercase # Hex string must be lowercase
(True, False, '// OFFSET: ISLE 0xDEADBEEF'), (True, False, "// OFFSET: ISLE 0xDEADBEEF"),
# TODO: How flexible should we be with matching the module name? # TODO: How flexible should we be with matching the module name?
(True, True, '// OFFSET: OMNI 0x12345678'), (True, True, "// OFFSET: OMNI 0x12345678"),
(True, True, '// OFFSET: LEG01 0x12345678'), (True, True, "// OFFSET: LEG01 0x12345678"),
(True, False, '// OFFSET: hello 0x12345678'), (True, False, "// OFFSET: hello 0x12345678"),
# Not close enough to match # Not close enough to match
(False, False, '// OFFSET: ISLE0x12345678'), (False, False, "// OFFSET: ISLE0x12345678"),
(False, False, '// OFFSET: 0x12345678'), (False, False, "// OFFSET: 0x12345678"),
(False, False, '// LEGO1: 0x12345678'), (False, False, "// LEGO1: 0x12345678"),
# Hex string shorter than 8 characters # Hex string shorter than 8 characters
(True, True, '// OFFSET: LEGO1 0x1234'), (True, True, "// OFFSET: LEGO1 0x1234"),
# TODO: These match but shouldn't. # TODO: These match but shouldn't.
# (False, False, '// OFFSET: LEGO1 0'), # (False, False, '// OFFSET: LEGO1 0'),
# (False, False, '// OFFSET: LEGO1 0x'), # (False, False, '// OFFSET: LEGO1 0x'),
] ]
@pytest.mark.parametrize('match, exact, line', offset_comment_samples) @pytest.mark.parametrize("match, _, line", offset_comment_samples)
def test_offset_match(line: str, match: bool, exact): def test_offset_match(line: str, match: bool, _):
did_match = match_offset_comment(line) is not None did_match = match_offset_comment(line) is not None
assert did_match is match assert did_match is match
@pytest.mark.parametrize('match, exact, line', offset_comment_samples) @pytest.mark.parametrize("_, exact, line", offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match): def test_exact_offset_comment(line: str, exact: bool, _):
assert is_exact_offset_comment(line) is exact assert is_exact_offset_comment(line) is exact
# Helper for the next test: cut down version of OffsetMatch # Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value']) MiniOfs = namedtuple("MiniOfs", ["module", "value"])
distinct_by_module_samples = [ distinct_by_module_samples = [
# empty set # empty set
([], []), ([], []),
# same module name # same module name
([MiniOfs('TEST', 123), MiniOfs('TEST', 555)], ([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]),
[MiniOfs('TEST', 123)]),
# same module name, case-insensitive # same module name, case-insensitive
([MiniOfs('test', 123), MiniOfs('TEST', 555)], ([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]),
[MiniOfs('test', 123)]),
# duplicates, non-consecutive # duplicates, non-consecutive
([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)], (
[MiniOfs('test', 123), MiniOfs('abc', 111)]), [MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)],
[MiniOfs("test", 123), MiniOfs("abc", 111)],
),
] ]
@pytest.mark.parametrize('sample, expected', distinct_by_module_samples) @pytest.mark.parametrize("sample, expected", distinct_by_module_samples)
def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]): def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
assert distinct_by_module(sample) == expected assert distinct_by_module(sample) == expected

View file

@ -2,284 +2,51 @@
import argparse import argparse
import base64 import base64
from capstone import *
import difflib import difflib
import struct import json
import subprocess
import logging import logging
import os import os
import sys
import colorama
import json
import re import re
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks from isledecomp import (
Bin,
find_code_blocks,
get_file_in_script_dir,
OffsetPlaceholderGenerator,
print_diff,
SymInfo,
walk_source_dir,
WinePathConverter,
)
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
import colorama
from pystache import Renderer from pystache import Renderer
parser = argparse.ArgumentParser(allow_abbrev=False, REGISTER_LIST = set(
description='Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.') [
parser.add_argument('original', metavar='original-binary', help='The original binary') "ax",
parser.add_argument('recompiled', metavar='recompiled-binary', help='The recompiled binary') "bp",
parser.add_argument('pdb', metavar='recompiled-pdb', help='The PDB of the recompiled binary') "bx",
parser.add_argument('decomp_dir', metavar='decomp-dir', help='The decompiled source tree') "cx",
parser.add_argument('--total', '-T', metavar='<count>', help='Total number of expected functions (improves total accuracy statistic)') "di",
parser.add_argument('--verbose', '-v', metavar='<offset>', help='Print assembly diff for specific function (original file\'s offset)') "dx",
parser.add_argument('--html', '-H', metavar='<file>', help='Generate searchable HTML summary of status and diffs') "eax",
parser.add_argument('--no-color', '-n', action='store_true', help='Do not color the output') "ebp",
parser.add_argument('--svg', '-S', metavar='<file>', help='Generate SVG graphic of progress') "ebx",
parser.add_argument('--svg-icon', metavar='icon', help='Icon to use in SVG (PNG)') "ecx",
parser.add_argument('--print-rec-addr', action='store_true', help='Print addresses of recompiled functions too') "edi",
"edx",
"esi",
"esp",
"si",
"sp",
]
)
WORDS = re.compile(r"\w+")
parser.set_defaults(loglevel=logging.INFO)
parser.add_argument('--debug', action='store_const', const=logging.DEBUG, dest='loglevel', help='Print script debug information')
args = parser.parse_args() def sanitize(file, placeholder_generator, mnemonic, op_str):
logging.basicConfig(level=args.loglevel, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)
colorama.init()
verbose = None
found_verbose_target = False
if args.verbose:
try:
verbose = int(args.verbose, 16)
except ValueError:
parser.error('invalid verbose argument')
html_path = args.html
plain = args.no_color
original = args.original
if not os.path.isfile(original):
parser.error(f'Original binary {original} does not exist')
recomp = args.recompiled
if not os.path.isfile(recomp):
parser.error(f'Recompiled binary {recomp} does not exist')
syms = args.pdb
if not os.path.isfile(syms):
parser.error(f'Symbols PDB {syms} does not exist')
source = args.decomp_dir
if not os.path.isdir(source):
parser.error(f'Source directory {source} does not exist')
svg = args.svg
# Declare a class that can automatically convert virtual executable addresses
# to file addresses
class Bin:
def __init__(self, filename):
logger.debug(f'Parsing headers of "{filename}"... ')
self.file = open(filename, 'rb')
#HACK: Strictly, we should be parsing the header, but we know where
# everything is in these two files so we just jump straight there
# Read ImageBase
self.file.seek(0xB4)
self.imagebase, = struct.unpack('<i', self.file.read(4))
# Read .text VirtualAddress
self.file.seek(0x184)
self.textvirt, = struct.unpack('<i', self.file.read(4))
# Read .text PointerToRawData
self.file.seek(0x18C)
self.textraw, = struct.unpack('<i', self.file.read(4))
logger.debug('... Parsing finished')
def __del__(self):
if self.file:
self.file.close()
def get_addr(self, virt):
return virt - self.imagebase - self.textvirt + self.textraw
def read(self, offset, size):
self.file.seek(self.get_addr(offset))
return self.file.read(size)
class RecompiledInfo:
def __init__(self):
self.addr = None
self.size = None
self.name = None
self.start = None
class WinePathConverter:
    """Convert paths between the Unix filesystem and wine's Windows view of it.

    Paths that are relative ('./...') or that lie under the directory given at
    construction are converted with string operations alone; every other path
    is handed to the external ``winepath`` tool.
    """

    def __init__(self, unix_cwd):
        """Remember `unix_cwd` and ask winepath for its Windows equivalent."""
        self.unix_cwd = unix_cwd
        self.win_cwd = self._call_winepath_unix2win(self.unix_cwd)

    def get_wine_path(self, unix_fn: str) -> str:
        """Return the Windows (wine) path for the Unix path `unix_fn`."""
        if unix_fn.startswith('./'):
            return self.win_cwd + '\\' + unix_fn[2:].replace('/', '\\')
        remainder = self._relative_to(unix_fn, self.unix_cwd, '/')
        if remainder is not None:
            return self.win_cwd + '\\' + remainder.replace('/', '\\').lstrip('\\')
        return self._call_winepath_unix2win(unix_fn)

    def get_unix_path(self, win_fn: str) -> str:
        """Return the Unix path for the Windows (wine) path `win_fn`."""
        if win_fn.startswith(('.\\', './')):
            return self.unix_cwd + '/' + win_fn[2:].replace('\\', '/')
        remainder = self._relative_to(win_fn, self.win_cwd, '\\/')
        if remainder is not None:
            # Strip the leading separator, as get_wine_path does, so the
            # result does not contain a doubled '/'.
            return self.unix_cwd + '/' + remainder.replace('\\', '/').lstrip('/')
        return self._call_winepath_win2unix(win_fn)

    @staticmethod
    def _relative_to(path: str, base: str, separators: str):
        """Return the part of `path` after `base`, or None if `path` is not inside it.

        A plain prefix match is not enough: '/a/proj2/x' starts with '/a/proj'
        but is not inside it, so the match must end on a separator.
        """
        if not path.startswith(base):
            return None
        remainder = path[len(base):]
        ends_on_boundary = (
            not remainder
            or not base
            or base[-1] in separators
            or remainder[0] in separators
        )
        return remainder if ends_on_boundary else None

    @staticmethod
    def _call_winepath_unix2win(fn: str) -> str:
        """Run ``winepath -w`` on `fn` and return its stripped output."""
        return subprocess.check_output(['winepath', '-w', fn], text=True).strip()

    @staticmethod
    def _call_winepath_win2unix(fn: str) -> str:
        """Run ``winepath`` on `fn` and return its stripped output."""
        return subprocess.check_output(['winepath', fn], text=True).strip()
def get_file_in_script_dir(fn):
    """Return the path of `fn` inside the directory containing the running script.

    The script location is sys.argv[0], made absolute before its directory
    part is taken.
    """
    script_path = os.path.abspath(sys.argv[0])
    script_dir = os.path.dirname(script_path)
    return os.path.join(script_dir, fn)
# Declare a class that parses the output of cvdump for fast access later
class SymInfo:
    """Lookup tables built from the output of ``cvdump.exe -l -s`` for a PDB.

    Attributes (created per instance by __init__):
        funcs: address as printed on a S_GPROC32 line -> RecompiledInfo
        names: function name -> RecompiledInfo
        lines: source path -> {line number: address}
    """

    def __init__(self, pdb, file, wine_path_converter):
        """Run cvdump on `pdb` and index its SYMBOLS and LINES sections.

        Args:
            pdb: path of the PDB file to dump.
            file: object exposing `imagebase` and `textvirt`; they are added to
                each symbol's address to form RecompiledInfo.addr.
            wine_path_converter: when truthy, cvdump is run through wine and
                paths are converted with this object; otherwise cvdump is
                executed directly.

        Raises:
            subprocess.CalledProcessError: if cvdump exits with an error.
        """
        # Per-instance tables: class-level dicts would be shared by every SymInfo.
        self.funcs = {}
        self.lines = {}
        self.names = {}

        call = [get_file_in_script_dir('cvdump.exe'), '-l', '-s']

        if wine_path_converter:
            # Run cvdump through wine and convert path to Windows-friendly wine path
            call.insert(0, 'wine')
            call.append(wine_path_converter.get_wine_path(pdb))
        else:
            call.append(pdb)

        logger.info(f'Parsing {pdb} ...')
        logger.debug(f'Command = {call}')
        line_dump = subprocess.check_output(call).decode('utf-8').split('\r\n')

        current_section = None

        logger.debug('Parsing output of cvdump.exe ...')
        for i, line in enumerate(line_dump):
            if line.startswith('***'):
                current_section = line[4:]

            if current_section == 'SYMBOLS' and 'S_GPROC32' in line:
                self._add_function(line_dump, i, file)
            elif (current_section == 'LINES'
                  and line.startswith('  ')
                  and not line.startswith('   ')):
                # NOTE(review): the indent widths in this test (exactly two
                # spaces for a per-file header line) were reconstructed from
                # whitespace-collapsed text - confirm against real cvdump output.
                self._add_lines(line_dump, i, wine_path_converter)

        logger.debug('... Parsing output of cvdump.exe finished')

    def _add_function(self, line_dump, index, file):
        """Record the function described by the S_GPROC32 line at `index`."""
        line = line_dump[index]
        addr = int(line[26:34], 16)

        info = RecompiledInfo()
        info.addr = addr + file.imagebase + file.textvirt

        # Switch kept from the original code: when enabled, the range is taken
        # from the line two below the symbol instead of the symbol's own size.
        use_dbg_offs = False
        if use_dbg_offs:
            debug_offs = line_dump[index + 2]
            debug_start = int(debug_offs[22:30], 16)
            debug_end = int(debug_offs[43:], 16)

            info.start = debug_start
            info.size = debug_end - debug_start
        else:
            info.start = 0
            info.size = int(line[41:49], 16)

        info.name = line[77:]

        self.names[info.name] = info
        self.funcs[addr] = info

    def _add_lines(self, line_dump, index, wine_path_converter):
        """Record the line/address pairs that follow the file header at `index`."""
        sourcepath = line_dump[index].split()[0]

        if wine_path_converter:
            # Convert filename to Unix path for file compare
            sourcepath = wine_path_converter.get_unix_path(sourcepath)

        line_table = self.lines.setdefault(sourcepath, {})

        # The pairs start two lines below the header and stop at the first
        # blank line; the bound check covers a dump that ends without one.
        j = index + 2
        while j < len(line_dump):
            fields = line_dump[j].split()
            if not fields:
                break

            # Each row is a flat sequence "line addr line addr ...".
            for linenum_text, address_text in zip(fields[0::2], fields[1::2]):
                # The first address seen for a line number wins.
                line_table.setdefault(int(linenum_text), int(address_text, 16))

            j += 1

    def get_recompiled_address(self, filename, line):
        """Return the RecompiledInfo whose address is recorded for `filename`:`line`.

        Logs an error and returns None when either the line or the function at
        its address is unknown.
        """
        logger.debug(f'Looking for {filename}:{line}')
        filename_basename = os.path.basename(filename).lower()

        for fn in self.lines:
            # Sometimes a PDB is compiled with a relative path while we always have
            # an absolute path. Therefore we must compare the files themselves
            # (samefile) rather than the path strings.
            try:
                if (os.path.basename(fn).lower() == filename_basename and
                        os.path.samefile(fn, filename)):
                    filename = fn
                    break
            except FileNotFoundError:
                continue

        # Look up by `filename`, not by whatever the loop variable was left at.
        addr = self.lines.get(filename, {}).get(line)
        if addr is None:
            logger.error(f'Failed to find function symbol with filename and line: {filename}:{line}')
            return None

        if addr in self.funcs:
            return self.funcs[addr]

        logger.error(f'Failed to find function symbol with address: 0x{addr:x}')
        return None

    def get_recompiled_address_from_name(self, name):
        """Return the RecompiledInfo named `name`, or None (after logging an error)."""
        logger.debug('Looking for %s', name)

        info = self.names.get(name)
        if info is None:
            logger.error(f'Failed to find function symbol with name: {name}')
        return info
# Off Windows a path converter is created; SymInfo then runs cvdump through wine.
wine_path_converter = WinePathConverter(source) if os.name != 'nt' else None

# Open both binaries, then index the symbols of the recompiled one.
origfile, recompfile = Bin(original), Bin(recomp)
syminfo = SymInfo(syms, recompfile, wine_path_converter)

print()

# 32-bit x86 capstone disassembler.
md = Cs(CS_ARCH_X86, CS_MODE_32)
class OffsetPlaceholderGenerator:
    """Hand out '<OFFSETn>' placeholders, one per distinct address.

    The first new address gets '<OFFSET1>', the next '<OFFSET2>', and so on;
    asking again for a known address returns the placeholder it already has.
    """

    def __init__(self):
        self.counter = 0
        self.replacements = {}

    def get(self, addr):
        """Return the placeholder for `addr`, creating it on first use."""
        try:
            return self.replacements[addr]
        except KeyError:
            pass

        self.counter += 1
        placeholder = f'<OFFSET{self.counter}>'
        self.replacements[addr] = placeholder
        return placeholder
def sanitize(file, placeholderGenerator, mnemonic, op_str):
op_str_is_number = False op_str_is_number = False
try: try:
int(op_str, 16) int(op_str, 16)
@ -287,76 +54,61 @@ def sanitize(file, placeholderGenerator, mnemonic, op_str):
except ValueError: except ValueError:
pass pass
if (mnemonic == 'call' or mnemonic == 'jmp') and op_str_is_number: if (mnemonic in ["call", "jmp"]) and op_str_is_number:
# Filter out "calls" because the offsets we're not currently trying to # Filter out "calls" because the offsets we're not currently trying to
# match offsets. As long as there's a call in the right place, it's # match offsets. As long as there's a call in the right place, it's
# probably accurate. # probably accurate.
op_str = placeholderGenerator.get(int(op_str, 16)) op_str = placeholder_generator.get(int(op_str, 16))
else: else:
def filter_out_ptr(ptype, op_str): def filter_out_ptr(ptype, op_str):
try: try:
ptrstr = ptype + ' ptr [' ptrstr = ptype + " ptr ["
start = op_str.index(ptrstr) + len(ptrstr) start = op_str.index(ptrstr) + len(ptrstr)
end = op_str.index(']', start) end = op_str.index("]", start)
# This will throw ValueError if not hex # This will throw ValueError if not hex
inttest = int(op_str[start:end], 16) inttest = int(op_str[start:end], 16)
return op_str[0:start] + placeholderGenerator.get(inttest) + op_str[end:] return (
op_str[0:start] + placeholder_generator.get(inttest) + op_str[end:]
)
except ValueError: except ValueError:
return op_str return op_str
# Filter out dword ptrs where the pointer is to an offset # Filter out dword ptrs where the pointer is to an offset
op_str = filter_out_ptr('dword', op_str) op_str = filter_out_ptr("dword", op_str)
op_str = filter_out_ptr('word', op_str) op_str = filter_out_ptr("word", op_str)
op_str = filter_out_ptr('byte', op_str) op_str = filter_out_ptr("byte", op_str)
# Use heuristics to filter out any args that look like offsets # Use heuristics to filter out any args that look like offsets
words = op_str.split(' ') words = op_str.split(" ")
for i, word in enumerate(words): for i, word in enumerate(words):
try: try:
inttest = int(word, 16) inttest = int(word, 16)
if inttest >= file.imagebase + file.textvirt: if inttest >= file.imagebase + file.textvirt:
words[i] = placeholderGenerator.get(inttest) words[i] = placeholder_generator.get(inttest)
except ValueError: except ValueError:
pass pass
op_str = ' '.join(words) op_str = " ".join(words)
return mnemonic, op_str return mnemonic, op_str
def parse_asm(file, addr, size):
def parse_asm(disassembler, file, asm_addr, size):
asm = [] asm = []
data = file.read(addr, size) data = file.read(asm_addr, size)
placeholderGenerator = OffsetPlaceholderGenerator() placeholder_generator = OffsetPlaceholderGenerator()
for i in md.disasm(data, 0): for i in disassembler.disasm(data, 0):
# Use heuristics to disregard some differences that aren't representative # Use heuristics to disregard some differences that aren't representative
# of the accuracy of a function (e.g. global offsets) # of the accuracy of a function (e.g. global offsets)
mnemonic, op_str = sanitize(file, placeholderGenerator, i.mnemonic, i.op_str) mnemonic, op_str = sanitize(file, placeholder_generator, i.mnemonic, i.op_str)
if op_str is None: if op_str is None:
asm.append(mnemonic) asm.append(mnemonic)
else: else:
asm.append(f'{mnemonic} {op_str}') asm.append(f"{mnemonic} {op_str}")
return asm return asm
# x86 register names (32-bit and 16-bit forms) that the register-swap check
# accepts as replacement candidates.
REGISTER_LIST = {
    'eax', 'ebx', 'ecx', 'edx',
    'esi', 'edi', 'ebp', 'esp',
    'ax', 'bx', 'cx', 'dx',
    'si', 'di', 'bp', 'sp',
}

# Matches each maximal run of word characters.
WORDS = re.compile(r'\w+')
def get_registers(line: str): def get_registers(line: str):
to_replace = [] to_replace = []
@ -367,8 +119,15 @@ def get_registers(line: str):
to_replace.append((reg, match.start())) to_replace.append((reg, match.start()))
return to_replace return to_replace
def replace_register(lines: list[str], start_line: int, reg: str, replacement: str) -> list[str]:
return [line.replace(reg, replacement) if i >= start_line else line for i, line in enumerate(lines)] def replace_register(
lines: list[str], start_line: int, reg: str, replacement: str
) -> list[str]:
return [
line.replace(reg, replacement) if i >= start_line else line
for i, line in enumerate(lines)
]
# Is it possible to make new_asm the same as original_asm by swapping registers? # Is it possible to make new_asm the same as original_asm by swapping registers?
def can_resolve_register_differences(original_asm, new_asm): def can_resolve_register_differences(original_asm, new_asm):
@ -382,20 +141,19 @@ def can_resolve_register_differences(original_asm, new_asm):
return False return False
# Look for the mismatching lines # Look for the mismatching lines
for i in range(len(original_asm)): for i, original_line in enumerate(original_asm):
new_line = new_asm[i] new_line = new_asm[i]
original_line = original_asm[i]
if new_line != original_line: if new_line != original_line:
# Find all the registers to replace # Find all the registers to replace
to_replace = get_registers(original_line) to_replace = get_registers(original_line)
for j in range(len(to_replace)): for replace in to_replace:
(reg, reg_index) = to_replace[j] (reg, reg_index) = replace
replacing_reg = new_line[reg_index:reg_index + len(reg)] replacing_reg = new_line[reg_index : reg_index + len(reg)]
if replacing_reg in REGISTER_LIST: if replacing_reg in REGISTER_LIST:
if replacing_reg != reg: if replacing_reg != reg:
# Do a three-way swap replacing in all the subsequent lines # Do a three-way swap replacing in all the subsequent lines
temp_reg = '&' * len(reg) temp_reg = "&" * len(reg)
new_asm = replace_register(new_asm, i, replacing_reg, temp_reg) new_asm = replace_register(new_asm, i, replacing_reg, temp_reg)
new_asm = replace_register(new_asm, i, reg, replacing_reg) new_asm = replace_register(new_asm, i, reg, replacing_reg)
new_asm = replace_register(new_asm, i, temp_reg, reg) new_asm = replace_register(new_asm, i, temp_reg, reg)
@ -403,21 +161,160 @@ def can_resolve_register_differences(original_asm, new_asm):
# No replacement to do, different code, bail out # No replacement to do, different code, bail out
return False return False
# Check if the lines are now the same # Check if the lines are now the same
for i in range(len(original_asm)): for i, original_line in enumerate(original_asm):
if new_asm[i] != original_asm[i]: if new_asm[i] != original_line:
return False return False
return True return True
function_count = 0
total_accuracy = 0
total_effective_accuracy = 0
htmlinsert = []
# Generate basename of original file, used in locating OFFSET lines def gen_html(html_file, data):
basename = os.path.basename(os.path.splitext(original)[0]) output_data = Renderer().render_path(
get_file_in_script_dir("template.html"), {"data": data}
)
for srcfilename in walk_source_dir(source): with open(html_file, "w", encoding="utf-8") as htmlfile:
with open(srcfilename, 'r') as srcfile: htmlfile.write(output_data)
def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_accuracy):
    """Render template.svg with progress statistics and write it to `svg_file`.

    Args:
        svg_file: path of the SVG file to write (UTF-8).
        name_svg: value passed to the template as "name".
        icon: optional path of an image file; its contents are passed to the
            template base64-encoded, or None when no icon is given.
        svg_implemented_funcs: number of functions that were compared.
        total_funcs: number of functions the percentages are measured against.
        raw_accuracy: sum of the per-function accuracy ratios.

    A zero `total_funcs` or `svg_implemented_funcs` yields 0% for the affected
    figures instead of raising ZeroDivisionError.
    """
    icon_data = None
    if icon:
        with open(icon, "rb") as iconfile:
            icon_data = base64.b64encode(iconfile.read()).decode("utf-8")

    # Every ratio is guarded: with nothing to measure against it is simply 0.
    implemented_ratio = svg_implemented_funcs / total_funcs if total_funcs else 0.0
    accuracy_ratio = (
        raw_accuracy / svg_implemented_funcs if svg_implemented_funcs else 0.0
    )
    total_statistic = raw_accuracy / total_funcs if total_funcs else 0.0

    # Scale factor for "progbar"; presumably the width of a full bar in
    # template.svg - confirm against the template.
    full_percentbar_width = 127.18422

    output_data = Renderer().render_path(
        get_file_in_script_dir("template.svg"),
        {
            "name": name_svg,
            "icon": icon_data,
            "implemented": f"{(implemented_ratio * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})",
            "accuracy": f"{(accuracy_ratio * 100):.2f}%",
            "progbar": total_statistic * full_percentbar_width,
            "percent": f"{(total_statistic * 100):.2f}%",
        },
    )
    with open(svg_file, "w", encoding="utf-8") as svgfile:
        svgfile.write(output_data)
# Do the actual work
if __name__ == "__main__":
parser = argparse.ArgumentParser(
allow_abbrev=False,
description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.",
)
parser.add_argument(
"original", metavar="original-binary", help="The original binary"
)
parser.add_argument(
"recompiled", metavar="recompiled-binary", help="The recompiled binary"
)
parser.add_argument(
"pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
)
parser.add_argument(
"decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
)
parser.add_argument(
"--total",
"-T",
metavar="<count>",
help="Total number of expected functions (improves total accuracy statistic)",
)
parser.add_argument(
"--verbose",
"-v",
metavar="<offset>",
help="Print assembly diff for specific function (original file's offset)",
)
parser.add_argument(
"--html",
"-H",
metavar="<file>",
help="Generate searchable HTML summary of status and diffs",
)
parser.add_argument(
"--no-color", "-n", action="store_true", help="Do not color the output"
)
parser.add_argument(
"--svg", "-S", metavar="<file>", help="Generate SVG graphic of progress"
)
parser.add_argument("--svg-icon", metavar="icon", help="Icon to use in SVG (PNG)")
parser.add_argument(
"--print-rec-addr",
action="store_true",
help="Print addresses of recompiled functions too",
)
parser.set_defaults(loglevel=logging.INFO)
parser.add_argument(
"--debug",
action="store_const",
const=logging.DEBUG,
dest="loglevel",
help="Print script debug information",
)
args = parser.parse_args()
logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
colorama.init()
verbose = None
found_verbose_target = False
if args.verbose:
try:
verbose = int(args.verbose, 16)
except ValueError:
parser.error("invalid verbose argument")
html_path = args.html
plain = args.no_color
original = args.original
if not os.path.isfile(original):
parser.error(f"Original binary {original} does not exist")
recomp = args.recompiled
if not os.path.isfile(recomp):
parser.error(f"Recompiled binary {recomp} does not exist")
syms = args.pdb
if not os.path.isfile(syms):
parser.error(f"Symbols PDB {syms} does not exist")
source = args.decomp_dir
if not os.path.isdir(source):
parser.error(f"Source directory {source} does not exist")
svg = args.svg
wine_path_converter = None
if os.name != "nt":
wine_path_converter = WinePathConverter(source)
with Bin(original, logger) as origfile, Bin(recomp, logger) as recompfile:
syminfo = SymInfo(
syms, recompfile, logger, sym_wine_path_converter=wine_path_converter
)
print()
capstone_disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
function_count = 0
total_accuracy = 0
total_effective_accuracy = 0
htmlinsert = []
# Generate basename of original file, used in locating OFFSET lines
basename = os.path.basename(os.path.splitext(original)[0])
for srcfilename in walk_source_dir(source):
with open(srcfilename, "r", encoding="utf-8") as srcfile:
blocks = find_code_blocks(srcfile) blocks = find_code_blocks(srcfile)
for block in blocks: for block in blocks:
@ -440,7 +337,9 @@ def can_resolve_register_differences(original_asm, new_asm):
if not recinfo: if not recinfo:
continue continue
else: else:
recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line) recinfo = syminfo.get_recompiled_address(
srcfilename, block.start_line
)
if not recinfo: if not recinfo:
continue continue
@ -449,8 +348,18 @@ def can_resolve_register_differences(original_asm, new_asm):
ratio = 0.0 ratio = 0.0
effective_ratio = 0.0 effective_ratio = 0.0
if recinfo.size: if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size) origasm = parse_asm(
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size) capstone_disassembler,
origfile,
addr + recinfo.start,
recinfo.size,
)
recompasm = parse_asm(
capstone_disassembler,
recompfile,
recinfo.addr + recinfo.start,
recinfo.size,
)
diff = difflib.SequenceMatcher(None, origasm, recompasm) diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio() ratio = diff.ratio()
@ -464,28 +373,40 @@ def can_resolve_register_differences(original_asm, new_asm):
else: else:
ratio = 0 ratio = 0
percenttext = f'{(effective_ratio * 100):.2f}%' percenttext = f"{(effective_ratio * 100):.2f}%"
if not plain: if not plain:
if effective_ratio == 1.0: if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL percenttext = (
colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
)
elif effective_ratio > 0.8: elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL percenttext = (
colorama.Fore.YELLOW
+ percenttext
+ colorama.Style.RESET_ALL
)
else: else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL percenttext = (
colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
)
if effective_ratio == 1.0 and ratio != 1.0: if effective_ratio == 1.0 and ratio != 1.0:
if plain: if plain:
percenttext += '*' percenttext += "*"
else: else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL percenttext += (
colorama.Fore.RED + "*" + colorama.Style.RESET_ALL
)
if args.print_rec_addr: if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}' addrs = f"0x{addr:x} / 0x{recinfo.addr:x}"
else: else:
addrs = hex(addr) addrs = hex(addr)
if not verbose: if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original') print(
f" {recinfo.name} ({addrs}) is {percenttext} similar to the original"
)
function_count += 1 function_count += 1
total_accuracy += ratio total_accuracy += ratio
@ -497,81 +418,48 @@ def can_resolve_register_differences(original_asm, new_asm):
# If verbose, print the diff for that function to the output # If verbose, print the diff for that function to the output
if verbose: if verbose:
if effective_ratio == 1.0: if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL) ok_text = (
"OK!"
if plain
else (
colorama.Fore.GREEN
+ "✨ OK! ✨"
+ colorama.Style.RESET_ALL
)
)
if ratio == 1.0: if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n') print(
f"{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n"
)
else: else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n') print(
f"{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n"
)
else: else:
for line in udiff: print_diff(udiff, plain)
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
print(line)
else:
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')
print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above') print(
f"\n{recinfo.name} is only {percenttext} similar to the original, diff above"
)
# If html, record the diffs to an HTML file # If html, record the diffs to an HTML file
if html_path: if html_path:
htmlinsert.append({"address": f"0x{addr:x}", htmlinsert.append(
{
"address": f"0x{addr:x}",
"name": recinfo.name, "name": recinfo.name,
"matching": effective_ratio, "matching": effective_ratio,
"diff": '\n'.join(udiff)}) "diff": "\n".join(udiff),
def gen_html(html_file, data):
output_data = Renderer().render_path(get_file_in_script_dir('template.html'),
{
"data": data,
} }
) )
with open(html_file, 'w') as htmlfile: if html_path:
htmlfile.write(output_data)
def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_accuracy):
    """Render template.svg with progress statistics and write it to `svg_file`.

    Args:
        svg_file: path of the SVG file to write (UTF-8).
        name_svg: value passed to the template as "name".
        icon: optional path of an image file; its contents are passed to the
            template base64-encoded, or None when no icon is given.
        svg_implemented_funcs: number of functions that were compared.
        total_funcs: number of functions the percentages are measured against.
        raw_accuracy: sum of the per-function accuracy ratios.

    A zero `total_funcs` or `svg_implemented_funcs` yields 0% for the affected
    figures instead of raising ZeroDivisionError.
    """
    icon_data = None
    if icon:
        with open(icon, 'rb') as iconfile:
            icon_data = base64.b64encode(iconfile.read()).decode('utf-8')

    # Every ratio is guarded: with nothing to measure against it is simply 0.
    implemented_ratio = svg_implemented_funcs / total_funcs if total_funcs else 0.0
    accuracy_ratio = raw_accuracy / svg_implemented_funcs if svg_implemented_funcs else 0.0
    total_statistic = raw_accuracy / total_funcs if total_funcs else 0.0

    # Scale factor for "progbar"; presumably the width of a full bar in
    # template.svg - confirm against the template.
    full_percentbar_width = 127.18422

    output_data = Renderer().render_path(
        get_file_in_script_dir('template.svg'),
        {
            "name": name_svg,
            "icon": icon_data,
            "implemented": f'{(implemented_ratio * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})',
            "accuracy": f'{(accuracy_ratio * 100):.2f}%',
            "progbar": total_statistic * full_percentbar_width,
            "percent": f'{(total_statistic * 100):.2f}%',
        },
    )

    # Explicit encoding so the output does not depend on the platform locale.
    with open(svg_file, 'w', encoding='utf-8') as svgfile:
        svgfile.write(output_data)
if html_path:
gen_html(html_path, json.dumps(htmlinsert)) gen_html(html_path, json.dumps(htmlinsert))
if verbose: if verbose:
if not found_verbose_target: if not found_verbose_target:
print(f'Failed to find the function with address 0x{verbose:x}') print(f"Failed to find the function with address 0x{verbose:x}")
else: else:
implemented_funcs = function_count implemented_funcs = function_count
if args.total: if args.total:
@ -580,7 +468,16 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
if function_count > 0: if function_count > 0:
effective_accuracy = total_effective_accuracy / function_count * 100 effective_accuracy = total_effective_accuracy / function_count * 100
actual_accuracy = total_accuracy / function_count * 100 actual_accuracy = total_accuracy / function_count * 100
print(f'\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)') print(
f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)"
)
if svg: if svg:
gen_svg(svg, os.path.basename(original), args.svg_icon, implemented_funcs, function_count, total_effective_accuracy) gen_svg(
svg,
os.path.basename(original),
args.svg_icon,
implemented_funcs,
function_count,
total_effective_accuracy,
)

View file

@ -1,3 +1,4 @@
tools/isledecomp
capstone capstone
colorama colorama
isledecomp isledecomp

View file

@ -1,76 +1,68 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import colorama
import difflib import difflib
import subprocess import subprocess
import os import os
import sys import sys
parser = argparse.ArgumentParser(allow_abbrev=False, from isledecomp.utils import print_diff
description='Verify Exports: Compare the exports of two DLLs.')
parser.add_argument('original', metavar='original-binary', help='The original binary') parser = argparse.ArgumentParser(
parser.add_argument('recompiled', metavar='recompiled-binary', help='The recompiled binary') allow_abbrev=False, description="Verify Exports: Compare the exports of two DLLs."
parser.add_argument('--no-color', '-n', action='store_true', help='Do not color the output') )
parser.add_argument("original", metavar="original-binary", help="The original binary")
parser.add_argument(
"recompiled", metavar="recompiled-binary", help="The recompiled binary"
)
parser.add_argument(
"--no-color", "-n", action="store_true", help="Do not color the output"
)
args = parser.parse_args() args = parser.parse_args()
if not os.path.isfile(args.original): if not os.path.isfile(args.original):
parser.error(f'Original binary file {args.original} does not exist') parser.error(f"Original binary file {args.original} does not exist")
if not os.path.isfile(args.recompiled): if not os.path.isfile(args.recompiled):
parser.error(f'Recompiled binary {args.recompiled} does not exist') parser.error(f"Recompiled binary {args.recompiled} does not exist")
def get_file_in_script_dir(fn): def get_file_in_script_dir(fn):
return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
def get_exports(file):
call = [get_file_in_script_dir('DUMPBIN.EXE'), '/EXPORTS']
if os.name != 'nt': def get_exports(file):
call.insert(0, 'wine') call = [get_file_in_script_dir("DUMPBIN.EXE"), "/EXPORTS"]
file = subprocess.check_output(['winepath', '-w', file]).decode('utf-8').strip()
if os.name != "nt":
call.insert(0, "wine")
file = subprocess.check_output(["winepath", "-w", file]).decode("utf-8").strip()
call.append(file) call.append(file)
raw = subprocess.check_output(call).decode('utf-8').split('\r\n') raw = subprocess.check_output(call).decode("utf-8").split("\r\n")
exports = [] exports = []
start = False start = False
for line in raw: for line in raw:
if not start: if not start:
if line == ' ordinal hint name': if line == " ordinal hint name":
start = True start = True
else: else:
if line: if line:
exports.append(line[27:line.rindex(' (')]) exports.append(line[27 : line.rindex(" (")])
elif exports: elif exports:
break break
return exports return exports
og_exp = get_exports(args.original) og_exp = get_exports(args.original)
re_exp = get_exports(args.recompiled) re_exp = get_exports(args.recompiled)
udiff = difflib.unified_diff(og_exp, re_exp) udiff = difflib.unified_diff(og_exp, re_exp)
has_diff = False has_diff = print_diff(udiff, args.no_color)
for line in udiff:
has_diff = True
color = ''
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
continue
# Work out color if we are printing color
if not args.no_color:
if line.startswith('+'):
color = colorama.Fore.GREEN
elif line.startswith('-'):
color = colorama.Fore.RED
print(color + line)
# Reset color if we're printing in color
if not args.no_color:
print(colorama.Style.RESET_ALL, end='')
sys.exit(1 if has_diff else 0) sys.exit(1 if has_diff else 0)