Python Linting and Code Formatting (#298)

* Create common print_diff function

* Add pylint and black

* Fix linting, move classes to utils

* Add black/pylint to github actions

* Fix linting

* Move Bin and SymInfo into their own files

* Split out format

* Tidy up workdlows and pip, add readme

* Lint tests, add tests to readme
This commit is contained in:
Thomas Phillips 2023-11-26 07:27:42 +13:00 committed by GitHub
parent fb0d1ccb62
commit b14116cc93
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
22 changed files with 1675 additions and 789 deletions

View file

@ -70,15 +70,14 @@ jobs:
path: legobin
key: legobin
- name: Build isledecomp library
- name: Install python packages
shell: bash
run: |
pip install tools/isledecomp
pip install -r tools/requirements.txt
- name: Summarize Accuracy
shell: bash
run: |
pip install -r tools/reccmp/requirements.txt
python3 tools/reccmp/reccmp.py -S ISLEPROGRESS.SVG --svg-icon tools/reccmp/isle.png -H ISLEPROGRESS.HTML legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB . | tee ISLEPROGRESS.TXT
python3 tools/reccmp/reccmp.py -S LEGO1PROGRESS.SVG -T 1929 --svg-icon tools/reccmp/lego1.png -H LEGO1PROGRESS.HTML legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB . | tee LEGO1PROGRESS.TXT

View file

@ -20,3 +20,20 @@ jobs:
LEGO1/realtime/*.cpp LEGO1/realtime/*.h \
LEGO1/tgl/*.h \
LEGO1/viewmanager/*.cpp LEGO1/viewmanager/*.h
python-format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install python libraries
shell: bash
run: |
pip install black pylint pytest -r tools/requirements.txt
- name: Run pylint and black
shell: bash
run: |
pylint tools --ignore=build
black --check tools

View file

@ -9,12 +9,11 @@ jobs:
steps:
- uses: actions/checkout@v3
- name: Build isledecomp library
- name: Install python libraries
run: |
pip install tools/isledecomp
pip install -r tools/requirements.txt
- name: Run checkorder.py
run: |
pip install -r tools/checkorder/requirements.txt
python3 tools/checkorder/checkorder.py --verbose --enforce ISLE
python3 tools/checkorder/checkorder.py --verbose --enforce LEGO1

4
.gitignore vendored
View file

@ -16,6 +16,6 @@ ISLE.EXE
LEGO1.DLL
build/
*.swp
LEGO1PROGRESS.HTML
LEGO1PROGRESS.SVG
LEGO1PROGRESS.*
ISLEPROGRESS.*
*.pyc

635
.pylintrc Normal file
View file

@ -0,0 +1,635 @@
[MAIN]
# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no
# Clear in-memory caches upon conclusion of linting. Useful if running pylint
# in a server-like mode.
clear-cache-post-run=no
# Load and enable all available extensions. Use --list-extensions to see a list
# all available extensions.
#enable-all-extensions=
# In error mode, messages with a category besides ERROR or FATAL are
# suppressed, and no reports are done by default. Error mode is compatible with
# disabling specific errors.
#errors-only=
# Always return a 0 (non-error) status code, even if lint errors are found.
# This is primarily useful in continuous integration scripts.
#exit-zero=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=
# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
# for backward compatibility.)
extension-pkg-whitelist=
# Return non-zero exit code if any of these messages/categories are detected,
# even if score is above --fail-under value. Syntax same as enable. Messages
# specified are enabled, while categories only check already-enabled messages.
fail-on=
# Specify a score threshold under which the program will exit with error.
fail-under=10
# Interpret the stdin as a python script, whose filename needs to be passed as
# the module_or_package argument.
#from-stdin=
# Files or directories to be skipped. They should be base names, not paths.
ignore=CVS
# Add files or directories matching the regular expressions patterns to the
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\\' represents the directory delimiter on Windows systems,
# it can't be used as an escape character.
ignore-paths=
# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
# Emacs file locks
ignore-patterns=^\.#
# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis). It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=
# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=
# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use, and will cap the count on Windows to
# avoid hangs.
jobs=1
# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100
# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=
# Pickle collected data for later comparisons.
persistent=yes
# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.11
# Discover python modules and packages in the file system subtree.
recursive=no
# Add paths to the list of the source roots. Supports globbing patterns. The
# source root is an absolute path or a path relative to the current working
# directory used to determine a package namespace for modules located under the
# source root.
source-roots=
# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes
# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no
# In verbose mode, extra non-checker-related info will be displayed.
#verbose=
[BASIC]
# Naming style matching correct argument names.
argument-naming-style=snake_case
# Regular expression matching correct argument names. Overrides argument-
# naming-style. If left empty, argument names will be checked with the set
# naming style.
#argument-rgx=
# Naming style matching correct attribute names.
attr-naming-style=snake_case
# Regular expression matching correct attribute names. Overrides attr-naming-
# style. If left empty, attribute names will be checked with the set naming
# style.
#attr-rgx=
# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
bar,
baz,
toto,
tutu,
tata
# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=
# Naming style matching correct class attribute names.
class-attribute-naming-style=any
# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style. If left empty, class attribute names will be checked
# with the set naming style.
#class-attribute-rgx=
# Naming style matching correct class constant names.
class-const-naming-style=UPPER_CASE
# Regular expression matching correct class constant names. Overrides class-
# const-naming-style. If left empty, class constant names will be checked with
# the set naming style.
#class-const-rgx=
# Naming style matching correct class names.
class-naming-style=PascalCase
# Regular expression matching correct class names. Overrides class-naming-
# style. If left empty, class names will be checked with the set naming style.
#class-rgx=
# Naming style matching correct constant names.
const-naming-style=snake_case
# Regular expression matching correct constant names. Overrides const-naming-
# style. If left empty, constant names will be checked with the set naming
# style.
#const-rgx=
# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1
# Naming style matching correct function names.
function-naming-style=snake_case
# Regular expression matching correct function names. Overrides function-
# naming-style. If left empty, function names will be checked with the set
# naming style.
#function-rgx=
# Good variable names which should always be accepted, separated by a comma.
good-names=i,
j,
k,
ex,
Run,
_
# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=
# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no
# Naming style matching correct inline iteration names.
inlinevar-naming-style=any
# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style. If left empty, inline iteration names will be checked
# with the set naming style.
#inlinevar-rgx=
# Naming style matching correct method names.
method-naming-style=snake_case
# Regular expression matching correct method names. Overrides method-naming-
# style. If left empty, method names will be checked with the set naming style.
#method-rgx=
# Naming style matching correct module names.
module-naming-style=snake_case
# Regular expression matching correct module names. Overrides module-naming-
# style. If left empty, module names will be checked with the set naming style.
#module-rgx=
# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=
# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_
# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty
# Regular expression matching correct type alias names. If left empty, type
# alias names will be checked with the set naming style.
#typealias-rgx=
# Regular expression matching correct type variable names. If left empty, type
# variable names will be checked with the set naming style.
#typevar-rgx=
# Naming style matching correct variable names.
variable-naming-style=snake_case
# Regular expression matching correct variable names. Overrides variable-
# naming-style. If left empty, variable names will be checked with the set
# naming style.
#variable-rgx=
[CLASSES]
# Warn about protected attribute access inside special methods
check-protected-access-in-special-methods=no
# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
__new__,
setUp,
asyncSetUp,
__post_init__
# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit
# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls
# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs
[DESIGN]
# List of regular expressions of class ancestor names to ignore when counting
# public methods (see R0903)
exclude-too-few-public-methods=
# List of qualified class names to ignore when counting class parents (see
# R0901)
ignored-parents=
# Maximum number of arguments for function / method.
max-args=6
# Maximum number of attributes for a class (see R0902).
max-attributes=7
# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5
# Maximum number of branch for function / method body.
max-branches=30
# Maximum number of locals for function / method body.
max-locals=30
# Maximum number of parents for a class (see R0901).
max-parents=7
# Maximum number of public methods for a class (see R0904).
max-public-methods=20
# Maximum number of return / yield for function / method body.
max-returns=6
# Maximum number of statements in function / method body.
max-statements=75
# Minimum number of public methods for a class (see R0903).
min-public-methods=0
[EXCEPTIONS]
# Exceptions that will emit a warning when caught.
overgeneral-exceptions=builtins.BaseException,builtins.Exception
[FORMAT]
# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=
# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$
# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=2
# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
# tab).
indent-string=' '
# Maximum number of characters on a single line.
max-line-length=200
# Maximum number of lines in a module.
max-module-lines=1000
# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no
# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no
[IMPORTS]
# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=
# Allow explicit reexports by alias from a package __init__.
allow-reexport-from-package=no
# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no
# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=
# Output a graph (.gv or any supported image format) of external dependencies
# to the given file (report RP0402 must not be disabled).
ext-import-graph=
# Output a graph (.gv or any supported image format) of all (i.e. internal and
# external) dependencies to the given file (report RP0402 must not be
# disabled).
import-graph=
# Output a graph (.gv or any supported image format) of internal dependencies
# to the given file (report RP0402 must not be disabled).
int-import-graph=
# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=
# Force import order to recognize a module as part of a third party library.
known-third-party=enchant
# Couples of modules and preferred modules, separated by a comma.
preferred-modules=
[LOGGING]
# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old
# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging
[MESSAGES CONTROL]
# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
# UNDEFINED.
confidence=HIGH,
CONTROL_FLOW,
INFERENCE,
INFERENCE_FAILURE,
UNDEFINED
# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=raw-checker-failed,
bad-inline-option,
locally-disabled,
file-ignored,
suppressed-message,
useless-suppression,
deprecated-pragma,
use-symbolic-message-instead,
missing-class-docstring,
missing-function-docstring,
missing-module-docstring,
fixme
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=c-extension-no-member
[METHOD_ARGS]
# List of qualified names (i.e., library.method) which require a timeout
# parameter e.g. 'requests.api.get,requests.api.post'
timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request
[MISCELLANEOUS]
# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
XXX,
TODO
# Regular expression of note tags to take in consideration.
notes-rgx=
[REFACTORING]
# Maximum number of nested blocks for function / method body
max-nested-blocks=5
# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit,argparse.parse_error
[REPORTS]
# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'fatal', 'error', 'warning', 'refactor',
# 'convention', and 'info' which contain the number of messages in each
# category, as well as 'statement' which is the total number of statements
# analyzed. This score is used by the global evaluation report (RP0004).
evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
msg-template=
# Set the output format. Available formats are text, parseable, colorized, json
# and msvs (visual studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
#output-format=
# Tells whether to display a full report or only the messages.
reports=no
# Activate the evaluation score.
score=yes
[SIMILARITIES]
# Comments are removed from the similarity computation
ignore-comments=yes
# Docstrings are removed from the similarity computation
ignore-docstrings=yes
# Imports are removed from the similarity computation
ignore-imports=yes
# Signatures are removed from the similarity computation
ignore-signatures=yes
# Minimum lines number of a similarity.
min-similarity-lines=4
[SPELLING]
# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4
# Spelling dictionary name. No available dictionaries : You need to install
# both the python package and the system dependency for enchant to work..
spelling-dict=
# List of comma separated words that should be considered directives if they
# appear at the beginning of a comment and should not be checked.
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:
# List of comma separated words that should not be checked.
spelling-ignore-words=
# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=
# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no
[STRING]
# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no
# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no
[TYPECHECK]
# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager
# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=
# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes
# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes
# List of symbolic message names to ignore for Mixin members.
ignored-checks-for-mixins=no-member,
not-async-context-manager,
not-context-manager,
attribute-defined-outside-init
# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace
# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes
# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1
# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1
# Regex pattern to define which classes are considered mixins.
mixin-class-rgx=.*[Mm]ixin
# List of decorators that change the signature of a decorated function.
signature-mutators=
[VARIABLES]
# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=
# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes
# List of names allowed to shadow builtins
allowed-redefined-builtins=
# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,
_cb
# A regular expression matching the name of dummy variables (i.e. expected to
# not be used).
dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
# Argument names that match this expression will be ignored.
ignored-argument-names=_.*|^ignored_|^unused_
# Tells whether we should check for unused import in __init__ files.
init-import=no
# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io

2
pyproject.toml Normal file
View file

@ -0,0 +1,2 @@
[flake8]
max-line-length = 120

50
tools/README.md Normal file
View file

@ -0,0 +1,50 @@
# LEGO Island Decompilation Tools
These are a set of Python tools for helping with the decomp project
## Installing
Use pip to install the required packages:
```
pip install -r tools/requirements.txt
```
## reccmp
This is a script to compare the original EXE or DLL with a recpmpiled EXE or DLL, provided a .PDB file
## verexp
This verifies exports by comparing the exports of an original DLL and the recompiled DLL
## checkorder
This checks the order of C++ source and header files to make sure the functions are in order
## isledecomp
This is a library that is used by rhe above scripts. it has a collection of useful classes and functions
### Testing
`isledecomp` has a small suite of tests. Install pylint and run it, passing in the directory:
```
pip install pytest
pytest tools/isledecomp/tests/
```
## Development
In order to keep the code clean and consistent, we use `pylint` and `black`:
```
pip install black pylint
```
### To run pylint (ignores build and virtualenv):
```
pylint tools/ --ignore=build,bin,lib
```
### To check code formatting without rewriting files:
```
black --check tools/
```
### To apply code formatting:
```
black tools/
```

View file

@ -1,14 +1,9 @@
import os
import sys
import argparse
from isledecomp.dir import (
walk_source_dir,
is_file_cpp
)
from isledecomp.dir import walk_source_dir, is_file_cpp
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import (
is_exact_offset_comment
)
from isledecomp.parser.util import is_exact_offset_comment
def sig_truncate(sig: str) -> str:
@ -21,12 +16,14 @@ def check_file(filename: str, verbose: bool = False) -> bool:
"""Open and read the given file, then check whether the code blocks
are in order. If verbose, print each block."""
with open(filename, 'r') as f:
with open(filename, "r", encoding="utf-8") as f:
code_blocks = find_code_blocks(f)
bad_comments = [(block.start_line, block.offset_comment)
bad_comments = [
(block.start_line, block.offset_comment)
for block in code_blocks
if not is_exact_offset_comment(block.offset_comment)]
if not is_exact_offset_comment(block.offset_comment)
]
just_offsets = [block.offset for block in code_blocks]
sorted_offsets = sorted(just_offsets)
@ -35,8 +32,7 @@ def check_file(filename: str, verbose: bool = False) -> bool:
# If we detect inexact comments, don't print anything unless we are
# in verbose mode. If the file is out of order, we always print the
# file name.
should_report = ((len(bad_comments) > 0 and verbose)
or file_out_of_order)
should_report = (len(bad_comments) > 0 and verbose) or file_out_of_order
if not should_report and not file_out_of_order:
return False
@ -49,19 +45,21 @@ def check_file(filename: str, verbose: bool = False) -> bool:
prev_offset = 0
for block in code_blocks:
msg = ' '.join([
' ' if block.offset > prev_offset else '!',
f'{block.offset:08x}',
f'{block.end_line - block.start_line:4} lines',
f'{order_lookup[block.offset]:3}',
' ',
msg = " ".join(
[
" " if block.offset > prev_offset else "!",
f"{block.offset:08x}",
f"{block.end_line - block.start_line:4} lines",
f"{order_lookup[block.offset]:3}",
" ",
sig_truncate(block.signature),
])
]
)
print(msg)
prev_offset = block.offset
for (line_no, line) in bad_comments:
print(f'* line {line_no:3} bad offset comment ({line})')
for line_no, line in bad_comments:
print(f"* line {line_no:3} bad offset comment ({line})")
print()
@ -69,15 +67,25 @@ def check_file(filename: str, verbose: bool = False) -> bool:
def parse_args(test_args: list | None = None) -> dict:
p = argparse.ArgumentParser()
p.add_argument('target', help='The file or directory to check.')
p.add_argument('--enforce', action=argparse.BooleanOptionalAction,
p = argparse.ArgumentParser(
description="Checks the source files to make sure the function offset comments are in order",
)
p.add_argument("target", help="The file or directory to check.")
p.add_argument(
"--enforce",
action=argparse.BooleanOptionalAction,
default=False,
help='Fail with error code if target is out of order.')
p.add_argument('--verbose', action=argparse.BooleanOptionalAction,
help="Fail with error code if target is out of order.",
)
p.add_argument(
"--verbose",
action=argparse.BooleanOptionalAction,
default=False,
help=('Display each code block in the file and show '
'where each consecutive run of blocks is broken.'))
help=(
"Display each code block in the file and show "
"where each consecutive run of blocks is broken."
),
)
if test_args is None:
args = p.parse_args()
@ -90,31 +98,33 @@ def parse_args(test_args: list | None = None) -> dict:
def main():
args = parse_args()
if os.path.isdir(args['target']):
files_to_check = list(walk_source_dir(args['target']))
elif os.path.isfile(args['target']) and is_file_cpp(args['target']):
files_to_check = [args['target']]
if os.path.isdir(args["target"]):
files_to_check = list(walk_source_dir(args["target"]))
elif os.path.isfile(args["target"]) and is_file_cpp(args["target"]):
files_to_check = [args["target"]]
else:
sys.exit('Invalid target')
sys.exit("Invalid target")
files_out_of_order = 0
for file in files_to_check:
is_jumbled = check_file(file, args['verbose'])
is_jumbled = check_file(file, args["verbose"])
if is_jumbled:
files_out_of_order += 1
if files_out_of_order > 0:
error_message = ' '.join([
error_message = " ".join(
[
str(files_out_of_order),
'files are' if files_out_of_order > 1 else 'file is',
'out of order'
])
"files are" if files_out_of_order > 1 else "file is",
"out of order",
]
)
print(error_message)
if files_out_of_order > 0 and args['enforce']:
if files_out_of_order > 0 and args["enforce"]:
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View file

@ -1 +0,0 @@
isledecomp

View file

@ -0,0 +1,5 @@
from .bin import *
from .dir import *
from .parser import *
from .syminfo import *
from .utils import *

View file

@ -0,0 +1,47 @@
import struct
# Declare a class that can automatically convert virtual executable addresses
# to file addresses
class Bin:
def __init__(self, filename, logger):
self.logger = logger
self.logger.debug('Parsing headers of "%s"... ', filename)
self.filename = filename
self.file = None
self.imagebase = None
self.textvirt = None
self.textraw = None
def __enter__(self):
self.logger.debug(f"Bin {self.filename} Enter")
self.file = open(self.filename, "rb")
# HACK: Strictly, we should be parsing the header, but we know where
# everything is in these two files so we just jump straight there
# Read ImageBase
self.file.seek(0xB4)
(self.imagebase,) = struct.unpack("<i", self.file.read(4))
# Read .text VirtualAddress
self.file.seek(0x184)
(self.textvirt,) = struct.unpack("<i", self.file.read(4))
# Read .text PointerToRawData
self.file.seek(0x18C)
(self.textraw,) = struct.unpack("<i", self.file.read(4))
self.logger.debug("... Parsing finished")
return self
def __exit__(self, exc_type, exc_value, exc_traceback):
self.logger.debug(f"Bin {self.filename} Exit")
if self.file:
self.file.close()
def get_addr(self, virt):
return virt - self.imagebase - self.textvirt + self.textraw
def read(self, offset, size):
self.file.seek(self.get_addr(offset))
return self.file.read(size)

View file

@ -1,10 +1,48 @@
import os
import subprocess
import sys
from typing import Iterator
class WinePathConverter:
def __init__(self, unix_cwd):
self.unix_cwd = unix_cwd
self.win_cwd = self._call_winepath_unix2win(self.unix_cwd)
def get_wine_path(self, unix_fn: str) -> str:
if unix_fn.startswith("./"):
return self.win_cwd + "\\" + unix_fn[2:].replace("/", "\\")
if unix_fn.startswith(self.unix_cwd):
return (
self.win_cwd
+ "\\"
+ unix_fn.removeprefix(self.unix_cwd).replace("/", "\\").lstrip("\\")
)
return self._call_winepath_unix2win(unix_fn)
def get_unix_path(self, win_fn: str) -> str:
if win_fn.startswith(".\\") or win_fn.startswith("./"):
return self.unix_cwd + "/" + win_fn[2:].replace("\\", "/")
if win_fn.startswith(self.win_cwd):
return (
self.unix_cwd
+ "/"
+ win_fn.removeprefix(self.win_cwd).replace("\\", "/")
)
return self._call_winepath_win2unix(win_fn)
@staticmethod
def _call_winepath_unix2win(fn: str) -> str:
return subprocess.check_output(["winepath", "-w", fn], text=True).strip()
@staticmethod
def _call_winepath_win2unix(fn: str) -> str:
return subprocess.check_output(["winepath", fn], text=True).strip()
def is_file_cpp(filename: str) -> bool:
(basefile, ext) = os.path.splitext(filename)
return ext.lower() in ('.h', '.cpp')
(_, ext) = os.path.splitext(filename)
return ext.lower() in (".h", ".cpp")
def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
@ -12,10 +50,14 @@ def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
any C++ files found."""
source = os.path.abspath(source)
for subdir, dirs, files in os.walk(source):
for subdir, _, files in os.walk(source):
for file in files:
if is_file_cpp(file):
yield os.path.join(subdir, file)
if not recursive:
break
def get_file_in_script_dir(fn):
return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)

View file

@ -7,7 +7,6 @@
OffsetMatch,
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
get_template_function_name,
remove_trailing_comment,
distinct_by_module,
@ -51,14 +50,16 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
# Our list of offset marks could have duplicates on
# module name, so we'll eliminate those now.
for offset_match in distinct_by_module(offset_matches):
block = CodeBlock(offset=offset_match.address,
block = CodeBlock(
offset=offset_match.address,
signature=function_sig,
start_line=start_line,
end_line=end_line,
offset_comment=offset_match.comment,
module=offset_match.module,
is_template=offset_match.is_template,
is_stub=offset_match.is_stub)
is_stub=offset_match.is_stub,
)
blocks.append(block)
offset_matches = []
state = ReaderState.WANT_OFFSET
@ -66,15 +67,18 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
if can_seek:
line_no += 1
line = stream.readline()
if line == '':
if line == "":
break
new_match = match_offset_comment(line)
if new_match is not None:
# We will allow multiple offsets if we have just begun
# the code block, but not after we hit the curly brace.
if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
ReaderState.WANT_SIG):
if state in (
ReaderState.WANT_OFFSET,
ReaderState.IN_TEMPLATE,
ReaderState.WANT_SIG,
):
# If we detected an offset marker unexpectedly,
# we are handling it here so we can continue seeking.
can_seek = True
@ -116,11 +120,10 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
# same line. clang-format should prevent this (BraceWrapping)
# but it is easy to detect.
# If the entire function is on one line, handle that too.
if function_sig.endswith('{'):
if function_sig.endswith("{"):
start_line = line_no
state = ReaderState.IN_FUNC
elif (function_sig.endswith('}') or
function_sig.endswith('};')):
elif function_sig.endswith("}") or function_sig.endswith("};"):
start_line = line_no
end_line = line_no
state = ReaderState.FUNCTION_DONE
@ -128,14 +131,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
state = ReaderState.WANT_CURLY
elif state == ReaderState.WANT_CURLY:
if line.strip() == '{':
if line.strip() == "{":
start_line = line_no
state = ReaderState.IN_FUNC
elif state == ReaderState.IN_FUNC:
# Naive but reasonable assumption that functions will end with
# a curly brace on its own line with no prepended spaces.
if line.startswith('}'):
if line.startswith("}"):
end_line = line_no
state = ReaderState.FUNCTION_DONE

View file

@ -5,34 +5,49 @@
from collections import namedtuple
CodeBlock = namedtuple('CodeBlock',
['offset', 'signature', 'start_line', 'end_line',
'offset_comment', 'module', 'is_template', 'is_stub'])
CodeBlock = namedtuple(
"CodeBlock",
[
"offset",
"signature",
"start_line",
"end_line",
"offset_comment",
"module",
"is_template",
"is_stub",
],
)
OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
'is_stub', 'comment'])
OffsetMatch = namedtuple(
"OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
)
# This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last,
# after any other modifiers (i.e. TEMPLATE)
# To match a reasonable variance of formatting for the offset comment
offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?', # nopep8
flags=re.I)
offsetCommentRegex = re.compile(
r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8
flags=re.I,
)
# To match the exact syntax (text upper case, hex lower case, with spaces)
# that is used in most places
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$') # nopep8
offsetCommentExactRegex = re.compile(
r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
) # nopep8
# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
templateCommentRegex = re.compile(r'\s*//\s+(.*)')
templateCommentRegex = re.compile(r"\s*//\s+(.*)")
# To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")
def get_template_function_name(line: str) -> str:
@ -47,7 +62,7 @@ def get_template_function_name(line: str) -> str:
def remove_trailing_comment(line: str) -> str:
return trailingCommentRegex.sub('', line)
return trailingCommentRegex.sub("", line)
def is_blank_or_comment(line: str) -> bool:
@ -55,10 +70,12 @@ def is_blank_or_comment(line: str) -> bool:
There could be blank lines or other comments before the
function signature, and we want to skip those."""
line_strip = line.strip()
return (len(line_strip) == 0
or line_strip.startswith('//')
or line_strip.startswith('/*')
or line_strip.endswith('*/'))
return (
len(line_strip) == 0
or line_strip.startswith("//")
or line_strip.startswith("/*")
or line_strip.endswith("*/")
)
def is_exact_offset_comment(line: str) -> bool:
@ -72,11 +89,13 @@ def match_offset_comment(line: str) -> OffsetMatch | None:
if match is None:
return None
return OffsetMatch(module=match.group(1),
return OffsetMatch(
module=match.group(1),
address=int(match.group(2), 16),
is_template=match.group(3) is not None,
is_stub=match.group(4) is not None,
comment=line.strip())
comment=line.strip(),
)
def distinct_by_module(offsets: List) -> List:

View file

@ -0,0 +1,138 @@
import os
import subprocess
from .utils import get_file_in_script_dir
class RecompiledInfo:
addr = None
size = None
name = None
start = None
# Declare a class that parses the output of cvdump for fast access later
class SymInfo:
funcs = {}
lines = {}
names = {}
def __init__(self, pdb, sym_recompfile, sym_logger, sym_wine_path_converter=None):
self.logger = sym_logger
call = [get_file_in_script_dir("cvdump.exe"), "-l", "-s"]
if sym_wine_path_converter:
# Run cvdump through wine and convert path to Windows-friendly wine path
call.insert(0, "wine")
call.append(sym_wine_path_converter.get_wine_path(pdb))
else:
call.append(pdb)
self.logger.info("Parsing %s ...", pdb)
self.logger.debug("Command = %s", call)
line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")
current_section = None
self.logger.debug("Parsing output of cvdump.exe ...")
for i, line in enumerate(line_dump):
if line.startswith("***"):
current_section = line[4:]
if current_section == "SYMBOLS" and "S_GPROC32" in line:
sym_addr = int(line[26:34], 16)
info = RecompiledInfo()
info.addr = (
sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt
)
use_dbg_offs = False
if use_dbg_offs:
debug_offs = line_dump[i + 2]
debug_start = int(debug_offs[22:30], 16)
debug_end = int(debug_offs[43:], 16)
info.start = debug_start
info.size = debug_end - debug_start
else:
info.start = 0
info.size = int(line[41:49], 16)
info.name = line[77:]
self.names[info.name] = info
self.funcs[sym_addr] = info
elif (
current_section == "LINES"
and line.startswith(" ")
and not line.startswith(" ")
):
sourcepath = line.split()[0]
if sym_wine_path_converter:
# Convert filename to Unix path for file compare
sourcepath = sym_wine_path_converter.get_unix_path(sourcepath)
if sourcepath not in self.lines:
self.lines[sourcepath] = {}
j = i + 2
while True:
ll = line_dump[j].split()
if len(ll) == 0:
break
k = 0
while k < len(ll):
linenum = int(ll[k + 0])
address = int(ll[k + 1], 16)
if linenum not in self.lines[sourcepath]:
self.lines[sourcepath][linenum] = address
k += 2
j += 1
self.logger.debug("... Parsing output of cvdump.exe finished")
def get_recompiled_address(self, filename, line):
recompiled_addr = None
self.logger.debug("Looking for %s:%s", filename, line)
filename_basename = os.path.basename(filename).lower()
for fn in self.lines:
# Sometimes a PDB is compiled with a relative path while we always have
# an absolute path. Therefore we must
try:
if os.path.basename(
fn
).lower() == filename_basename and os.path.samefile(fn, filename):
filename = fn
break
except FileNotFoundError:
continue
if filename in self.lines and line in self.lines[filename]:
recompiled_addr = self.lines[filename][line]
if recompiled_addr in self.funcs:
return self.funcs[recompiled_addr]
self.logger.error(
"Failed to find function symbol with address: %x", recompiled_addr
)
return None
self.logger.error(
"Failed to find function symbol with filename and line: %s:%s",
filename,
line,
)
return None
def get_recompiled_address_from_name(self, name):
self.logger.debug("Looking for %s", name)
if name in self.names:
return self.names[name]
self.logger.error("Failed to find function symbol with name: %s", name)
return None

View file

@ -0,0 +1,42 @@
import os
import sys
import colorama
def print_diff(udiff, plain):
has_diff = False
for line in udiff:
has_diff = True
color = ""
if line.startswith("++") or line.startswith("@@") or line.startswith("--"):
# Skip unneeded parts of the diff for the brief view
continue
# Work out color if we are printing color
if not plain:
if line.startswith("+"):
color = colorama.Fore.GREEN
elif line.startswith("-"):
color = colorama.Fore.RED
print(color + line)
# Reset color if we're printing in color
if not plain:
print(colorama.Style.RESET_ALL, end="")
return has_diff
def get_file_in_script_dir(fn):
return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
class OffsetPlaceholderGenerator:
def __init__(self):
self.counter = 0
self.replacements = {}
def get(self, replace_addr):
if replace_addr in self.replacements:
return self.replacements[replace_addr]
self.counter += 1
replacement = f"<OFFSET{self.counter}>"
self.replacements[replace_addr] = replacement
return replacement

View file

@ -1,9 +1,9 @@
from setuptools import setup, find_packages
setup(
name='isledecomp',
version='0.1.0',
description='Python tools for the isledecomp project',
name="isledecomp",
version="0.1.0",
description="Python tools for the isledecomp project",
packages=find_packages(),
tests_require=['pytest'],
tests_require=["pytest"],
)

View file

@ -1,17 +1,16 @@
import os
import pytest
from typing import List, TextIO
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
def sample_file(filename: str) -> TextIO:
"""Wrapper for opening the samples from the directory that does not
depend on the cwd where we run the test"""
full_path = os.path.join(SAMPLE_DIR, filename)
return open(full_path, 'r')
return open(full_path, "r", encoding="utf-8")
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
@ -25,7 +24,7 @@ def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
def test_sanity():
"""Read a very basic file"""
with sample_file('basic_file.cpp') as f:
with sample_file("basic_file.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 3
@ -39,7 +38,7 @@ def test_sanity():
def test_oneline():
"""(Assuming clang-format permits this) This sample has a function
on a single line. This will test the end-of-function detection"""
with sample_file('oneline_function.cpp') as f:
with sample_file("oneline_function.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 2
@ -49,7 +48,7 @@ def test_oneline():
def test_missing_offset():
"""What if the function doesn't have an offset comment?"""
with sample_file('missing_offset.cpp') as f:
with sample_file("missing_offset.cpp") as f:
blocks = find_code_blocks(f)
# TODO: For now, the function without the offset will just be ignored.
@ -62,7 +61,7 @@ def test_jumbled_case():
"""The parser just reports what it sees. It is the responsibility of
the downstream tools to do something about a jumbled file.
Just verify that we are reading it correctly."""
with sample_file('out_of_order.cpp') as f:
with sample_file("out_of_order.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 3
@ -70,7 +69,7 @@ def test_jumbled_case():
def test_bad_file():
with sample_file('poorly_formatted.cpp') as f:
with sample_file("poorly_formatted.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 3
@ -78,7 +77,7 @@ def test_bad_file():
def test_indented():
"""Offsets for functions inside of a class will probably be indented."""
with sample_file('basic_class.cpp') as f:
with sample_file("basic_class.cpp") as f:
blocks = find_code_blocks(f)
# TODO: We don't properly detect the end of these functions
@ -87,17 +86,17 @@ def test_indented():
# all the functions that are there.
assert len(blocks) == 2
assert blocks[0].offset == int('0x12345678', 16)
assert blocks[0].offset == int("0x12345678", 16)
assert blocks[0].start_line == 15
# assert blocks[0].end_line == 18
assert blocks[1].offset == int('0xdeadbeef', 16)
assert blocks[1].offset == int("0xdeadbeef", 16)
assert blocks[1].start_line == 22
# assert blocks[1].end_line == 24
def test_inline():
with sample_file('inline.cpp') as f:
with sample_file("inline.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 2
@ -110,19 +109,19 @@ def test_multiple_offsets():
"""If multiple offset marks appear before for a code block, take them
all but ensure module name (case-insensitive) is distinct.
Use first module occurrence in case of duplicates."""
with sample_file('multiple_offsets.cpp') as f:
with sample_file("multiple_offsets.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 4
assert blocks[0].module == 'TEST'
assert blocks[0].module == "TEST"
assert blocks[0].start_line == 9
assert blocks[1].module == 'HELLO'
assert blocks[1].module == "HELLO"
assert blocks[1].start_line == 9
# Duplicate modules are ignored
assert blocks[2].start_line == 16
assert blocks[2].offset == 0x2345
assert blocks[3].module == 'TEST'
assert blocks[3].module == "TEST"
assert blocks[3].offset == 0x2002

View file

@ -1,6 +1,6 @@
import pytest
from collections import namedtuple
from typing import List
import pytest
from isledecomp.parser.util import (
is_blank_or_comment,
match_offset_comment,
@ -10,21 +10,20 @@
blank_or_comment_param = [
(True, ''),
(True, '\t'),
(True, ' '),
(False, '\tint abc=123;'),
(True, '// OFFSET: LEGO1 0xdeadbeef'),
(True, ' /* Block comment beginning'),
(True, 'Block comment ending */ '),
(True, ""),
(True, "\t"),
(True, " "),
(False, "\tint abc=123;"),
(True, "// OFFSET: LEGO1 0xdeadbeef"),
(True, " /* Block comment beginning"),
(True, "Block comment ending */ "),
# TODO: does clang-format have anything to say about these cases?
(False, 'x++; // Comment folows'),
(False, 'x++; /* Block comment begins'),
(False, "x++; // Comment folows"),
(False, "x++; /* Block comment begins"),
]
@pytest.mark.parametrize('expected, line', blank_or_comment_param)
@pytest.mark.parametrize("expected, line", blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool):
assert is_blank_or_comment(line) is expected
@ -32,82 +31,73 @@ def test_is_blank_or_comment(line: str, expected: bool):
offset_comment_samples = [
# (can_parse: bool, exact_match: bool, line: str)
# Should match both expected modules with optional STUB marker
(True, True, '// OFFSET: LEGO1 0xdeadbeef'),
(True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'),
(True, True, '// OFFSET: ISLE 0x12345678'),
(True, True, '// OFFSET: ISLE 0x12345678 STUB'),
(True, True, "// OFFSET: LEGO1 0xdeadbeef"),
(True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"),
(True, True, "// OFFSET: ISLE 0x12345678"),
(True, True, "// OFFSET: ISLE 0x12345678 STUB"),
# No trailing spaces allowed
(True, False, '// OFFSET: LEGO1 0xdeadbeef '),
(True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '),
(True, False, "// OFFSET: LEGO1 0xdeadbeef "),
(True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "),
# Must have exactly one space between elements
(True, False, '//OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET:ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef STUB'),
(True, False, "//OFFSET: ISLE 0xdeadbeef"),
(True, False, "// OFFSET:ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef STUB"),
# Must have 0x prefix for hex number
(True, False, '// OFFSET: ISLE deadbeef'),
(True, False, "// OFFSET: ISLE deadbeef"),
# Offset, module name, and STUB must be uppercase
(True, False, '// offset: ISLE 0xdeadbeef'),
(True, False, '// offset: isle 0xdeadbeef'),
(True, False, '// OFFSET: LEGO1 0xdeadbeef stub'),
(True, False, "// offset: ISLE 0xdeadbeef"),
(True, False, "// offset: isle 0xdeadbeef"),
(True, False, "// OFFSET: LEGO1 0xdeadbeef stub"),
# Hex string must be lowercase
(True, False, '// OFFSET: ISLE 0xDEADBEEF'),
(True, False, "// OFFSET: ISLE 0xDEADBEEF"),
# TODO: How flexible should we be with matching the module name?
(True, True, '// OFFSET: OMNI 0x12345678'),
(True, True, '// OFFSET: LEG01 0x12345678'),
(True, False, '// OFFSET: hello 0x12345678'),
(True, True, "// OFFSET: OMNI 0x12345678"),
(True, True, "// OFFSET: LEG01 0x12345678"),
(True, False, "// OFFSET: hello 0x12345678"),
# Not close enough to match
(False, False, '// OFFSET: ISLE0x12345678'),
(False, False, '// OFFSET: 0x12345678'),
(False, False, '// LEGO1: 0x12345678'),
(False, False, "// OFFSET: ISLE0x12345678"),
(False, False, "// OFFSET: 0x12345678"),
(False, False, "// LEGO1: 0x12345678"),
# Hex string shorter than 8 characters
(True, True, '// OFFSET: LEGO1 0x1234'),
(True, True, "// OFFSET: LEGO1 0x1234"),
# TODO: These match but shouldn't.
# (False, False, '// OFFSET: LEGO1 0'),
# (False, False, '// OFFSET: LEGO1 0x'),
]
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_offset_match(line: str, match: bool, exact):
@pytest.mark.parametrize("match, _, line", offset_comment_samples)
def test_offset_match(line: str, match: bool, _):
did_match = match_offset_comment(line) is not None
assert did_match is match
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match):
@pytest.mark.parametrize("_, exact, line", offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, _):
assert is_exact_offset_comment(line) is exact
# Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value'])
MiniOfs = namedtuple("MiniOfs", ["module", "value"])
distinct_by_module_samples = [
# empty set
([], []),
# same module name
([MiniOfs('TEST', 123), MiniOfs('TEST', 555)],
[MiniOfs('TEST', 123)]),
([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]),
# same module name, case-insensitive
([MiniOfs('test', 123), MiniOfs('TEST', 555)],
[MiniOfs('test', 123)]),
([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]),
# duplicates, non-consecutive
([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)],
[MiniOfs('test', 123), MiniOfs('abc', 111)]),
(
[MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)],
[MiniOfs("test", 123), MiniOfs("abc", 111)],
),
]
@pytest.mark.parametrize('sample, expected', distinct_by_module_samples)
@pytest.mark.parametrize("sample, expected", distinct_by_module_samples)
def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
assert distinct_by_module(sample) == expected

View file

@ -2,284 +2,51 @@
import argparse
import base64
from capstone import *
import difflib
import struct
import subprocess
import json
import logging
import os
import sys
import colorama
import json
import re
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks
from isledecomp import (
Bin,
find_code_blocks,
get_file_in_script_dir,
OffsetPlaceholderGenerator,
print_diff,
SymInfo,
walk_source_dir,
WinePathConverter,
)
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
import colorama
from pystache import Renderer
parser = argparse.ArgumentParser(allow_abbrev=False,
description='Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.')
parser.add_argument('original', metavar='original-binary', help='The original binary')
parser.add_argument('recompiled', metavar='recompiled-binary', help='The recompiled binary')
parser.add_argument('pdb', metavar='recompiled-pdb', help='The PDB of the recompiled binary')
parser.add_argument('decomp_dir', metavar='decomp-dir', help='The decompiled source tree')
parser.add_argument('--total', '-T', metavar='<count>', help='Total number of expected functions (improves total accuracy statistic)')
parser.add_argument('--verbose', '-v', metavar='<offset>', help='Print assembly diff for specific function (original file\'s offset)')
parser.add_argument('--html', '-H', metavar='<file>', help='Generate searchable HTML summary of status and diffs')
parser.add_argument('--no-color', '-n', action='store_true', help='Do not color the output')
parser.add_argument('--svg', '-S', metavar='<file>', help='Generate SVG graphic of progress')
parser.add_argument('--svg-icon', metavar='icon', help='Icon to use in SVG (PNG)')
parser.add_argument('--print-rec-addr', action='store_true', help='Print addresses of recompiled functions too')
REGISTER_LIST = set(
[
"ax",
"bp",
"bx",
"cx",
"di",
"dx",
"eax",
"ebp",
"ebx",
"ecx",
"edi",
"edx",
"esi",
"esp",
"si",
"sp",
]
)
WORDS = re.compile(r"\w+")
parser.set_defaults(loglevel=logging.INFO)
parser.add_argument('--debug', action='store_const', const=logging.DEBUG, dest='loglevel', help='Print script debug information')
args = parser.parse_args()
logging.basicConfig(level=args.loglevel, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)
colorama.init()
verbose = None
found_verbose_target = False
if args.verbose:
try:
verbose = int(args.verbose, 16)
except ValueError:
parser.error('invalid verbose argument')
html_path = args.html
plain = args.no_color
original = args.original
if not os.path.isfile(original):
parser.error(f'Original binary {original} does not exist')
recomp = args.recompiled
if not os.path.isfile(recomp):
parser.error(f'Recompiled binary {recomp} does not exist')
syms = args.pdb
if not os.path.isfile(syms):
parser.error(f'Symbols PDB {syms} does not exist')
source = args.decomp_dir
if not os.path.isdir(source):
parser.error(f'Source directory {source} does not exist')
svg = args.svg
# Declare a class that can automatically convert virtual executable addresses
# to file addresses
class Bin:
def __init__(self, filename):
logger.debug(f'Parsing headers of "{filename}"... ')
self.file = open(filename, 'rb')
#HACK: Strictly, we should be parsing the header, but we know where
# everything is in these two files so we just jump straight there
# Read ImageBase
self.file.seek(0xB4)
self.imagebase, = struct.unpack('<i', self.file.read(4))
# Read .text VirtualAddress
self.file.seek(0x184)
self.textvirt, = struct.unpack('<i', self.file.read(4))
# Read .text PointerToRawData
self.file.seek(0x18C)
self.textraw, = struct.unpack('<i', self.file.read(4))
logger.debug('... Parsing finished')
def __del__(self):
if self.file:
self.file.close()
def get_addr(self, virt):
return virt - self.imagebase - self.textvirt + self.textraw
def read(self, offset, size):
self.file.seek(self.get_addr(offset))
return self.file.read(size)
class RecompiledInfo:
def __init__(self):
self.addr = None
self.size = None
self.name = None
self.start = None
class WinePathConverter:
def __init__(self, unix_cwd):
self.unix_cwd = unix_cwd
self.win_cwd = self._call_winepath_unix2win(self.unix_cwd)
def get_wine_path(self, unix_fn: str) -> str:
if unix_fn.startswith('./'):
return self.win_cwd + '\\' + unix_fn[2:].replace('/', '\\')
if unix_fn.startswith(self.unix_cwd):
return self.win_cwd + '\\' + unix_fn.removeprefix(self.unix_cwd).replace('/', '\\').lstrip('\\')
return self._call_winepath_unix2win(unix_fn)
def get_unix_path(self, win_fn: str) -> str:
if win_fn.startswith('.\\') or win_fn.startswith('./'):
return self.unix_cwd + '/' + win_fn[2:].replace('\\', '/')
if win_fn.startswith(self.win_cwd):
return self.unix_cwd + '/' + win_fn.removeprefix(self.win_cwd).replace('\\', '/')
return self._call_winepath_win2unix(win_fn)
@staticmethod
def _call_winepath_unix2win(fn: str) -> str:
return subprocess.check_output(['winepath', '-w', fn], text=True).strip()
@staticmethod
def _call_winepath_win2unix(fn: str) -> str:
return subprocess.check_output(['winepath', fn], text=True).strip()
def get_file_in_script_dir(fn):
return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
# Declare a class that parses the output of cvdump for fast access later
class SymInfo:
funcs = {}
lines = {}
names = {}
def __init__(self, pdb, file, wine_path_converter):
call = [get_file_in_script_dir('cvdump.exe'), '-l', '-s']
if wine_path_converter:
# Run cvdump through wine and convert path to Windows-friendly wine path
call.insert(0, 'wine')
call.append(wine_path_converter.get_wine_path(pdb))
else:
call.append(pdb)
logger.info(f'Parsing {pdb} ...')
logger.debug(f'Command = {call}')
line_dump = subprocess.check_output(call).decode('utf-8').split('\r\n')
current_section = None
logger.debug('Parsing output of cvdump.exe ...')
for i, line in enumerate(line_dump):
if line.startswith('***'):
current_section = line[4:]
if current_section == 'SYMBOLS' and 'S_GPROC32' in line:
addr = int(line[26:34], 16)
info = RecompiledInfo()
info.addr = addr + recompfile.imagebase + recompfile.textvirt
use_dbg_offs = False
if use_dbg_offs:
debug_offs = line_dump[i + 2]
debug_start = int(debug_offs[22:30], 16)
debug_end = int(debug_offs[43:], 16)
info.start = debug_start
info.size = debug_end - debug_start
else:
info.start = 0
info.size = int(line[41:49], 16)
info.name = line[77:]
self.names[info.name] = info
self.funcs[addr] = info
elif current_section == 'LINES' and line.startswith(' ') and not line.startswith(' '):
sourcepath = line.split()[0]
if wine_path_converter:
# Convert filename to Unix path for file compare
sourcepath = wine_path_converter.get_unix_path(sourcepath)
if sourcepath not in self.lines:
self.lines[sourcepath] = {}
j = i + 2
while True:
ll = line_dump[j].split()
if len(ll) == 0:
break
k = 0
while k < len(ll):
linenum = int(ll[k + 0])
address = int(ll[k + 1], 16)
if linenum not in self.lines[sourcepath]:
self.lines[sourcepath][linenum] = address
k += 2
j += 1
logger.debug('... Parsing output of cvdump.exe finished')
def get_recompiled_address(self, filename, line):
addr = None
found = False
logger.debug(f'Looking for {filename}:{line}')
filename_basename = os.path.basename(filename).lower()
for fn in self.lines:
# Sometimes a PDB is compiled with a relative path while we always have
# an absolute path. Therefore we must
try:
if (os.path.basename(fn).lower() == filename_basename and
os.path.samefile(fn, filename)):
filename = fn
break
except FileNotFoundError as e:
continue
if filename in self.lines and line in self.lines[fn]:
addr = self.lines[fn][line]
if addr in self.funcs:
return self.funcs[addr]
else:
logger.error(f'Failed to find function symbol with address: 0x{addr:x}')
else:
logger.error(f'Failed to find function symbol with filename and line: {filename}:{line}')
def get_recompiled_address_from_name(self, name):
logger.debug('Looking for %s', name)
if name in self.names:
return self.names[name]
else:
logger.error(f'Failed to find function symbol with name: {name}')
wine_path_converter = None
if os.name != 'nt':
wine_path_converter = WinePathConverter(source)
origfile = Bin(original)
recompfile = Bin(recomp)
syminfo = SymInfo(syms, recompfile, wine_path_converter)
print()
md = Cs(CS_ARCH_X86, CS_MODE_32)
class OffsetPlaceholderGenerator:
def __init__(self):
self.counter = 0
self.replacements = {}
def get(self, addr):
if addr in self.replacements:
return self.replacements[addr]
else:
self.counter += 1
replacement = f'<OFFSET{self.counter}>'
self.replacements[addr] = replacement
return replacement
def sanitize(file, placeholderGenerator, mnemonic, op_str):
def sanitize(file, placeholder_generator, mnemonic, op_str):
op_str_is_number = False
try:
int(op_str, 16)
@ -287,76 +54,61 @@ def sanitize(file, placeholderGenerator, mnemonic, op_str):
except ValueError:
pass
if (mnemonic == 'call' or mnemonic == 'jmp') and op_str_is_number:
if (mnemonic in ["call", "jmp"]) and op_str_is_number:
# Filter out "calls" because the offsets we're not currently trying to
# match offsets. As long as there's a call in the right place, it's
# probably accurate.
op_str = placeholderGenerator.get(int(op_str, 16))
op_str = placeholder_generator.get(int(op_str, 16))
else:
def filter_out_ptr(ptype, op_str):
try:
ptrstr = ptype + ' ptr ['
ptrstr = ptype + " ptr ["
start = op_str.index(ptrstr) + len(ptrstr)
end = op_str.index(']', start)
end = op_str.index("]", start)
# This will throw ValueError if not hex
inttest = int(op_str[start:end], 16)
return op_str[0:start] + placeholderGenerator.get(inttest) + op_str[end:]
return (
op_str[0:start] + placeholder_generator.get(inttest) + op_str[end:]
)
except ValueError:
return op_str
# Filter out dword ptrs where the pointer is to an offset
op_str = filter_out_ptr('dword', op_str)
op_str = filter_out_ptr('word', op_str)
op_str = filter_out_ptr('byte', op_str)
op_str = filter_out_ptr("dword", op_str)
op_str = filter_out_ptr("word", op_str)
op_str = filter_out_ptr("byte", op_str)
# Use heuristics to filter out any args that look like offsets
words = op_str.split(' ')
words = op_str.split(" ")
for i, word in enumerate(words):
try:
inttest = int(word, 16)
if inttest >= file.imagebase + file.textvirt:
words[i] = placeholderGenerator.get(inttest)
words[i] = placeholder_generator.get(inttest)
except ValueError:
pass
op_str = ' '.join(words)
op_str = " ".join(words)
return mnemonic, op_str
def parse_asm(file, addr, size):
def parse_asm(disassembler, file, asm_addr, size):
asm = []
data = file.read(addr, size)
placeholderGenerator = OffsetPlaceholderGenerator()
for i in md.disasm(data, 0):
data = file.read(asm_addr, size)
placeholder_generator = OffsetPlaceholderGenerator()
for i in disassembler.disasm(data, 0):
# Use heuristics to disregard some differences that aren't representative
# of the accuracy of a function (e.g. global offsets)
mnemonic, op_str = sanitize(file, placeholderGenerator, i.mnemonic, i.op_str)
mnemonic, op_str = sanitize(file, placeholder_generator, i.mnemonic, i.op_str)
if op_str is None:
asm.append(mnemonic)
else:
asm.append(f'{mnemonic} {op_str}')
asm.append(f"{mnemonic} {op_str}")
return asm
REGISTER_LIST = set([
'ax',
'bp',
'bx',
'cx',
'di',
'dx',
'eax',
'ebp',
'ebx',
'ecx',
'edi',
'edx',
'esi',
'esp',
'si',
'sp',
])
WORDS = re.compile(r'\w+')
def get_registers(line: str):
to_replace = []
@ -367,8 +119,15 @@ def get_registers(line: str):
to_replace.append((reg, match.start()))
return to_replace
def replace_register(lines: list[str], start_line: int, reg: str, replacement: str) -> list[str]:
return [line.replace(reg, replacement) if i >= start_line else line for i, line in enumerate(lines)]
def replace_register(
lines: list[str], start_line: int, reg: str, replacement: str
) -> list[str]:
return [
line.replace(reg, replacement) if i >= start_line else line
for i, line in enumerate(lines)
]
# Is it possible to make new_asm the same as original_asm by swapping registers?
def can_resolve_register_differences(original_asm, new_asm):
@ -382,20 +141,19 @@ def can_resolve_register_differences(original_asm, new_asm):
return False
# Look for the mismatching lines
for i in range(len(original_asm)):
for i, original_line in enumerate(original_asm):
new_line = new_asm[i]
original_line = original_asm[i]
if new_line != original_line:
# Find all the registers to replace
to_replace = get_registers(original_line)
for j in range(len(to_replace)):
(reg, reg_index) = to_replace[j]
for replace in to_replace:
(reg, reg_index) = replace
replacing_reg = new_line[reg_index : reg_index + len(reg)]
if replacing_reg in REGISTER_LIST:
if replacing_reg != reg:
# Do a three-way swap replacing in all the subsequent lines
temp_reg = '&' * len(reg)
temp_reg = "&" * len(reg)
new_asm = replace_register(new_asm, i, replacing_reg, temp_reg)
new_asm = replace_register(new_asm, i, reg, replacing_reg)
new_asm = replace_register(new_asm, i, temp_reg, reg)
@ -403,11 +161,150 @@ def can_resolve_register_differences(original_asm, new_asm):
# No replacement to do, different code, bail out
return False
# Check if the lines are now the same
for i in range(len(original_asm)):
if new_asm[i] != original_asm[i]:
for i, original_line in enumerate(original_asm):
if new_asm[i] != original_line:
return False
return True
def gen_html(html_file, data):
output_data = Renderer().render_path(
get_file_in_script_dir("template.html"), {"data": data}
)
with open(html_file, "w", encoding="utf-8") as htmlfile:
htmlfile.write(output_data)
def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_accuracy):
icon_data = None
if icon:
with open(icon, "rb") as iconfile:
icon_data = base64.b64encode(iconfile.read()).decode("utf-8")
total_statistic = raw_accuracy / total_funcs
full_percentbar_width = 127.18422
output_data = Renderer().render_path(
get_file_in_script_dir("template.svg"),
{
"name": name_svg,
"icon": icon_data,
"implemented": f"{(svg_implemented_funcs / total_funcs * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})",
"accuracy": f"{(raw_accuracy / svg_implemented_funcs * 100):.2f}%",
"progbar": total_statistic * full_percentbar_width,
"percent": f"{(total_statistic * 100):.2f}%",
},
)
with open(svg_file, "w", encoding="utf-8") as svgfile:
svgfile.write(output_data)
# Do the actual work
if __name__ == "__main__":
parser = argparse.ArgumentParser(
allow_abbrev=False,
description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.",
)
parser.add_argument(
"original", metavar="original-binary", help="The original binary"
)
parser.add_argument(
"recompiled", metavar="recompiled-binary", help="The recompiled binary"
)
parser.add_argument(
"pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
)
parser.add_argument(
"decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
)
parser.add_argument(
"--total",
"-T",
metavar="<count>",
help="Total number of expected functions (improves total accuracy statistic)",
)
parser.add_argument(
"--verbose",
"-v",
metavar="<offset>",
help="Print assembly diff for specific function (original file's offset)",
)
parser.add_argument(
"--html",
"-H",
metavar="<file>",
help="Generate searchable HTML summary of status and diffs",
)
parser.add_argument(
"--no-color", "-n", action="store_true", help="Do not color the output"
)
parser.add_argument(
"--svg", "-S", metavar="<file>", help="Generate SVG graphic of progress"
)
parser.add_argument("--svg-icon", metavar="icon", help="Icon to use in SVG (PNG)")
parser.add_argument(
"--print-rec-addr",
action="store_true",
help="Print addresses of recompiled functions too",
)
parser.set_defaults(loglevel=logging.INFO)
parser.add_argument(
"--debug",
action="store_const",
const=logging.DEBUG,
dest="loglevel",
help="Print script debug information",
)
args = parser.parse_args()
logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
colorama.init()
verbose = None
found_verbose_target = False
if args.verbose:
try:
verbose = int(args.verbose, 16)
except ValueError:
parser.error("invalid verbose argument")
html_path = args.html
plain = args.no_color
original = args.original
if not os.path.isfile(original):
parser.error(f"Original binary {original} does not exist")
recomp = args.recompiled
if not os.path.isfile(recomp):
parser.error(f"Recompiled binary {recomp} does not exist")
syms = args.pdb
if not os.path.isfile(syms):
parser.error(f"Symbols PDB {syms} does not exist")
source = args.decomp_dir
if not os.path.isdir(source):
parser.error(f"Source directory {source} does not exist")
svg = args.svg
wine_path_converter = None
if os.name != "nt":
wine_path_converter = WinePathConverter(source)
with Bin(original, logger) as origfile, Bin(recomp, logger) as recompfile:
syminfo = SymInfo(
syms, recompfile, logger, sym_wine_path_converter=wine_path_converter
)
print()
capstone_disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
function_count = 0
total_accuracy = 0
total_effective_accuracy = 0
@ -417,7 +314,7 @@ def can_resolve_register_differences(original_asm, new_asm):
basename = os.path.basename(os.path.splitext(original)[0])
for srcfilename in walk_source_dir(source):
with open(srcfilename, 'r') as srcfile:
with open(srcfilename, "r", encoding="utf-8") as srcfile:
blocks = find_code_blocks(srcfile)
for block in blocks:
@ -440,7 +337,9 @@ def can_resolve_register_differences(original_asm, new_asm):
if not recinfo:
continue
else:
recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line)
recinfo = syminfo.get_recompiled_address(
srcfilename, block.start_line
)
if not recinfo:
continue
@ -449,8 +348,18 @@ def can_resolve_register_differences(original_asm, new_asm):
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
origasm = parse_asm(
capstone_disassembler,
origfile,
addr + recinfo.start,
recinfo.size,
)
recompasm = parse_asm(
capstone_disassembler,
recompfile,
recinfo.addr + recinfo.start,
recinfo.size,
)
diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
@ -464,28 +373,40 @@ def can_resolve_register_differences(original_asm, new_asm):
else:
ratio = 0
percenttext = f'{(effective_ratio * 100):.2f}%'
percenttext = f"{(effective_ratio * 100):.2f}%"
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
percenttext = (
colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
)
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
percenttext = (
colorama.Fore.YELLOW
+ percenttext
+ colorama.Style.RESET_ALL
)
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
percenttext = (
colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
)
if effective_ratio == 1.0 and ratio != 1.0:
if plain:
percenttext += '*'
percenttext += "*"
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL
percenttext += (
colorama.Fore.RED + "*" + colorama.Style.RESET_ALL
)
if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
addrs = f"0x{addr:x} / 0x{recinfo.addr:x}"
else:
addrs = hex(addr)
if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')
print(
f" {recinfo.name} ({addrs}) is {percenttext} similar to the original"
)
function_count += 1
total_accuracy += ratio
@ -497,80 +418,47 @@ def can_resolve_register_differences(original_asm, new_asm):
# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
ok_text = (
"OK!"
if plain
else (
colorama.Fore.GREEN
+ "✨ OK! ✨"
+ colorama.Style.RESET_ALL
)
)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
print(
f"{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n"
)
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
print(
f"{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n"
)
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
print(line)
else:
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')
print_diff(udiff, plain)
print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')
print(
f"\n{recinfo.name} is only {percenttext} similar to the original, diff above"
)
# If html, record the diffs to an HTML file
if html_path:
htmlinsert.append({"address": f"0x{addr:x}",
htmlinsert.append(
{
"address": f"0x{addr:x}",
"name": recinfo.name,
"matching": effective_ratio,
"diff": '\n'.join(udiff)})
def gen_html(html_file, data):
output_data = Renderer().render_path(get_file_in_script_dir('template.html'),
{
"data": data,
"diff": "\n".join(udiff),
}
)
with open(html_file, 'w') as htmlfile:
htmlfile.write(output_data)
def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_accuracy):
icon_data = None
if icon:
with open(icon, 'rb') as iconfile:
icon_data = base64.b64encode(iconfile.read()).decode('utf-8')
total_statistic = raw_accuracy / total_funcs
full_percentbar_width = 127.18422
output_data = Renderer().render_path(get_file_in_script_dir('template.svg'),
{
"name": name_svg,
"icon": icon_data,
"implemented": f'{(svg_implemented_funcs / total_funcs * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})',
"accuracy": f'{(raw_accuracy / svg_implemented_funcs * 100):.2f}%',
"progbar": total_statistic * full_percentbar_width,
"percent": f'{(total_statistic * 100):.2f}%',
}
)
with open(svg_file, 'w') as svgfile:
svgfile.write(output_data)
if html_path:
gen_html(html_path, json.dumps(htmlinsert))
if verbose:
if not found_verbose_target:
print(f'Failed to find the function with address 0x{verbose:x}')
print(f"Failed to find the function with address 0x{verbose:x}")
else:
implemented_funcs = function_count
@ -580,7 +468,16 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
if function_count > 0:
effective_accuracy = total_effective_accuracy / function_count * 100
actual_accuracy = total_accuracy / function_count * 100
print(f'\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)')
print(
f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)"
)
if svg:
gen_svg(svg, os.path.basename(original), args.svg_icon, implemented_funcs, function_count, total_effective_accuracy)
gen_svg(
svg,
os.path.basename(original),
args.svg_icon,
implemented_funcs,
function_count,
total_effective_accuracy,
)

View file

@ -1,3 +1,4 @@
tools/isledecomp
capstone
colorama
isledecomp

View file

@ -1,76 +1,68 @@
#!/usr/bin/env python3
import argparse
import colorama
import difflib
import subprocess
import os
import sys
parser = argparse.ArgumentParser(allow_abbrev=False,
description='Verify Exports: Compare the exports of two DLLs.')
parser.add_argument('original', metavar='original-binary', help='The original binary')
parser.add_argument('recompiled', metavar='recompiled-binary', help='The recompiled binary')
parser.add_argument('--no-color', '-n', action='store_true', help='Do not color the output')
from isledecomp.utils import print_diff
parser = argparse.ArgumentParser(
allow_abbrev=False, description="Verify Exports: Compare the exports of two DLLs."
)
parser.add_argument("original", metavar="original-binary", help="The original binary")
parser.add_argument(
"recompiled", metavar="recompiled-binary", help="The recompiled binary"
)
parser.add_argument(
"--no-color", "-n", action="store_true", help="Do not color the output"
)
args = parser.parse_args()
if not os.path.isfile(args.original):
parser.error(f'Original binary file {args.original} does not exist')
parser.error(f"Original binary file {args.original} does not exist")
if not os.path.isfile(args.recompiled):
parser.error(f'Recompiled binary {args.recompiled} does not exist')
parser.error(f"Recompiled binary {args.recompiled} does not exist")
def get_file_in_script_dir(fn):
return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn)
def get_exports(file):
call = [get_file_in_script_dir('DUMPBIN.EXE'), '/EXPORTS']
if os.name != 'nt':
call.insert(0, 'wine')
file = subprocess.check_output(['winepath', '-w', file]).decode('utf-8').strip()
def get_exports(file):
call = [get_file_in_script_dir("DUMPBIN.EXE"), "/EXPORTS"]
if os.name != "nt":
call.insert(0, "wine")
file = subprocess.check_output(["winepath", "-w", file]).decode("utf-8").strip()
call.append(file)
raw = subprocess.check_output(call).decode('utf-8').split('\r\n')
raw = subprocess.check_output(call).decode("utf-8").split("\r\n")
exports = []
start = False
for line in raw:
if not start:
if line == ' ordinal hint name':
if line == " ordinal hint name":
start = True
else:
if line:
exports.append(line[27:line.rindex(' (')])
exports.append(line[27 : line.rindex(" (")])
elif exports:
break
return exports
og_exp = get_exports(args.original)
re_exp = get_exports(args.recompiled)
udiff = difflib.unified_diff(og_exp, re_exp)
has_diff = False
for line in udiff:
has_diff = True
color = ''
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
continue
# Work out color if we are printing color
if not args.no_color:
if line.startswith('+'):
color = colorama.Fore.GREEN
elif line.startswith('-'):
color = colorama.Fore.RED
print(color + line)
# Reset color if we're printing in color
if not args.no_color:
print(colorama.Style.RESET_ALL, end='')
has_diff = print_diff(udiff, args.no_color)
sys.exit(1 if has_diff else 0)