From b14116cc93a7a8eafc0df0538b7a50329a3db97d Mon Sep 17 00:00:00 2001 From: Thomas Phillips Date: Sun, 26 Nov 2023 07:27:42 +1300 Subject: [PATCH] Python Linting and Code Formatting (#298) * Create common print_diff function * Add pylint and black * Fix linting, move classes to utils * Add black/pylint to github actions * Fix linting * Move Bin and SymInfo into their own files * Split out format * Tidy up workdlows and pip, add readme * Lint tests, add tests to readme --- .github/workflows/build.yml | 19 +- .github/workflows/format.yml | 19 +- .github/workflows/order.yml | 5 +- .gitignore | 4 +- .pylintrc | 635 ++++++++++++ pyproject.toml | 2 + tools/README.md | 50 + tools/checkorder/checkorder.py | 104 +- tools/checkorder/requirements.txt | 1 - tools/isledecomp/isledecomp/__init__.py | 5 + tools/isledecomp/isledecomp/bin.py | 47 + tools/isledecomp/isledecomp/dir.py | 50 +- tools/isledecomp/isledecomp/parser/parser.py | 45 +- tools/isledecomp/isledecomp/parser/util.py | 69 +- tools/isledecomp/isledecomp/syminfo.py | 138 +++ tools/isledecomp/isledecomp/utils.py | 42 + tools/isledecomp/setup.py | 8 +- tools/isledecomp/tests/test_parser.py | 41 +- tools/isledecomp/tests/test_parser_util.py | 104 +- tools/reccmp/reccmp.py | 991 +++++++++---------- tools/{reccmp => }/requirements.txt | 3 +- tools/verexp/verexp.py | 82 +- 22 files changed, 1675 insertions(+), 789 deletions(-) create mode 100644 .pylintrc create mode 100644 pyproject.toml create mode 100644 tools/README.md delete mode 100644 tools/checkorder/requirements.txt create mode 100644 tools/isledecomp/isledecomp/bin.py create mode 100644 tools/isledecomp/isledecomp/syminfo.py create mode 100644 tools/isledecomp/isledecomp/utils.py rename tools/{reccmp => }/requirements.txt (52%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e14f01d8..8434d906 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -38,7 +38,7 @@ jobs: build/ISLE.PDB build/LEGO1.DLL build/LEGO1.PDB - + compare: needs: build runs-on: windows-latest @@ -70,15 +70,14 @@ jobs: path: legobin key: legobin - - name: Build isledecomp library + - name: Install python packages shell: bash run: | - pip install tools/isledecomp + pip install -r tools/requirements.txt - name: Summarize Accuracy shell: bash run: | - pip install -r tools/reccmp/requirements.txt python3 tools/reccmp/reccmp.py -S ISLEPROGRESS.SVG --svg-icon tools/reccmp/isle.png -H ISLEPROGRESS.HTML legobin/ISLE.EXE build/ISLE.EXE build/ISLE.PDB . | tee ISLEPROGRESS.TXT python3 tools/reccmp/reccmp.py -S LEGO1PROGRESS.SVG -T 1929 --svg-icon tools/reccmp/lego1.png -H LEGO1PROGRESS.HTML legobin/LEGO1.DLL build/LEGO1.DLL build/LEGO1.PDB . | tee LEGO1PROGRESS.TXT @@ -96,7 +95,7 @@ jobs: shell: bash run: | python3 tools/verexp/verexp.py legobin/LEGO1.DLL build/LEGO1.DLL - + - name: Upload Artifact uses: actions/upload-artifact@master with: @@ -104,7 +103,7 @@ jobs: path: | ISLEPROGRESS.* LEGO1PROGRESS.* - + upload: needs: [build, compare] runs-on: ubuntu-latest @@ -113,16 +112,16 @@ jobs: - uses: actions/checkout@v3 with: repository: 'probonopd/uploadtool' - + - uses: actions/download-artifact@master with: name: Win32 path: build - + - uses: actions/download-artifact@master with: name: Accuracy Report - + - name: Upload Continuous Release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -133,6 +132,6 @@ jobs: build/LEGO1.DLL \ ISLEPROGRESS.* \ LEGO1PROGRESS.* - + curl -X POST -F key=$UPLOAD_KEY -F 'file=@ISLEPROGRESS.SVG' https://legoisland.org/progress/ curl -X POST -F key=$UPLOAD_KEY -F 'file=@LEGO1PROGRESS.SVG' https://legoisland.org/progress/ diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index ff151342..16ad85ae 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -19,4 +19,21 @@ jobs: LEGO1/*.cpp LEGO1/*.h \ LEGO1/realtime/*.cpp LEGO1/realtime/*.h \ LEGO1/tgl/*.h \ - LEGO1/viewmanager/*.cpp LEGO1/viewmanager/*.h \ No newline at end of file + LEGO1/viewmanager/*.cpp LEGO1/viewmanager/*.h + + python-format: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Install python libraries + shell: bash + run: | + pip install black pylint pytest -r tools/requirements.txt + + - name: Run pylint and black + shell: bash + run: | + pylint tools --ignore=build + black --check tools diff --git a/.github/workflows/order.yml b/.github/workflows/order.yml index 6a0ee029..d7ae315c 100644 --- a/.github/workflows/order.yml +++ b/.github/workflows/order.yml @@ -9,12 +9,11 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Build isledecomp library + - name: Install python libraries run: | - pip install tools/isledecomp + pip install -r tools/requirements.txt - name: Run checkorder.py run: | - pip install -r tools/checkorder/requirements.txt python3 tools/checkorder/checkorder.py --verbose --enforce ISLE python3 tools/checkorder/checkorder.py --verbose --enforce LEGO1 diff --git a/.gitignore b/.gitignore index 4e1970c2..72d6fef6 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,6 @@ ISLE.EXE LEGO1.DLL build/ *.swp -LEGO1PROGRESS.HTML -LEGO1PROGRESS.SVG +LEGO1PROGRESS.* +ISLEPROGRESS.* *.pyc diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 00000000..7afd8185 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,635 @@ +[MAIN] + +# Analyse import fallback blocks. This can be used to support both Python 2 and +# 3 compatible code, which means that the block might have code that exists +# only in one or another interpreter, leading to false positives when analysed. +analyse-fallback-blocks=no + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint +# in a server-like mode. +clear-cache-post-run=no + +# Load and enable all available extensions. Use --list-extensions to see a list +# all available extensions. +#enable-all-extensions= + +# In error mode, messages with a category besides ERROR or FATAL are +# suppressed, and no reports are done by default. Error mode is compatible with +# disabling specific errors. +#errors-only= + +# Always return a 0 (non-error) status code, even if lint errors are found. +# This is primarily useful in continuous integration scripts. +#exit-zero= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +extension-pkg-allow-list= + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +extension-pkg-whitelist= + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +fail-on= + +# Specify a score threshold under which the program will exit with error. +fail-under=10 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +#from-stdin= + +# Files or directories to be skipped. They should be base names, not paths. +ignore=CVS + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, +# it can't be used as an escape character. +ignore-paths= + +# Files or directories matching the regular expression patterns are skipped. +# The regex matches against base names, not paths. The default value ignores +# Emacs file locks +ignore-patterns=^\.# + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis). It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules= + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +#init-hook= + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs=1 + +# Control the amount of potential inferred values when inferring a single +# object. This can help the performance when dealing with large functions or +# complex, nested conditions. +limit-inference-results=100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +load-plugins= + +# Pickle collected data for later comparisons. +persistent=yes + +# Minimum Python version to use for version dependent checks. Will default to +# the version used to run pylint. +py-version=3.11 + +# Discover python modules and packages in the file system subtree. +recursive=no + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +source-roots= + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode=yes + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +unsafe-load-any-extension=no + +# In verbose mode, extra non-checker-related info will be displayed. +#verbose= + + +[BASIC] + +# Naming style matching correct argument names. +argument-naming-style=snake_case + +# Regular expression matching correct argument names. Overrides argument- +# naming-style. If left empty, argument names will be checked with the set +# naming style. +#argument-rgx= + +# Naming style matching correct attribute names. +attr-naming-style=snake_case + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +#attr-rgx= + +# Bad variable names which should always be refused, separated by a comma. +bad-names=foo, + bar, + baz, + toto, + tutu, + tata + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +bad-names-rgxs= + +# Naming style matching correct class attribute names. +class-attribute-naming-style=any + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +#class-attribute-rgx= + +# Naming style matching correct class constant names. +class-const-naming-style=UPPER_CASE + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +#class-const-rgx= + +# Naming style matching correct class names. +class-naming-style=PascalCase + +# Regular expression matching correct class names. Overrides class-naming- +# style. If left empty, class names will be checked with the set naming style. +#class-rgx= + +# Naming style matching correct constant names. +const-naming-style=snake_case + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming +# style. +#const-rgx= + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=-1 + +# Naming style matching correct function names. +function-naming-style=snake_case + +# Regular expression matching correct function names. Overrides function- +# naming-style. If left empty, function names will be checked with the set +# naming style. +#function-rgx= + +# Good variable names which should always be accepted, separated by a comma. +good-names=i, + j, + k, + ex, + Run, + _ + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +good-names-rgxs= + +# Include a hint for the correct naming format with invalid-name. +include-naming-hint=no + +# Naming style matching correct inline iteration names. +inlinevar-naming-style=any + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +#inlinevar-rgx= + +# Naming style matching correct method names. +method-naming-style=snake_case + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +#method-rgx= + +# Naming style matching correct module names. +module-naming-style=snake_case + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +#module-rgx= + +# Colon-delimited sets of names that determine each other's naming style when +# the name regexes allow several styles. +name-group= + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=^_ + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. +# These decorators are taken in consideration only for invalid-name. +property-classes=abc.abstractproperty + +# Regular expression matching correct type alias names. If left empty, type +# alias names will be checked with the set naming style. +#typealias-rgx= + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +#typevar-rgx= + +# Naming style matching correct variable names. +variable-naming-style=snake_case + +# Regular expression matching correct variable names. Overrides variable- +# naming-style. If left empty, variable names will be checked with the set +# naming style. +#variable-rgx= + + +[CLASSES] + +# Warn about protected attribute access inside special methods +check-protected-access-in-special-methods=no + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__, + __new__, + setUp, + asyncSetUp, + __post_init__ + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +exclude-too-few-public-methods= + +# List of qualified class names to ignore when counting class parents (see +# R0901) +ignored-parents= + +# Maximum number of arguments for function / method. +max-args=6 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr=5 + +# Maximum number of branch for function / method body. +max-branches=30 + +# Maximum number of locals for function / method body. +max-locals=30 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body. +max-returns=6 + +# Maximum number of statements in function / method body. +max-statements=75 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=0 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when caught. +overgeneral-exceptions=builtins.BaseException,builtins.Exception + + +[FORMAT] + +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +expected-line-ending-format= + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines=^\s*(# )??$ + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=2 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=200 + +# Maximum number of lines in a module. +max-module-lines=1000 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=no + + +[IMPORTS] + +# List of modules that can be imported at any level, not just the top level +# one. +allow-any-import-level= + +# Allow explicit reexports by alias from a package __init__. +allow-reexport-from-package=no + +# Allow wildcard imports from modules that define __all__. +allow-wildcard-with-all=no + +# Deprecated modules which should not be used, separated by a comma. +deprecated-modules= + +# Output a graph (.gv or any supported image format) of external dependencies +# to the given file (report RP0402 must not be disabled). +ext-import-graph= + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be +# disabled). +import-graph= + +# Output a graph (.gv or any supported image format) of internal dependencies +# to the given file (report RP0402 must not be disabled). +int-import-graph= + +# Force import order to recognize a module as part of the standard +# compatibility libraries. +known-standard-library= + +# Force import order to recognize a module as part of a third party library. +known-third-party=enchant + +# Couples of modules and preferred modules, separated by a comma. +preferred-modules= + + +[LOGGING] + +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style=old + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules=logging + + +[MESSAGES CONTROL] + +# Only show warnings with the listed confidence levels. Leave empty to show +# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, +# UNDEFINED. +confidence=HIGH, + CONTROL_FLOW, + INFERENCE, + INFERENCE_FAILURE, + UNDEFINED + +# Disable the message, report, category or checker with the given id(s). You +# can either give multiple identifiers separated by comma (,) or put this +# option multiple times (only on the command line, not in the configuration +# file where it should appear only once). You can also use "--disable=all" to +# disable everything first and then re-enable specific checks. For example, if +# you want to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable=raw-checker-failed, + bad-inline-option, + locally-disabled, + file-ignored, + suppressed-message, + useless-suppression, + deprecated-pragma, + use-symbolic-message-instead, + missing-class-docstring, + missing-function-docstring, + missing-module-docstring, + fixme + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where +# it should appear only once). See also the "--disable" option for examples. +enable=c-extension-no-member + + +[METHOD_ARGS] + +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request + + +[MISCELLANEOUS] + +# List of note tags to take in consideration, separated by a comma. +notes=FIXME, + XXX, + TODO + +# Regular expression of note tags to take in consideration. +notes-rgx= + + +[REFACTORING] + +# Maximum number of nested blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns. When checking for +# inconsistent-return-statements if a never returning function is called then +# it will be considered as an explicit return statement and no message will be +# printed. +never-returning-functions=sys.exit,argparse.parse_error + + +[REPORTS] + +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each +# category, as well as 'statement' which is the total number of statements +# analyzed. This score is used by the global evaluation report (RP0004). +evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)) + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +msg-template= + +# Set the output format. Available formats are text, parseable, colorized, json +# and msvs (visual studio). You can also give a reporter class, e.g. +# mypackage.mymodule.MyReporterClass. +#output-format= + +# Tells whether to display a full report or only the messages. +reports=no + +# Activate the evaluation score. +score=yes + + +[SIMILARITIES] + +# Comments are removed from the similarity computation +ignore-comments=yes + +# Docstrings are removed from the similarity computation +ignore-docstrings=yes + +# Imports are removed from the similarity computation +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# Minimum lines number of a similarity. +min-similarity-lines=4 + + +[SPELLING] + +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions=4 + +# Spelling dictionary name. No available dictionaries : You need to install +# both the python package and the system dependency for enchant to work.. +spelling-dict= + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: + +# List of comma separated words that should not be checked. +spelling-ignore-words= + +# A path to a file that contains the private dictionary; one word per line. +spelling-private-dict-file= + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +spelling-store-unknown-words=no + + +[STRING] + +# This flag controls whether inconsistent-quotes generates a warning when the +# character used as a quote delimiter is used inconsistently within a module. +check-quote-consistency=no + +# This flag controls whether the implicit-str-concat should generate a warning +# on implicit string concatenation in sequences defined over several lines. +check-str-concat-over-line-jumps=no + + +[TYPECHECK] + +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators=contextlib.contextmanager + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +generated-members= + +# Tells whether to warn about missing members when the owner of the attribute +# is inferred to be None. +ignore-none=yes + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference +# can return multiple potential results while evaluating a Python object, but +# some branches might not be evaluated, which results in partial inference. In +# that case, it might be useful to still emit no-member and other checks for +# the rest of the inferred objects. +ignore-on-opaque-inference=yes + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins=no-member, + not-async-context-manager, + not-context-manager, + attribute-defined-outside-init + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace + +# Show a hint with possible names when a member name was not found. The aspect +# of finding the hint is based on edit distance. +missing-member-hint=yes + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance=1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices=1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx=.*[Mm]ixin + +# List of decorators that change the signature of a decorated function. +signature-mutators= + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid defining new builtins when possible. +additional-builtins= + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_, + _cb + +# A regular expression matching the name of dummy variables (i.e. expected to +# not be used). +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# Argument names that match this expression will be ignored. +ignored-argument-names=_.*|^ignored_|^unused_ + +# Tells whether we should check for unused import in __init__ files. +init-import=no + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..6deafc26 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 00000000..4c0cbae9 --- /dev/null +++ b/tools/README.md @@ -0,0 +1,50 @@ +# LEGO Island Decompilation Tools + +These are a set of Python tools for helping with the decomp project + +## Installing +Use pip to install the required packages: + +``` +pip install -r tools/requirements.txt +``` + +## reccmp +This is a script to compare the original EXE or DLL with a recpmpiled EXE or DLL, provided a .PDB file + +## verexp +This verifies exports by comparing the exports of an original DLL and the recompiled DLL + +## checkorder +This checks the order of C++ source and header files to make sure the functions are in order + +## isledecomp +This is a library that is used by rhe above scripts. it has a collection of useful classes and functions + +### Testing +`isledecomp` has a small suite of tests. Install pylint and run it, passing in the directory: + +``` +pip install pytest +pytest tools/isledecomp/tests/ +``` + +## Development +In order to keep the code clean and consistent, we use `pylint` and `black`: + +``` +pip install black pylint +``` +### To run pylint (ignores build and virtualenv): +``` +pylint tools/ --ignore=build,bin,lib +``` + +### To check code formatting without rewriting files: +``` +black --check tools/ +``` +### To apply code formatting: +``` +black tools/ +``` diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py index eb243c2d..1ac8391f 100644 --- a/tools/checkorder/checkorder.py +++ b/tools/checkorder/checkorder.py @@ -1,32 +1,29 @@ import os import sys import argparse -from isledecomp.dir import ( - walk_source_dir, - is_file_cpp -) +from isledecomp.dir import walk_source_dir, is_file_cpp from isledecomp.parser import find_code_blocks -from isledecomp.parser.util import ( - is_exact_offset_comment -) +from isledecomp.parser.util import is_exact_offset_comment def sig_truncate(sig: str) -> str: """Helper to truncate function names to 50 chars and append ellipsis - if needed. Goal is to stay under 80 columns for tool output.""" + if needed. Goal is to stay under 80 columns for tool output.""" return f"{sig[:47]}{'...' if len(sig) >= 50 else ''}" def check_file(filename: str, verbose: bool = False) -> bool: """Open and read the given file, then check whether the code blocks - are in order. If verbose, print each block.""" + are in order. If verbose, print each block.""" - with open(filename, 'r') as f: + with open(filename, "r", encoding="utf-8") as f: code_blocks = find_code_blocks(f) - bad_comments = [(block.start_line, block.offset_comment) - for block in code_blocks - if not is_exact_offset_comment(block.offset_comment)] + bad_comments = [ + (block.start_line, block.offset_comment) + for block in code_blocks + if not is_exact_offset_comment(block.offset_comment) + ] just_offsets = [block.offset for block in code_blocks] sorted_offsets = sorted(just_offsets) @@ -35,8 +32,7 @@ def check_file(filename: str, verbose: bool = False) -> bool: # If we detect inexact comments, don't print anything unless we are # in verbose mode. If the file is out of order, we always print the # file name. - should_report = ((len(bad_comments) > 0 and verbose) - or file_out_of_order) + should_report = (len(bad_comments) > 0 and verbose) or file_out_of_order if not should_report and not file_out_of_order: return False @@ -49,19 +45,21 @@ def check_file(filename: str, verbose: bool = False) -> bool: prev_offset = 0 for block in code_blocks: - msg = ' '.join([ - ' ' if block.offset > prev_offset else '!', - f'{block.offset:08x}', - f'{block.end_line - block.start_line:4} lines', - f'{order_lookup[block.offset]:3}', - ' ', - sig_truncate(block.signature), - ]) + msg = " ".join( + [ + " " if block.offset > prev_offset else "!", + f"{block.offset:08x}", + f"{block.end_line - block.start_line:4} lines", + f"{order_lookup[block.offset]:3}", + " ", + sig_truncate(block.signature), + ] + ) print(msg) prev_offset = block.offset - for (line_no, line) in bad_comments: - print(f'* line {line_no:3} bad offset comment ({line})') + for line_no, line in bad_comments: + print(f"* line {line_no:3} bad offset comment ({line})") print() @@ -69,15 +67,25 @@ def check_file(filename: str, verbose: bool = False) -> bool: def parse_args(test_args: list | None = None) -> dict: - p = argparse.ArgumentParser() - p.add_argument('target', help='The file or directory to check.') - p.add_argument('--enforce', action=argparse.BooleanOptionalAction, - default=False, - help='Fail with error code if target is out of order.') - p.add_argument('--verbose', action=argparse.BooleanOptionalAction, - default=False, - help=('Display each code block in the file and show ' - 'where each consecutive run of blocks is broken.')) + p = argparse.ArgumentParser( + description="Checks the source files to make sure the function offset comments are in order", + ) + p.add_argument("target", help="The file or directory to check.") + p.add_argument( + "--enforce", + action=argparse.BooleanOptionalAction, + default=False, + help="Fail with error code if target is out of order.", + ) + p.add_argument( + "--verbose", + action=argparse.BooleanOptionalAction, + default=False, + help=( + "Display each code block in the file and show " + "where each consecutive run of blocks is broken." + ), + ) if test_args is None: args = p.parse_args() @@ -90,31 +98,33 @@ def parse_args(test_args: list | None = None) -> dict: def main(): args = parse_args() - if os.path.isdir(args['target']): - files_to_check = list(walk_source_dir(args['target'])) - elif os.path.isfile(args['target']) and is_file_cpp(args['target']): - files_to_check = [args['target']] + if os.path.isdir(args["target"]): + files_to_check = list(walk_source_dir(args["target"])) + elif os.path.isfile(args["target"]) and is_file_cpp(args["target"]): + files_to_check = [args["target"]] else: - sys.exit('Invalid target') + sys.exit("Invalid target") files_out_of_order = 0 for file in files_to_check: - is_jumbled = check_file(file, args['verbose']) + is_jumbled = check_file(file, args["verbose"]) if is_jumbled: files_out_of_order += 1 if files_out_of_order > 0: - error_message = ' '.join([ - str(files_out_of_order), - 'files are' if files_out_of_order > 1 else 'file is', - 'out of order' - ]) + error_message = " ".join( + [ + str(files_out_of_order), + "files are" if files_out_of_order > 1 else "file is", + "out of order", + ] + ) print(error_message) - if files_out_of_order > 0 and args['enforce']: + if files_out_of_order > 0 and args["enforce"]: sys.exit(1) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/tools/checkorder/requirements.txt b/tools/checkorder/requirements.txt deleted file mode 100644 index 176352a6..00000000 --- a/tools/checkorder/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -isledecomp \ No newline at end of file diff --git a/tools/isledecomp/isledecomp/__init__.py b/tools/isledecomp/isledecomp/__init__.py index e69de29b..a95f1869 100644 --- a/tools/isledecomp/isledecomp/__init__.py +++ b/tools/isledecomp/isledecomp/__init__.py @@ -0,0 +1,5 @@ +from .bin import * +from .dir import * +from .parser import * +from .syminfo import * +from .utils import * diff --git a/tools/isledecomp/isledecomp/bin.py b/tools/isledecomp/isledecomp/bin.py new file mode 100644 index 00000000..38245ace --- /dev/null +++ b/tools/isledecomp/isledecomp/bin.py @@ -0,0 +1,47 @@ +import struct + + +# Declare a class that can automatically convert virtual executable addresses +# to file addresses +class Bin: + def __init__(self, filename, logger): + self.logger = logger + self.logger.debug('Parsing headers of "%s"... ', filename) + self.filename = filename + self.file = None + self.imagebase = None + self.textvirt = None + self.textraw = None + + def __enter__(self): + self.logger.debug(f"Bin {self.filename} Enter") + self.file = open(self.filename, "rb") + + # HACK: Strictly, we should be parsing the header, but we know where + # everything is in these two files so we just jump straight there + + # Read ImageBase + self.file.seek(0xB4) + (self.imagebase,) = struct.unpack(" str: + if unix_fn.startswith("./"): + return self.win_cwd + "\\" + unix_fn[2:].replace("/", "\\") + if unix_fn.startswith(self.unix_cwd): + return ( + self.win_cwd + + "\\" + + unix_fn.removeprefix(self.unix_cwd).replace("/", "\\").lstrip("\\") + ) + return self._call_winepath_unix2win(unix_fn) + + def get_unix_path(self, win_fn: str) -> str: + if win_fn.startswith(".\\") or win_fn.startswith("./"): + return self.unix_cwd + "/" + win_fn[2:].replace("\\", "/") + if win_fn.startswith(self.win_cwd): + return ( + self.unix_cwd + + "/" + + win_fn.removeprefix(self.win_cwd).replace("\\", "/") + ) + return self._call_winepath_win2unix(win_fn) + + @staticmethod + def _call_winepath_unix2win(fn: str) -> str: + return subprocess.check_output(["winepath", "-w", fn], text=True).strip() + + @staticmethod + def _call_winepath_win2unix(fn: str) -> str: + return subprocess.check_output(["winepath", fn], text=True).strip() + + def is_file_cpp(filename: str) -> bool: - (basefile, ext) = os.path.splitext(filename) - return ext.lower() in ('.h', '.cpp') + (_, ext) = os.path.splitext(filename) + return ext.lower() in (".h", ".cpp") def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]: """Generator to walk the given directory recursively and return - any C++ files found.""" + any C++ files found.""" source = os.path.abspath(source) - for subdir, dirs, files in os.walk(source): + for subdir, _, files in os.walk(source): for file in files: if is_file_cpp(file): yield os.path.join(subdir, file) if not recursive: break + + +def get_file_in_script_dir(fn): + return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py index 345c8a98..1039ac41 100644 --- a/tools/isledecomp/isledecomp/parser/parser.py +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -7,7 +7,6 @@ OffsetMatch, is_blank_or_comment, match_offset_comment, - is_exact_offset_comment, get_template_function_name, remove_trailing_comment, distinct_by_module, @@ -25,10 +24,10 @@ class ReaderState(Enum): def find_code_blocks(stream: TextIO) -> List[CodeBlock]: """Read the IO stream (file) line-by-line and give the following report: - Foreach code block (function) in the file, what are its starting and - ending line numbers, and what is the given offset in the original - binary. We expect the result to be ordered by line number because we - are reading the file from start to finish.""" + Foreach code block (function) in the file, what are its starting and + ending line numbers, and what is the given offset in the original + binary. We expect the result to be ordered by line number because we + are reading the file from start to finish.""" blocks: List[CodeBlock] = [] @@ -51,14 +50,16 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: # Our list of offset marks could have duplicates on # module name, so we'll eliminate those now. for offset_match in distinct_by_module(offset_matches): - block = CodeBlock(offset=offset_match.address, - signature=function_sig, - start_line=start_line, - end_line=end_line, - offset_comment=offset_match.comment, - module=offset_match.module, - is_template=offset_match.is_template, - is_stub=offset_match.is_stub) + block = CodeBlock( + offset=offset_match.address, + signature=function_sig, + start_line=start_line, + end_line=end_line, + offset_comment=offset_match.comment, + module=offset_match.module, + is_template=offset_match.is_template, + is_stub=offset_match.is_stub, + ) blocks.append(block) offset_matches = [] state = ReaderState.WANT_OFFSET @@ -66,15 +67,18 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: if can_seek: line_no += 1 line = stream.readline() - if line == '': + if line == "": break new_match = match_offset_comment(line) if new_match is not None: # We will allow multiple offsets if we have just begun # the code block, but not after we hit the curly brace. - if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE, - ReaderState.WANT_SIG): + if state in ( + ReaderState.WANT_OFFSET, + ReaderState.IN_TEMPLATE, + ReaderState.WANT_SIG, + ): # If we detected an offset marker unexpectedly, # we are handling it here so we can continue seeking. can_seek = True @@ -116,11 +120,10 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: # same line. clang-format should prevent this (BraceWrapping) # but it is easy to detect. # If the entire function is on one line, handle that too. - if function_sig.endswith('{'): + if function_sig.endswith("{"): start_line = line_no state = ReaderState.IN_FUNC - elif (function_sig.endswith('}') or - function_sig.endswith('};')): + elif function_sig.endswith("}") or function_sig.endswith("};"): start_line = line_no end_line = line_no state = ReaderState.FUNCTION_DONE @@ -128,14 +131,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]: state = ReaderState.WANT_CURLY elif state == ReaderState.WANT_CURLY: - if line.strip() == '{': + if line.strip() == "{": start_line = line_no state = ReaderState.IN_FUNC elif state == ReaderState.IN_FUNC: # Naive but reasonable assumption that functions will end with # a curly brace on its own line with no prepended spaces. - if line.startswith('}'): + if line.startswith("}"): end_line = line_no state = ReaderState.FUNCTION_DONE diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py index 848fd5ff..59fca75b 100644 --- a/tools/isledecomp/isledecomp/parser/util.py +++ b/tools/isledecomp/isledecomp/parser/util.py @@ -5,34 +5,49 @@ from collections import namedtuple -CodeBlock = namedtuple('CodeBlock', - ['offset', 'signature', 'start_line', 'end_line', - 'offset_comment', 'module', 'is_template', 'is_stub']) +CodeBlock = namedtuple( + "CodeBlock", + [ + "offset", + "signature", + "start_line", + "end_line", + "offset_comment", + "module", + "is_template", + "is_stub", + ], +) -OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template', - 'is_stub', 'comment']) +OffsetMatch = namedtuple( + "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"] +) # This has not been formally established, but considering that "STUB" # is a temporary state for a function, we assume it will appear last, # after any other modifiers (i.e. TEMPLATE) # To match a reasonable variance of formatting for the offset comment -offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?', # nopep8 - flags=re.I) +offsetCommentRegex = re.compile( + r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8 + flags=re.I, +) # To match the exact syntax (text upper case, hex lower case, with spaces) # that is used in most places -offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$') # nopep8 +offsetCommentExactRegex = re.compile( + r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$" +) # nopep8 # The goal here is to just read whatever is on the next line, so some # flexibility in the formatting seems OK -templateCommentRegex = re.compile(r'\s*//\s+(.*)') +templateCommentRegex = re.compile(r"\s*//\s+(.*)") # To remove any comment (//) or block comment (/*) and its leading spaces # from the end of a code line -trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$') +trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$") def get_template_function_name(line: str) -> str: @@ -47,23 +62,25 @@ def get_template_function_name(line: str) -> str: def remove_trailing_comment(line: str) -> str: - return trailingCommentRegex.sub('', line) + return trailingCommentRegex.sub("", line) def is_blank_or_comment(line: str) -> bool: """Helper to read ahead after the offset comment is matched. - There could be blank lines or other comments before the - function signature, and we want to skip those.""" + There could be blank lines or other comments before the + function signature, and we want to skip those.""" line_strip = line.strip() - return (len(line_strip) == 0 - or line_strip.startswith('//') - or line_strip.startswith('/*') - or line_strip.endswith('*/')) + return ( + len(line_strip) == 0 + or line_strip.startswith("//") + or line_strip.startswith("/*") + or line_strip.endswith("*/") + ) def is_exact_offset_comment(line: str) -> bool: """If the offset comment does not match our (unofficial) syntax - we may want to alert the user to fix it for style points.""" + we may want to alert the user to fix it for style points.""" return offsetCommentExactRegex.match(line) is not None @@ -72,17 +89,19 @@ def match_offset_comment(line: str) -> OffsetMatch | None: if match is None: return None - return OffsetMatch(module=match.group(1), - address=int(match.group(2), 16), - is_template=match.group(3) is not None, - is_stub=match.group(4) is not None, - comment=line.strip()) + return OffsetMatch( + module=match.group(1), + address=int(match.group(2), 16), + is_template=match.group(3) is not None, + is_stub=match.group(4) is not None, + comment=line.strip(), + ) def distinct_by_module(offsets: List) -> List: """Given a list of offset markers, return a list with distinct - module names. If module names (case-insensitive) are repeated, - choose the offset that appears first.""" + module names. If module names (case-insensitive) are repeated, + choose the offset that appears first.""" if len(offsets) < 2: return offsets diff --git a/tools/isledecomp/isledecomp/syminfo.py b/tools/isledecomp/isledecomp/syminfo.py new file mode 100644 index 00000000..a14bbbe2 --- /dev/null +++ b/tools/isledecomp/isledecomp/syminfo.py @@ -0,0 +1,138 @@ +import os +import subprocess +from .utils import get_file_in_script_dir + + +class RecompiledInfo: + addr = None + size = None + name = None + start = None + + +# Declare a class that parses the output of cvdump for fast access later +class SymInfo: + funcs = {} + lines = {} + names = {} + + def __init__(self, pdb, sym_recompfile, sym_logger, sym_wine_path_converter=None): + self.logger = sym_logger + call = [get_file_in_script_dir("cvdump.exe"), "-l", "-s"] + + if sym_wine_path_converter: + # Run cvdump through wine and convert path to Windows-friendly wine path + call.insert(0, "wine") + call.append(sym_wine_path_converter.get_wine_path(pdb)) + else: + call.append(pdb) + + self.logger.info("Parsing %s ...", pdb) + self.logger.debug("Command = %s", call) + line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n") + + current_section = None + + self.logger.debug("Parsing output of cvdump.exe ...") + + for i, line in enumerate(line_dump): + if line.startswith("***"): + current_section = line[4:] + + if current_section == "SYMBOLS" and "S_GPROC32" in line: + sym_addr = int(line[26:34], 16) + + info = RecompiledInfo() + info.addr = ( + sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt + ) + + use_dbg_offs = False + if use_dbg_offs: + debug_offs = line_dump[i + 2] + debug_start = int(debug_offs[22:30], 16) + debug_end = int(debug_offs[43:], 16) + + info.start = debug_start + info.size = debug_end - debug_start + else: + info.start = 0 + info.size = int(line[41:49], 16) + + info.name = line[77:] + + self.names[info.name] = info + self.funcs[sym_addr] = info + elif ( + current_section == "LINES" + and line.startswith(" ") + and not line.startswith(" ") + ): + sourcepath = line.split()[0] + + if sym_wine_path_converter: + # Convert filename to Unix path for file compare + sourcepath = sym_wine_path_converter.get_unix_path(sourcepath) + + if sourcepath not in self.lines: + self.lines[sourcepath] = {} + + j = i + 2 + while True: + ll = line_dump[j].split() + if len(ll) == 0: + break + + k = 0 + while k < len(ll): + linenum = int(ll[k + 0]) + address = int(ll[k + 1], 16) + if linenum not in self.lines[sourcepath]: + self.lines[sourcepath][linenum] = address + k += 2 + + j += 1 + + self.logger.debug("... Parsing output of cvdump.exe finished") + + def get_recompiled_address(self, filename, line): + recompiled_addr = None + + self.logger.debug("Looking for %s:%s", filename, line) + filename_basename = os.path.basename(filename).lower() + + for fn in self.lines: + # Sometimes a PDB is compiled with a relative path while we always have + # an absolute path. Therefore we must + try: + if os.path.basename( + fn + ).lower() == filename_basename and os.path.samefile(fn, filename): + filename = fn + break + except FileNotFoundError: + continue + + if filename in self.lines and line in self.lines[filename]: + recompiled_addr = self.lines[filename][line] + + if recompiled_addr in self.funcs: + return self.funcs[recompiled_addr] + self.logger.error( + "Failed to find function symbol with address: %x", recompiled_addr + ) + return None + self.logger.error( + "Failed to find function symbol with filename and line: %s:%s", + filename, + line, + ) + return None + + def get_recompiled_address_from_name(self, name): + self.logger.debug("Looking for %s", name) + + if name in self.names: + return self.names[name] + self.logger.error("Failed to find function symbol with name: %s", name) + return None diff --git a/tools/isledecomp/isledecomp/utils.py b/tools/isledecomp/isledecomp/utils.py new file mode 100644 index 00000000..ce4896fd --- /dev/null +++ b/tools/isledecomp/isledecomp/utils.py @@ -0,0 +1,42 @@ +import os +import sys +import colorama + + +def print_diff(udiff, plain): + has_diff = False + for line in udiff: + has_diff = True + color = "" + if line.startswith("++") or line.startswith("@@") or line.startswith("--"): + # Skip unneeded parts of the diff for the brief view + continue + # Work out color if we are printing color + if not plain: + if line.startswith("+"): + color = colorama.Fore.GREEN + elif line.startswith("-"): + color = colorama.Fore.RED + print(color + line) + # Reset color if we're printing in color + if not plain: + print(colorama.Style.RESET_ALL, end="") + return has_diff + + +def get_file_in_script_dir(fn): + return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) + + +class OffsetPlaceholderGenerator: + def __init__(self): + self.counter = 0 + self.replacements = {} + + def get(self, replace_addr): + if replace_addr in self.replacements: + return self.replacements[replace_addr] + self.counter += 1 + replacement = f"" + self.replacements[replace_addr] = replacement + return replacement diff --git a/tools/isledecomp/setup.py b/tools/isledecomp/setup.py index 897b802e..1e3c0863 100644 --- a/tools/isledecomp/setup.py +++ b/tools/isledecomp/setup.py @@ -1,9 +1,9 @@ from setuptools import setup, find_packages setup( - name='isledecomp', - version='0.1.0', - description='Python tools for the isledecomp project', + name="isledecomp", + version="0.1.0", + description="Python tools for the isledecomp project", packages=find_packages(), - tests_require=['pytest'], + tests_require=["pytest"], ) diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py index a55bf549..48bb0e44 100644 --- a/tools/isledecomp/tests/test_parser.py +++ b/tools/isledecomp/tests/test_parser.py @@ -1,17 +1,16 @@ import os -import pytest from typing import List, TextIO from isledecomp.parser import find_code_blocks from isledecomp.parser.util import CodeBlock -SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples') +SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples") def sample_file(filename: str) -> TextIO: """Wrapper for opening the samples from the directory that does not - depend on the cwd where we run the test""" + depend on the cwd where we run the test""" full_path = os.path.join(SAMPLE_DIR, filename) - return open(full_path, 'r') + return open(full_path, "r", encoding="utf-8") def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool: @@ -25,7 +24,7 @@ def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool: def test_sanity(): """Read a very basic file""" - with sample_file('basic_file.cpp') as f: + with sample_file("basic_file.cpp") as f: blocks = find_code_blocks(f) assert len(blocks) == 3 @@ -39,7 +38,7 @@ def test_sanity(): def test_oneline(): """(Assuming clang-format permits this) This sample has a function on a single line. This will test the end-of-function detection""" - with sample_file('oneline_function.cpp') as f: + with sample_file("oneline_function.cpp") as f: blocks = find_code_blocks(f) assert len(blocks) == 2 @@ -49,7 +48,7 @@ def test_oneline(): def test_missing_offset(): """What if the function doesn't have an offset comment?""" - with sample_file('missing_offset.cpp') as f: + with sample_file("missing_offset.cpp") as f: blocks = find_code_blocks(f) # TODO: For now, the function without the offset will just be ignored. @@ -60,9 +59,9 @@ def test_missing_offset(): def test_jumbled_case(): """The parser just reports what it sees. It is the responsibility of - the downstream tools to do something about a jumbled file. - Just verify that we are reading it correctly.""" - with sample_file('out_of_order.cpp') as f: + the downstream tools to do something about a jumbled file. + Just verify that we are reading it correctly.""" + with sample_file("out_of_order.cpp") as f: blocks = find_code_blocks(f) assert len(blocks) == 3 @@ -70,7 +69,7 @@ def test_jumbled_case(): def test_bad_file(): - with sample_file('poorly_formatted.cpp') as f: + with sample_file("poorly_formatted.cpp") as f: blocks = find_code_blocks(f) assert len(blocks) == 3 @@ -78,7 +77,7 @@ def test_bad_file(): def test_indented(): """Offsets for functions inside of a class will probably be indented.""" - with sample_file('basic_class.cpp') as f: + with sample_file("basic_class.cpp") as f: blocks = find_code_blocks(f) # TODO: We don't properly detect the end of these functions @@ -87,17 +86,17 @@ def test_indented(): # all the functions that are there. assert len(blocks) == 2 - assert blocks[0].offset == int('0x12345678', 16) + assert blocks[0].offset == int("0x12345678", 16) assert blocks[0].start_line == 15 # assert blocks[0].end_line == 18 - assert blocks[1].offset == int('0xdeadbeef', 16) + assert blocks[1].offset == int("0xdeadbeef", 16) assert blocks[1].start_line == 22 # assert blocks[1].end_line == 24 def test_inline(): - with sample_file('inline.cpp') as f: + with sample_file("inline.cpp") as f: blocks = find_code_blocks(f) assert len(blocks) == 2 @@ -108,21 +107,21 @@ def test_inline(): def test_multiple_offsets(): """If multiple offset marks appear before for a code block, take them - all but ensure module name (case-insensitive) is distinct. - Use first module occurrence in case of duplicates.""" - with sample_file('multiple_offsets.cpp') as f: + all but ensure module name (case-insensitive) is distinct. + Use first module occurrence in case of duplicates.""" + with sample_file("multiple_offsets.cpp") as f: blocks = find_code_blocks(f) assert len(blocks) == 4 - assert blocks[0].module == 'TEST' + assert blocks[0].module == "TEST" assert blocks[0].start_line == 9 - assert blocks[1].module == 'HELLO' + assert blocks[1].module == "HELLO" assert blocks[1].start_line == 9 # Duplicate modules are ignored assert blocks[2].start_line == 16 assert blocks[2].offset == 0x2345 - assert blocks[3].module == 'TEST' + assert blocks[3].module == "TEST" assert blocks[3].offset == 0x2002 diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py index 3fc9bb61..91fd285b 100644 --- a/tools/isledecomp/tests/test_parser_util.py +++ b/tools/isledecomp/tests/test_parser_util.py @@ -1,6 +1,6 @@ -import pytest from collections import namedtuple from typing import List +import pytest from isledecomp.parser.util import ( is_blank_or_comment, match_offset_comment, @@ -10,21 +10,20 @@ blank_or_comment_param = [ - (True, ''), - (True, '\t'), - (True, ' '), - (False, '\tint abc=123;'), - (True, '// OFFSET: LEGO1 0xdeadbeef'), - (True, ' /* Block comment beginning'), - (True, 'Block comment ending */ '), - + (True, ""), + (True, "\t"), + (True, " "), + (False, "\tint abc=123;"), + (True, "// OFFSET: LEGO1 0xdeadbeef"), + (True, " /* Block comment beginning"), + (True, "Block comment ending */ "), # TODO: does clang-format have anything to say about these cases? - (False, 'x++; // Comment folows'), - (False, 'x++; /* Block comment begins'), + (False, "x++; // Comment folows"), + (False, "x++; /* Block comment begins"), ] -@pytest.mark.parametrize('expected, line', blank_or_comment_param) +@pytest.mark.parametrize("expected, line", blank_or_comment_param) def test_is_blank_or_comment(line: str, expected: bool): assert is_blank_or_comment(line) is expected @@ -32,82 +31,73 @@ def test_is_blank_or_comment(line: str, expected: bool): offset_comment_samples = [ # (can_parse: bool, exact_match: bool, line: str) # Should match both expected modules with optional STUB marker - (True, True, '// OFFSET: LEGO1 0xdeadbeef'), - (True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'), - (True, True, '// OFFSET: ISLE 0x12345678'), - (True, True, '// OFFSET: ISLE 0x12345678 STUB'), - + (True, True, "// OFFSET: LEGO1 0xdeadbeef"), + (True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"), + (True, True, "// OFFSET: ISLE 0x12345678"), + (True, True, "// OFFSET: ISLE 0x12345678 STUB"), # No trailing spaces allowed - (True, False, '// OFFSET: LEGO1 0xdeadbeef '), - (True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '), - + (True, False, "// OFFSET: LEGO1 0xdeadbeef "), + (True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "), # Must have exactly one space between elements - (True, False, '//OFFSET: ISLE 0xdeadbeef'), - (True, False, '// OFFSET:ISLE 0xdeadbeef'), - (True, False, '// OFFSET: ISLE 0xdeadbeef'), - (True, False, '// OFFSET: ISLE 0xdeadbeef'), - (True, False, '// OFFSET: ISLE 0xdeadbeef'), - (True, False, '// OFFSET: ISLE 0xdeadbeef STUB'), - + (True, False, "//OFFSET: ISLE 0xdeadbeef"), + (True, False, "// OFFSET:ISLE 0xdeadbeef"), + (True, False, "// OFFSET: ISLE 0xdeadbeef"), + (True, False, "// OFFSET: ISLE 0xdeadbeef"), + (True, False, "// OFFSET: ISLE 0xdeadbeef"), + (True, False, "// OFFSET: ISLE 0xdeadbeef STUB"), # Must have 0x prefix for hex number - (True, False, '// OFFSET: ISLE deadbeef'), - + (True, False, "// OFFSET: ISLE deadbeef"), # Offset, module name, and STUB must be uppercase - (True, False, '// offset: ISLE 0xdeadbeef'), - (True, False, '// offset: isle 0xdeadbeef'), - (True, False, '// OFFSET: LEGO1 0xdeadbeef stub'), - + (True, False, "// offset: ISLE 0xdeadbeef"), + (True, False, "// offset: isle 0xdeadbeef"), + (True, False, "// OFFSET: LEGO1 0xdeadbeef stub"), # Hex string must be lowercase - (True, False, '// OFFSET: ISLE 0xDEADBEEF'), - + (True, False, "// OFFSET: ISLE 0xDEADBEEF"), # TODO: How flexible should we be with matching the module name? - (True, True, '// OFFSET: OMNI 0x12345678'), - (True, True, '// OFFSET: LEG01 0x12345678'), - (True, False, '// OFFSET: hello 0x12345678'), - + (True, True, "// OFFSET: OMNI 0x12345678"), + (True, True, "// OFFSET: LEG01 0x12345678"), + (True, False, "// OFFSET: hello 0x12345678"), # Not close enough to match - (False, False, '// OFFSET: ISLE0x12345678'), - (False, False, '// OFFSET: 0x12345678'), - (False, False, '// LEGO1: 0x12345678'), - + (False, False, "// OFFSET: ISLE0x12345678"), + (False, False, "// OFFSET: 0x12345678"), + (False, False, "// LEGO1: 0x12345678"), # Hex string shorter than 8 characters - (True, True, '// OFFSET: LEGO1 0x1234'), - + (True, True, "// OFFSET: LEGO1 0x1234"), # TODO: These match but shouldn't. # (False, False, '// OFFSET: LEGO1 0'), # (False, False, '// OFFSET: LEGO1 0x'), ] -@pytest.mark.parametrize('match, exact, line', offset_comment_samples) -def test_offset_match(line: str, match: bool, exact): +@pytest.mark.parametrize("match, _, line", offset_comment_samples) +def test_offset_match(line: str, match: bool, _): did_match = match_offset_comment(line) is not None assert did_match is match -@pytest.mark.parametrize('match, exact, line', offset_comment_samples) -def test_exact_offset_comment(line: str, exact: bool, match): +@pytest.mark.parametrize("_, exact, line", offset_comment_samples) +def test_exact_offset_comment(line: str, exact: bool, _): assert is_exact_offset_comment(line) is exact # Helper for the next test: cut down version of OffsetMatch -MiniOfs = namedtuple('MiniOfs', ['module', 'value']) +MiniOfs = namedtuple("MiniOfs", ["module", "value"]) distinct_by_module_samples = [ # empty set ([], []), # same module name - ([MiniOfs('TEST', 123), MiniOfs('TEST', 555)], - [MiniOfs('TEST', 123)]), + ([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]), # same module name, case-insensitive - ([MiniOfs('test', 123), MiniOfs('TEST', 555)], - [MiniOfs('test', 123)]), + ([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]), # duplicates, non-consecutive - ([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)], - [MiniOfs('test', 123), MiniOfs('abc', 111)]), + ( + [MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)], + [MiniOfs("test", 123), MiniOfs("abc", 111)], + ), ] -@pytest.mark.parametrize('sample, expected', distinct_by_module_samples) +@pytest.mark.parametrize("sample, expected", distinct_by_module_samples) def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]): assert distinct_by_module(sample) == expected diff --git a/tools/reccmp/reccmp.py b/tools/reccmp/reccmp.py index 2cd2e43a..02c16029 100755 --- a/tools/reccmp/reccmp.py +++ b/tools/reccmp/reccmp.py @@ -2,585 +2,482 @@ import argparse import base64 -from capstone import * import difflib -import struct -import subprocess +import json import logging import os -import sys -import colorama -import json import re -from isledecomp.dir import walk_source_dir -from isledecomp.parser import find_code_blocks + +from isledecomp import ( + Bin, + find_code_blocks, + get_file_in_script_dir, + OffsetPlaceholderGenerator, + print_diff, + SymInfo, + walk_source_dir, + WinePathConverter, +) + +from capstone import Cs, CS_ARCH_X86, CS_MODE_32 +import colorama from pystache import Renderer -parser = argparse.ArgumentParser(allow_abbrev=False, - description='Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.') -parser.add_argument('original', metavar='original-binary', help='The original binary') -parser.add_argument('recompiled', metavar='recompiled-binary', help='The recompiled binary') -parser.add_argument('pdb', metavar='recompiled-pdb', help='The PDB of the recompiled binary') -parser.add_argument('decomp_dir', metavar='decomp-dir', help='The decompiled source tree') -parser.add_argument('--total', '-T', metavar='', help='Total number of expected functions (improves total accuracy statistic)') -parser.add_argument('--verbose', '-v', metavar='', help='Print assembly diff for specific function (original file\'s offset)') -parser.add_argument('--html', '-H', metavar='', help='Generate searchable HTML summary of status and diffs') -parser.add_argument('--no-color', '-n', action='store_true', help='Do not color the output') -parser.add_argument('--svg', '-S', metavar='', help='Generate SVG graphic of progress') -parser.add_argument('--svg-icon', metavar='icon', help='Icon to use in SVG (PNG)') -parser.add_argument('--print-rec-addr', action='store_true', help='Print addresses of recompiled functions too') +REGISTER_LIST = set( + [ + "ax", + "bp", + "bx", + "cx", + "di", + "dx", + "eax", + "ebp", + "ebx", + "ecx", + "edi", + "edx", + "esi", + "esp", + "si", + "sp", + ] +) +WORDS = re.compile(r"\w+") -parser.set_defaults(loglevel=logging.INFO) -parser.add_argument('--debug', action='store_const', const=logging.DEBUG, dest='loglevel', help='Print script debug information') -args = parser.parse_args() - -logging.basicConfig(level=args.loglevel, format='[%(levelname)s] %(message)s') -logger = logging.getLogger(__name__) - -colorama.init() - -verbose = None -found_verbose_target = False -if args.verbose: - try: - verbose = int(args.verbose, 16) - except ValueError: - parser.error('invalid verbose argument') -html_path = args.html - -plain = args.no_color - -original = args.original -if not os.path.isfile(original): - parser.error(f'Original binary {original} does not exist') - -recomp = args.recompiled -if not os.path.isfile(recomp): - parser.error(f'Recompiled binary {recomp} does not exist') - -syms = args.pdb -if not os.path.isfile(syms): - parser.error(f'Symbols PDB {syms} does not exist') - -source = args.decomp_dir -if not os.path.isdir(source): - parser.error(f'Source directory {source} does not exist') - -svg = args.svg - -# Declare a class that can automatically convert virtual executable addresses -# to file addresses -class Bin: - def __init__(self, filename): - logger.debug(f'Parsing headers of "{filename}"... ') - self.file = open(filename, 'rb') - - #HACK: Strictly, we should be parsing the header, but we know where - # everything is in these two files so we just jump straight there - - # Read ImageBase - self.file.seek(0xB4) - self.imagebase, = struct.unpack(' str: - if unix_fn.startswith('./'): - return self.win_cwd + '\\' + unix_fn[2:].replace('/', '\\') - if unix_fn.startswith(self.unix_cwd): - return self.win_cwd + '\\' + unix_fn.removeprefix(self.unix_cwd).replace('/', '\\').lstrip('\\') - return self._call_winepath_unix2win(unix_fn) - - def get_unix_path(self, win_fn: str) -> str: - if win_fn.startswith('.\\') or win_fn.startswith('./'): - return self.unix_cwd + '/' + win_fn[2:].replace('\\', '/') - if win_fn.startswith(self.win_cwd): - return self.unix_cwd + '/' + win_fn.removeprefix(self.win_cwd).replace('\\', '/') - return self._call_winepath_win2unix(win_fn) - - @staticmethod - def _call_winepath_unix2win(fn: str) -> str: - return subprocess.check_output(['winepath', '-w', fn], text=True).strip() - - @staticmethod - def _call_winepath_win2unix(fn: str) -> str: - return subprocess.check_output(['winepath', fn], text=True).strip() - -def get_file_in_script_dir(fn): - return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) - -# Declare a class that parses the output of cvdump for fast access later -class SymInfo: - funcs = {} - lines = {} - names = {} - - def __init__(self, pdb, file, wine_path_converter): - call = [get_file_in_script_dir('cvdump.exe'), '-l', '-s'] - - if wine_path_converter: - # Run cvdump through wine and convert path to Windows-friendly wine path - call.insert(0, 'wine') - call.append(wine_path_converter.get_wine_path(pdb)) - else: - call.append(pdb) - - logger.info(f'Parsing {pdb} ...') - logger.debug(f'Command = {call}') - line_dump = subprocess.check_output(call).decode('utf-8').split('\r\n') - - current_section = None - - logger.debug('Parsing output of cvdump.exe ...') - - for i, line in enumerate(line_dump): - if line.startswith('***'): - current_section = line[4:] - - if current_section == 'SYMBOLS' and 'S_GPROC32' in line: - addr = int(line[26:34], 16) - - info = RecompiledInfo() - info.addr = addr + recompfile.imagebase + recompfile.textvirt - - use_dbg_offs = False - if use_dbg_offs: - debug_offs = line_dump[i + 2] - debug_start = int(debug_offs[22:30], 16) - debug_end = int(debug_offs[43:], 16) - - info.start = debug_start - info.size = debug_end - debug_start - else: - info.start = 0 - info.size = int(line[41:49], 16) - - info.name = line[77:] - - self.names[info.name] = info - self.funcs[addr] = info - elif current_section == 'LINES' and line.startswith(' ') and not line.startswith(' '): - sourcepath = line.split()[0] - - if wine_path_converter: - # Convert filename to Unix path for file compare - sourcepath = wine_path_converter.get_unix_path(sourcepath) - - if sourcepath not in self.lines: - self.lines[sourcepath] = {} - - j = i + 2 - while True: - ll = line_dump[j].split() - if len(ll) == 0: - break - - k = 0 - while k < len(ll): - linenum = int(ll[k + 0]) - address = int(ll[k + 1], 16) - if linenum not in self.lines[sourcepath]: - self.lines[sourcepath][linenum] = address - k += 2 - - j += 1 - - logger.debug('... Parsing output of cvdump.exe finished') - - def get_recompiled_address(self, filename, line): - addr = None - found = False - - logger.debug(f'Looking for {filename}:{line}') - filename_basename = os.path.basename(filename).lower() - - for fn in self.lines: - # Sometimes a PDB is compiled with a relative path while we always have - # an absolute path. Therefore we must - try: - if (os.path.basename(fn).lower() == filename_basename and - os.path.samefile(fn, filename)): - filename = fn - break - except FileNotFoundError as e: - continue - - if filename in self.lines and line in self.lines[fn]: - addr = self.lines[fn][line] - - if addr in self.funcs: - return self.funcs[addr] - else: - logger.error(f'Failed to find function symbol with address: 0x{addr:x}') - else: - logger.error(f'Failed to find function symbol with filename and line: {filename}:{line}') - - def get_recompiled_address_from_name(self, name): - logger.debug('Looking for %s', name) - - if name in self.names: - return self.names[name] - else: - logger.error(f'Failed to find function symbol with name: {name}') - -wine_path_converter = None -if os.name != 'nt': - wine_path_converter = WinePathConverter(source) -origfile = Bin(original) -recompfile = Bin(recomp) -syminfo = SymInfo(syms, recompfile, wine_path_converter) - -print() - -md = Cs(CS_ARCH_X86, CS_MODE_32) - -class OffsetPlaceholderGenerator: - def __init__(self): - self.counter = 0 - self.replacements = {} - - def get(self, addr): - if addr in self.replacements: - return self.replacements[addr] - else: - self.counter += 1 - replacement = f'' - self.replacements[addr] = replacement - return replacement - -def sanitize(file, placeholderGenerator, mnemonic, op_str): - op_str_is_number = False - try: - int(op_str, 16) - op_str_is_number = True - except ValueError: - pass - - if (mnemonic == 'call' or mnemonic == 'jmp') and op_str_is_number: - # Filter out "calls" because the offsets we're not currently trying to - # match offsets. As long as there's a call in the right place, it's - # probably accurate. - op_str = placeholderGenerator.get(int(op_str, 16)) - else: - def filter_out_ptr(ptype, op_str): - try: - ptrstr = ptype + ' ptr [' - start = op_str.index(ptrstr) + len(ptrstr) - end = op_str.index(']', start) - - # This will throw ValueError if not hex - inttest = int(op_str[start:end], 16) - - return op_str[0:start] + placeholderGenerator.get(inttest) + op_str[end:] - except ValueError: - return op_str - - # Filter out dword ptrs where the pointer is to an offset - op_str = filter_out_ptr('dword', op_str) - op_str = filter_out_ptr('word', op_str) - op_str = filter_out_ptr('byte', op_str) - - # Use heuristics to filter out any args that look like offsets - words = op_str.split(' ') - for i, word in enumerate(words): - try: - inttest = int(word, 16) - if inttest >= file.imagebase + file.textvirt: - words[i] = placeholderGenerator.get(inttest) - except ValueError: +def sanitize(file, placeholder_generator, mnemonic, op_str): + op_str_is_number = False + try: + int(op_str, 16) + op_str_is_number = True + except ValueError: pass - op_str = ' '.join(words) - return mnemonic, op_str - -def parse_asm(file, addr, size): - asm = [] - data = file.read(addr, size) - placeholderGenerator = OffsetPlaceholderGenerator() - for i in md.disasm(data, 0): - # Use heuristics to disregard some differences that aren't representative - # of the accuracy of a function (e.g. global offsets) - mnemonic, op_str = sanitize(file, placeholderGenerator, i.mnemonic, i.op_str) - if op_str is None: - asm.append(mnemonic) + if (mnemonic in ["call", "jmp"]) and op_str_is_number: + # Filter out "calls" because the offsets we're not currently trying to + # match offsets. As long as there's a call in the right place, it's + # probably accurate. + op_str = placeholder_generator.get(int(op_str, 16)) else: - asm.append(f'{mnemonic} {op_str}') - return asm -REGISTER_LIST = set([ - 'ax', - 'bp', - 'bx', - 'cx', - 'di', - 'dx', - 'eax', - 'ebp', - 'ebx', - 'ecx', - 'edi', - 'edx', - 'esi', - 'esp', - 'si', - 'sp', -]) -WORDS = re.compile(r'\w+') + def filter_out_ptr(ptype, op_str): + try: + ptrstr = ptype + " ptr [" + start = op_str.index(ptrstr) + len(ptrstr) + end = op_str.index("]", start) + + # This will throw ValueError if not hex + inttest = int(op_str[start:end], 16) + + return ( + op_str[0:start] + placeholder_generator.get(inttest) + op_str[end:] + ) + except ValueError: + return op_str + + # Filter out dword ptrs where the pointer is to an offset + op_str = filter_out_ptr("dword", op_str) + op_str = filter_out_ptr("word", op_str) + op_str = filter_out_ptr("byte", op_str) + + # Use heuristics to filter out any args that look like offsets + words = op_str.split(" ") + for i, word in enumerate(words): + try: + inttest = int(word, 16) + if inttest >= file.imagebase + file.textvirt: + words[i] = placeholder_generator.get(inttest) + except ValueError: + pass + op_str = " ".join(words) + + return mnemonic, op_str + + +def parse_asm(disassembler, file, asm_addr, size): + asm = [] + data = file.read(asm_addr, size) + placeholder_generator = OffsetPlaceholderGenerator() + for i in disassembler.disasm(data, 0): + # Use heuristics to disregard some differences that aren't representative + # of the accuracy of a function (e.g. global offsets) + mnemonic, op_str = sanitize(file, placeholder_generator, i.mnemonic, i.op_str) + if op_str is None: + asm.append(mnemonic) + else: + asm.append(f"{mnemonic} {op_str}") + return asm + def get_registers(line: str): - to_replace = [] - # use words regex to find all matching positions: - for match in WORDS.finditer(line): - reg = match.group(0) - if reg in REGISTER_LIST: - to_replace.append((reg, match.start())) - return to_replace + to_replace = [] + # use words regex to find all matching positions: + for match in WORDS.finditer(line): + reg = match.group(0) + if reg in REGISTER_LIST: + to_replace.append((reg, match.start())) + return to_replace + + +def replace_register( + lines: list[str], start_line: int, reg: str, replacement: str +) -> list[str]: + return [ + line.replace(reg, replacement) if i >= start_line else line + for i, line in enumerate(lines) + ] -def replace_register(lines: list[str], start_line: int, reg: str, replacement: str) -> list[str]: - return [line.replace(reg, replacement) if i >= start_line else line for i, line in enumerate(lines)] # Is it possible to make new_asm the same as original_asm by swapping registers? def can_resolve_register_differences(original_asm, new_asm): - # Split the ASM on spaces to get more granularity, and so - # that we don't modify the original arrays passed in. - original_asm = [part for line in original_asm for part in line.split()] - new_asm = [part for line in new_asm for part in line.split()] + # Split the ASM on spaces to get more granularity, and so + # that we don't modify the original arrays passed in. + original_asm = [part for line in original_asm for part in line.split()] + new_asm = [part for line in new_asm for part in line.split()] - # Swapping ain't gonna help if the lengths are different - if len(original_asm) != len(new_asm): - return False + # Swapping ain't gonna help if the lengths are different + if len(original_asm) != len(new_asm): + return False - # Look for the mismatching lines - for i in range(len(original_asm)): - new_line = new_asm[i] - original_line = original_asm[i] - if new_line != original_line: - # Find all the registers to replace - to_replace = get_registers(original_line) + # Look for the mismatching lines + for i, original_line in enumerate(original_asm): + new_line = new_asm[i] + if new_line != original_line: + # Find all the registers to replace + to_replace = get_registers(original_line) - for j in range(len(to_replace)): - (reg, reg_index) = to_replace[j] - replacing_reg = new_line[reg_index:reg_index + len(reg)] - if replacing_reg in REGISTER_LIST: - if replacing_reg != reg: - # Do a three-way swap replacing in all the subsequent lines - temp_reg = '&' * len(reg) - new_asm = replace_register(new_asm, i, replacing_reg, temp_reg) - new_asm = replace_register(new_asm, i, reg, replacing_reg) - new_asm = replace_register(new_asm, i, temp_reg, reg) - else: - # No replacement to do, different code, bail out - return False - # Check if the lines are now the same - for i in range(len(original_asm)): - if new_asm[i] != original_asm[i]: - return False - return True - -function_count = 0 -total_accuracy = 0 -total_effective_accuracy = 0 -htmlinsert = [] - -# Generate basename of original file, used in locating OFFSET lines -basename = os.path.basename(os.path.splitext(original)[0]) - -for srcfilename in walk_source_dir(source): - with open(srcfilename, 'r') as srcfile: - blocks = find_code_blocks(srcfile) - - for block in blocks: - if block.is_stub: - continue - - if block.module != basename: - continue - - addr = block.offset - # Verbose flag handling - if verbose: - if addr == verbose: - found_verbose_target = True - else: - continue - - if block.is_template: - recinfo = syminfo.get_recompiled_address_from_name(block.signature) - if not recinfo: - continue - else: - recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line) - if not recinfo: - continue - - # The effective_ratio is the ratio when ignoring differing register - # allocation vs the ratio is the true ratio. - ratio = 0.0 - effective_ratio = 0.0 - if recinfo.size: - origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size) - recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size) - - diff = difflib.SequenceMatcher(None, origasm, recompasm) - ratio = diff.ratio() - effective_ratio = ratio - - if ratio != 1.0: - # Check whether we can resolve register swaps which are actually - # perfect matches modulo compiler entropy. - if can_resolve_register_differences(origasm, recompasm): - effective_ratio = 1.0 - else: - ratio = 0 - - percenttext = f'{(effective_ratio * 100):.2f}%' - if not plain: - if effective_ratio == 1.0: - percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL - elif effective_ratio > 0.8: - percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL - else: - percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL - - if effective_ratio == 1.0 and ratio != 1.0: - if plain: - percenttext += '*' - else: - percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL - - if args.print_rec_addr: - addrs = f'0x{addr:x} / 0x{recinfo.addr:x}' - else: - addrs = hex(addr) - - if not verbose: - print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original') - - function_count += 1 - total_accuracy += ratio - total_effective_accuracy += effective_ratio - - if recinfo.size: - udiff = difflib.unified_diff(origasm, recompasm, n=10) - - # If verbose, print the diff for that function to the output - if verbose: - if effective_ratio == 1.0: - ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL) - if ratio == 1.0: - print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n') - else: - print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n') - else: - for line in udiff: - if line.startswith('++') or line.startswith('@@') or line.startswith('--'): - # Skip unneeded parts of the diff for the brief view - pass - elif line.startswith('+'): - if plain: - print(line) - else: - print(colorama.Fore.GREEN + line) - elif line.startswith('-'): - if plain: - print(line) - else: - print(colorama.Fore.RED + line) - else: - print(line) - if not plain: - print(colorama.Style.RESET_ALL, end='') - - print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above') - - # If html, record the diffs to an HTML file - if html_path: - htmlinsert.append({"address": f"0x{addr:x}", - "name": recinfo.name, - "matching": effective_ratio, - "diff": '\n'.join(udiff)}) + for replace in to_replace: + (reg, reg_index) = replace + replacing_reg = new_line[reg_index : reg_index + len(reg)] + if replacing_reg in REGISTER_LIST: + if replacing_reg != reg: + # Do a three-way swap replacing in all the subsequent lines + temp_reg = "&" * len(reg) + new_asm = replace_register(new_asm, i, replacing_reg, temp_reg) + new_asm = replace_register(new_asm, i, reg, replacing_reg) + new_asm = replace_register(new_asm, i, temp_reg, reg) + else: + # No replacement to do, different code, bail out + return False + # Check if the lines are now the same + for i, original_line in enumerate(original_asm): + if new_asm[i] != original_line: + return False + return True def gen_html(html_file, data): - output_data = Renderer().render_path(get_file_in_script_dir('template.html'), - { - "data": data, - } - ) + output_data = Renderer().render_path( + get_file_in_script_dir("template.html"), {"data": data} + ) - with open(html_file, 'w') as htmlfile: - htmlfile.write(output_data) + with open(html_file, "w", encoding="utf-8") as htmlfile: + htmlfile.write(output_data) def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_accuracy): - icon_data = None - if icon: - with open(icon, 'rb') as iconfile: - icon_data = base64.b64encode(iconfile.read()).decode('utf-8') + icon_data = None + if icon: + with open(icon, "rb") as iconfile: + icon_data = base64.b64encode(iconfile.read()).decode("utf-8") - total_statistic = raw_accuracy / total_funcs - full_percentbar_width = 127.18422 - output_data = Renderer().render_path(get_file_in_script_dir('template.svg'), - { - "name": name_svg, - "icon": icon_data, - "implemented": f'{(svg_implemented_funcs / total_funcs * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})', - "accuracy": f'{(raw_accuracy / svg_implemented_funcs * 100):.2f}%', - "progbar": total_statistic * full_percentbar_width, - "percent": f'{(total_statistic * 100):.2f}%', - } - ) - with open(svg_file, 'w') as svgfile: - svgfile.write(output_data) + total_statistic = raw_accuracy / total_funcs + full_percentbar_width = 127.18422 + output_data = Renderer().render_path( + get_file_in_script_dir("template.svg"), + { + "name": name_svg, + "icon": icon_data, + "implemented": f"{(svg_implemented_funcs / total_funcs * 100):.2f}% ({svg_implemented_funcs}/{total_funcs})", + "accuracy": f"{(raw_accuracy / svg_implemented_funcs * 100):.2f}%", + "progbar": total_statistic * full_percentbar_width, + "percent": f"{(total_statistic * 100):.2f}%", + }, + ) + with open(svg_file, "w", encoding="utf-8") as svgfile: + svgfile.write(output_data) -if html_path: - gen_html(html_path, json.dumps(htmlinsert)) +# Do the actual work +if __name__ == "__main__": + parser = argparse.ArgumentParser( + allow_abbrev=False, + description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.", + ) + parser.add_argument( + "original", metavar="original-binary", help="The original binary" + ) + parser.add_argument( + "recompiled", metavar="recompiled-binary", help="The recompiled binary" + ) + parser.add_argument( + "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary" + ) + parser.add_argument( + "decomp_dir", metavar="decomp-dir", help="The decompiled source tree" + ) + parser.add_argument( + "--total", + "-T", + metavar="", + help="Total number of expected functions (improves total accuracy statistic)", + ) + parser.add_argument( + "--verbose", + "-v", + metavar="", + help="Print assembly diff for specific function (original file's offset)", + ) + parser.add_argument( + "--html", + "-H", + metavar="", + help="Generate searchable HTML summary of status and diffs", + ) + parser.add_argument( + "--no-color", "-n", action="store_true", help="Do not color the output" + ) + parser.add_argument( + "--svg", "-S", metavar="", help="Generate SVG graphic of progress" + ) + parser.add_argument("--svg-icon", metavar="icon", help="Icon to use in SVG (PNG)") + parser.add_argument( + "--print-rec-addr", + action="store_true", + help="Print addresses of recompiled functions too", + ) -if verbose: - if not found_verbose_target: - print(f'Failed to find the function with address 0x{verbose:x}') -else: - implemented_funcs = function_count + parser.set_defaults(loglevel=logging.INFO) + parser.add_argument( + "--debug", + action="store_const", + const=logging.DEBUG, + dest="loglevel", + help="Print script debug information", + ) - if args.total: - function_count = int(args.total) + args = parser.parse_args() - if function_count > 0: - effective_accuracy = total_effective_accuracy / function_count * 100 - actual_accuracy = total_accuracy / function_count * 100 - print(f'\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)') + logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s") + logger = logging.getLogger(__name__) - if svg: - gen_svg(svg, os.path.basename(original), args.svg_icon, implemented_funcs, function_count, total_effective_accuracy) + colorama.init() + + verbose = None + found_verbose_target = False + if args.verbose: + try: + verbose = int(args.verbose, 16) + except ValueError: + parser.error("invalid verbose argument") + html_path = args.html + + plain = args.no_color + + original = args.original + if not os.path.isfile(original): + parser.error(f"Original binary {original} does not exist") + + recomp = args.recompiled + if not os.path.isfile(recomp): + parser.error(f"Recompiled binary {recomp} does not exist") + + syms = args.pdb + if not os.path.isfile(syms): + parser.error(f"Symbols PDB {syms} does not exist") + + source = args.decomp_dir + if not os.path.isdir(source): + parser.error(f"Source directory {source} does not exist") + + svg = args.svg + + wine_path_converter = None + if os.name != "nt": + wine_path_converter = WinePathConverter(source) + with Bin(original, logger) as origfile, Bin(recomp, logger) as recompfile: + syminfo = SymInfo( + syms, recompfile, logger, sym_wine_path_converter=wine_path_converter + ) + + print() + + capstone_disassembler = Cs(CS_ARCH_X86, CS_MODE_32) + + function_count = 0 + total_accuracy = 0 + total_effective_accuracy = 0 + htmlinsert = [] + + # Generate basename of original file, used in locating OFFSET lines + basename = os.path.basename(os.path.splitext(original)[0]) + + for srcfilename in walk_source_dir(source): + with open(srcfilename, "r", encoding="utf-8") as srcfile: + blocks = find_code_blocks(srcfile) + + for block in blocks: + if block.is_stub: + continue + + if block.module != basename: + continue + + addr = block.offset + # Verbose flag handling + if verbose: + if addr == verbose: + found_verbose_target = True + else: + continue + + if block.is_template: + recinfo = syminfo.get_recompiled_address_from_name(block.signature) + if not recinfo: + continue + else: + recinfo = syminfo.get_recompiled_address( + srcfilename, block.start_line + ) + if not recinfo: + continue + + # The effective_ratio is the ratio when ignoring differing register + # allocation vs the ratio is the true ratio. + ratio = 0.0 + effective_ratio = 0.0 + if recinfo.size: + origasm = parse_asm( + capstone_disassembler, + origfile, + addr + recinfo.start, + recinfo.size, + ) + recompasm = parse_asm( + capstone_disassembler, + recompfile, + recinfo.addr + recinfo.start, + recinfo.size, + ) + + diff = difflib.SequenceMatcher(None, origasm, recompasm) + ratio = diff.ratio() + effective_ratio = ratio + + if ratio != 1.0: + # Check whether we can resolve register swaps which are actually + # perfect matches modulo compiler entropy. + if can_resolve_register_differences(origasm, recompasm): + effective_ratio = 1.0 + else: + ratio = 0 + + percenttext = f"{(effective_ratio * 100):.2f}%" + if not plain: + if effective_ratio == 1.0: + percenttext = ( + colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL + ) + elif effective_ratio > 0.8: + percenttext = ( + colorama.Fore.YELLOW + + percenttext + + colorama.Style.RESET_ALL + ) + else: + percenttext = ( + colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL + ) + + if effective_ratio == 1.0 and ratio != 1.0: + if plain: + percenttext += "*" + else: + percenttext += ( + colorama.Fore.RED + "*" + colorama.Style.RESET_ALL + ) + + if args.print_rec_addr: + addrs = f"0x{addr:x} / 0x{recinfo.addr:x}" + else: + addrs = hex(addr) + + if not verbose: + print( + f" {recinfo.name} ({addrs}) is {percenttext} similar to the original" + ) + + function_count += 1 + total_accuracy += ratio + total_effective_accuracy += effective_ratio + + if recinfo.size: + udiff = difflib.unified_diff(origasm, recompasm, n=10) + + # If verbose, print the diff for that function to the output + if verbose: + if effective_ratio == 1.0: + ok_text = ( + "OK!" + if plain + else ( + colorama.Fore.GREEN + + "✨ OK! ✨" + + colorama.Style.RESET_ALL + ) + ) + if ratio == 1.0: + print( + f"{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n" + ) + else: + print( + f"{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n" + ) + else: + print_diff(udiff, plain) + + print( + f"\n{recinfo.name} is only {percenttext} similar to the original, diff above" + ) + + # If html, record the diffs to an HTML file + if html_path: + htmlinsert.append( + { + "address": f"0x{addr:x}", + "name": recinfo.name, + "matching": effective_ratio, + "diff": "\n".join(udiff), + } + ) + + if html_path: + gen_html(html_path, json.dumps(htmlinsert)) + + if verbose: + if not found_verbose_target: + print(f"Failed to find the function with address 0x{verbose:x}") + else: + implemented_funcs = function_count + + if args.total: + function_count = int(args.total) + + if function_count > 0: + effective_accuracy = total_effective_accuracy / function_count * 100 + actual_accuracy = total_accuracy / function_count * 100 + print( + f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)" + ) + + if svg: + gen_svg( + svg, + os.path.basename(original), + args.svg_icon, + implemented_funcs, + function_count, + total_effective_accuracy, + ) diff --git a/tools/reccmp/requirements.txt b/tools/requirements.txt similarity index 52% rename from tools/reccmp/requirements.txt rename to tools/requirements.txt index c7de0d0c..06c0bed3 100644 --- a/tools/reccmp/requirements.txt +++ b/tools/requirements.txt @@ -1,4 +1,5 @@ +tools/isledecomp capstone colorama isledecomp -pystache \ No newline at end of file +pystache diff --git a/tools/verexp/verexp.py b/tools/verexp/verexp.py index 3a469fb6..37e3e641 100755 --- a/tools/verexp/verexp.py +++ b/tools/verexp/verexp.py @@ -1,76 +1,68 @@ #!/usr/bin/env python3 import argparse -import colorama import difflib import subprocess import os import sys -parser = argparse.ArgumentParser(allow_abbrev=False, - description='Verify Exports: Compare the exports of two DLLs.') -parser.add_argument('original', metavar='original-binary', help='The original binary') -parser.add_argument('recompiled', metavar='recompiled-binary', help='The recompiled binary') -parser.add_argument('--no-color', '-n', action='store_true', help='Do not color the output') +from isledecomp.utils import print_diff + +parser = argparse.ArgumentParser( + allow_abbrev=False, description="Verify Exports: Compare the exports of two DLLs." +) +parser.add_argument("original", metavar="original-binary", help="The original binary") +parser.add_argument( + "recompiled", metavar="recompiled-binary", help="The recompiled binary" +) +parser.add_argument( + "--no-color", "-n", action="store_true", help="Do not color the output" +) args = parser.parse_args() if not os.path.isfile(args.original): - parser.error(f'Original binary file {args.original} does not exist') + parser.error(f"Original binary file {args.original} does not exist") if not os.path.isfile(args.recompiled): - parser.error(f'Recompiled binary {args.recompiled} does not exist') + parser.error(f"Recompiled binary {args.recompiled} does not exist") + def get_file_in_script_dir(fn): - return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) + return os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), fn) + def get_exports(file): - call = [get_file_in_script_dir('DUMPBIN.EXE'), '/EXPORTS'] + call = [get_file_in_script_dir("DUMPBIN.EXE"), "/EXPORTS"] - if os.name != 'nt': - call.insert(0, 'wine') - file = subprocess.check_output(['winepath', '-w', file]).decode('utf-8').strip() + if os.name != "nt": + call.insert(0, "wine") + file = subprocess.check_output(["winepath", "-w", file]).decode("utf-8").strip() - call.append(file) + call.append(file) - raw = subprocess.check_output(call).decode('utf-8').split('\r\n') - exports = [] + raw = subprocess.check_output(call).decode("utf-8").split("\r\n") + exports = [] - start = False + start = False - for line in raw: - if not start: - if line == ' ordinal hint name': - start = True - else: - if line: - exports.append(line[27:line.rindex(' (')]) - elif exports: - break + for line in raw: + if not start: + if line == " ordinal hint name": + start = True + else: + if line: + exports.append(line[27 : line.rindex(" (")]) + elif exports: + break + + return exports - return exports og_exp = get_exports(args.original) re_exp = get_exports(args.recompiled) udiff = difflib.unified_diff(og_exp, re_exp) -has_diff = False - -for line in udiff: - has_diff = True - color = '' - if line.startswith('++') or line.startswith('@@') or line.startswith('--'): - # Skip unneeded parts of the diff for the brief view - continue - # Work out color if we are printing color - if not args.no_color: - if line.startswith('+'): - color = colorama.Fore.GREEN - elif line.startswith('-'): - color = colorama.Fore.RED - print(color + line) - # Reset color if we're printing in color - if not args.no_color: - print(colorama.Style.RESET_ALL, end='') +has_diff = print_diff(udiff, args.no_color) sys.exit(1 if has_diff else 0)