From d869d565c282b89bffefd8b298048dfbbbc7808b Mon Sep 17 00:00:00 2001
From: jonschz <17198703+jonschz@users.noreply.github.com>
Date: Sun, 16 Jun 2024 14:50:32 +0200
Subject: [PATCH] Fix Ghidra import script re-importing unchanged functions
 (#1035)

* feat: Reuse enums instead of recreating them every time

* feat: Support verifying return values larger than 4 bytes

* feat: Ignore `__$ReturnUdt` in template functions

* run formatter

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
---
 .../lego_util/function_importer.py            | 45 +++++++++--
 .../ghidra_scripts/lego_util/type_importer.py | 81 +++++++++++++++----
 2 files changed, 105 insertions(+), 21 deletions(-)

diff --git a/tools/ghidra_scripts/lego_util/function_importer.py b/tools/ghidra_scripts/lego_util/function_importer.py
index bf99f7f8..80176fc5 100644
--- a/tools/ghidra_scripts/lego_util/function_importer.py
+++ b/tools/ghidra_scripts/lego_util/function_importer.py
@@ -17,6 +17,7 @@
     CppStackSymbol,
 )
 from lego_util.ghidra_helper import (
+    add_pointer_type,
     get_ghidra_namespace,
     sanitize_name,
 )
@@ -82,7 +83,26 @@ def matches_ghidra_function(self, ghidra_function: Function) -> bool:
         """Checks whether this function declaration already matches the description in Ghidra"""
         name_match = self.name == ghidra_function.getName(False)
         namespace_match = self.namespace == ghidra_function.getParentNamespace()
-        return_type_match = self.return_type == ghidra_function.getReturnType()
+        ghidra_return_type = ghidra_function.getReturnType()
+        return_type_match = self.return_type == ghidra_return_type
+
+        # Handle edge case: Return type X that is larger than the return register.
+        # In that case, the function returns `X*` and has another argument `X* __return_storage_ptr`.
+        if (
+            (not return_type_match)
+            and (self.return_type.getLength() > 4)
+            and (add_pointer_type(self.api, self.return_type) == ghidra_return_type)
+            and any(
+                param
+                for param in ghidra_function.getParameters()
+                if param.getName() == "__return_storage_ptr__"
+            )
+        ):
+            logger.debug(
+                "%s has a return type larger than 4 bytes", self.get_full_name()
+            )
+            return_type_match = True
+
         # match arguments: decide if thiscall or not
         thiscall_matches = (
             self.signature.call_type == ghidra_function.getCallingConventionName()
@@ -128,6 +148,14 @@ def _matches_thiscall_parameters(self, ghidra_function: Function) -> bool:
         return self._parameter_lists_match(ghidra_params)
 
     def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
+        # Remove return storage pointer from comparison if present.
+        # This is relevant to returning values larger than 4 bytes, and is not mentioned in the PDB
+        ghidra_params = [
+            param
+            for param in ghidra_params
+            if param.getName() != "__return_storage_ptr__"
+        ]
+
         if len(self.arguments) != len(ghidra_params):
             logger.info("Mismatching argument count")
             return False
@@ -146,11 +174,16 @@ def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
             if stack_match is None:
                 logger.debug("Not found on stack: %s", ghidra_arg)
                 return False
-            # "__formal" is the placeholder for arguments without a name
-            if (
-                stack_match.name != ghidra_arg.getName()
-                and not stack_match.name.startswith("__formal")
-            ):
+
+            if stack_match.name.startswith("__formal"):
+                # "__formal" is the placeholder for arguments without a name
+                continue
+
+            if stack_match.name == "__$ReturnUdt":
+                # These appear in templates and cannot be set automatically, as they are a NOTYPE
+                continue
+
+            if stack_match.name != ghidra_arg.getName():
                 logger.debug(
                     "Argument name mismatch: expected %s, found %s",
                     stack_match.name,
diff --git a/tools/ghidra_scripts/lego_util/type_importer.py b/tools/ghidra_scripts/lego_util/type_importer.py
index 0d3ee5df..c645ebf8 100644
--- a/tools/ghidra_scripts/lego_util/type_importer.py
+++ b/tools/ghidra_scripts/lego_util/type_importer.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any
+from typing import Any, Callable, TypeVar
 
 # Disable spurious warnings in vscode / pylance
 # pyright: reportMissingModuleSource=false
@@ -29,6 +29,7 @@
     CategoryPath,
     DataType,
     DataTypeConflictHandler,
+    Enum,
     EnumDataType,
     StructureDataType,
     StructureInternal,
@@ -47,7 +48,9 @@ def __init__(self, api: FlatProgramAPI, extraction: PdbFunctionExtractor):
         self.extraction = extraction
         # tracks the structs/classes we have already started to import, otherwise we run into infinite recursion
         self.handled_structs: set[str] = set()
-        self.struct_call_stack: list[str] = []
+
+        # tracks the enums we have already handled for the sake of efficiency
+        self.handled_enums: dict[str, Enum] = {}
 
     @property
     def types(self):
@@ -166,9 +169,13 @@ def _import_enum(self, type_pdb: dict[str, Any]) -> DataType:
         field_list = self.extraction.compare.cv.types.keys.get(type_pdb["field_type"])
         assert field_list is not None, f"Failed to find field list for enum {type_pdb}"
 
-        result = EnumDataType(
-            CategoryPath("/imported"), type_pdb["name"], underlying_type.getLength()
+        result = self._get_or_create_enum_data_type(
+            type_pdb["name"], underlying_type.getLength()
         )
+        # clear existing variant if there are any
+        for existing_variant in result.getNames():
+            result.remove(existing_variant)
+
         variants: list[dict[str, Any]] = field_list["variants"]
         for variant in variants:
             result.add(variant["name"], variant["value"])
@@ -259,30 +266,74 @@ def _get_or_create_namespace(self, class_name_with_namespace: str):
             parent_namespace = create_ghidra_namespace(self.api, colon_split)
             self.api.createClass(parent_namespace, class_name)
 
+    def _get_or_create_enum_data_type(
+        self, enum_type_name: str, enum_type_size: int
+    ) -> Enum:
+        if (known_enum := self.handled_enums.get(enum_type_name, None)) is not None:
+            return known_enum
+
+        result = self._get_or_create_data_type(
+            enum_type_name,
+            "enum",
+            Enum,
+            lambda: EnumDataType(
+                CategoryPath("/imported"), enum_type_name, enum_type_size
+            ),
+        )
+        self.handled_enums[enum_type_name] = result
+        return result
+
     def _get_or_create_struct_data_type(
         self, class_name_with_namespace: str, class_size: int
     ) -> StructureInternal:
+        return self._get_or_create_data_type(
+            class_name_with_namespace,
+            "class/struct",
+            StructureInternal,
+            lambda: StructureDataType(
+                CategoryPath("/imported"), class_name_with_namespace, class_size
+            ),
+        )
+
+    T = TypeVar("T", bound=DataType)
+
+    def _get_or_create_data_type(
+        self,
+        type_name: str,
+        readable_name_of_type_category: str,
+        expected_type: type[T],
+        new_instance_callback: Callable[[], T],
+    ) -> T:
+        """
+        Checks if a data type provided under the given name exists in Ghidra.
+        Creates one using `new_instance_callback` if there is not.
+        Also verifies the data type.
+
+        Note that the return value of `addDataType()` is not the same instance as the input
+        even if there is no name collision.
+        """
         try:
-            data_type = get_ghidra_type(self.api, class_name_with_namespace)
+            data_type = get_ghidra_type(self.api, type_name)
             logger.debug(
-                "Found existing data type %s under category path %s",
-                class_name_with_namespace,
+                "Found existing %s type %s under category path %s",
+                readable_name_of_type_category,
+                type_name,
                 data_type.getCategoryPath(),
             )
         except TypeNotFoundInGhidraError:
-            # Create a new struct data type
-            data_type = StructureDataType(
-                CategoryPath("/imported"), class_name_with_namespace, class_size
-            )
             data_type = (
                 self.api.getCurrentProgram()
                 .getDataTypeManager()
-                .addDataType(data_type, DataTypeConflictHandler.KEEP_HANDLER)
+                .addDataType(
+                    new_instance_callback(), DataTypeConflictHandler.KEEP_HANDLER
+                )
+            )
+            logger.info(
+                "Created new %s data type %s", readable_name_of_type_category, type_name
             )
-            logger.info("Created new data type %s", class_name_with_namespace)
         assert isinstance(
-            data_type, StructureInternal
-        ), f"Found type sharing its name with a class/struct, but is not a struct: {class_name_with_namespace}"
+            data_type, expected_type
+        ), f"Found existing type named {type_name} that is not a {readable_name_of_type_category}"
         return data_type
 
     def _delete_and_recreate_struct_data_type(