Data comparison tool (#618)

* Parse cvdump TYPES section. Add datacmp tool. * Corrections * Use static * Revert "Use static" This reverts commit e0a4324e00. * Handle partially initialized variable * Shuffle order of legounksavedatawriter * Revert "Shuffle order of legounksavedatawriter" This reverts commit 506e06f117. --------- Co-authored-by: Christian Semmler <mail@csemmler.com>
2024-11-25 17:18:16 -05:00 · 2024-03-05 03:45:09 -05:00 · 2024-03-05 03:45:09 -05:00 · ec1fcce08c
commit ec1fcce08c
parent 068760056a
15 changed files with 1347 additions and 82 deletions
--- a/CONFIG/AboutDlg.h
+++ b/CONFIG/AboutDlg.h
@@ -24,6 +24,9 @@ class CAboutDialog : public CDialog {
 // SYNTHETIC: CONFIG 0x00403cb0
 // CAboutDialog::`scalar deleting destructor'

+// FUNCTION: CONFIG 0x00403d30
+// CAboutDialog::_GetBaseMessageMap
+
 // FUNCTION: CONFIG 0x00403d40
 // CAboutDialog::GetMessageMap

--- a/CONFIG/MainDlg.h
+++ b/CONFIG/MainDlg.h
@@ -55,6 +55,9 @@ class CMainDialog : public CDialog {
 // SYNTHETIC: CONFIG 0x00403de0
 // CMainDialog::`scalar deleting destructor'

+// FUNCTION: CONFIG 0x00403e60
+// CMainDialog::_GetBaseMessageMap
+
 // FUNCTION: CONFIG 0x00403e70
 // CMainDialog::GetMessageMap

--- a/CONFIG/config.h
+++ b/CONFIG/config.h
@@ -76,6 +76,9 @@ class CConfigApp : public CWinApp {
 // SYNTHETIC: CONFIG 0x00402cd0
 // CConfigApp::`scalar deleting destructor'

+// FUNCTION: CONFIG 0x402c20
+// CConfigApp::_GetBaseMessageMap
+
 // FUNCTION: CONFIG 0x402c30
 // CConfigApp::GetMessageMap

--- a/LEGO1/lego/legoomni/src/gasstation/gasstation.cpp
+++ b/LEGO1/lego/legoomni/src/gasstation/gasstation.cpp
@@ -8,7 +8,7 @@
 #include "mxticklemanager.h"

 // GLOBAL: LEGO1 0x100f0160
-undefined4 g_unk0x100f0160;
+undefined4 g_unk0x100f0160 = 3;

 // FUNCTION: LEGO1 0x100046a0
 GasStation::GasStation()
--- a/LEGO1/omni/src/common/mxutil.cpp
+++ b/LEGO1/omni/src/common/mxutil.cpp
@@ -10,7 +10,7 @@
 #include "mxrect32.h"

 // GLOBAL: LEGO1 0x101020e8
-void (*g_omniUserMessage)(const char*, int);
+void (*g_omniUserMessage)(const char*, int) = NULL;

 // FUNCTION: LEGO1 0x100b6e10
 MxBool GetRectIntersection(
--- a/LEGO1/omni/src/main/mxomni.cpp
+++ b/LEGO1/omni/src/main/mxomni.cpp
@@ -17,13 +17,13 @@
 #include "mxvideomanager.h"

 // GLOBAL: LEGO1 0x101015b8
-char g_hdPath[1024];
+char g_hdPath[1024] = "";

 // GLOBAL: LEGO1 0x101019b8
-char g_cdPath[1024];
+char g_cdPath[1024] = "E:";

 // GLOBAL: LEGO1 0x10101db8
-MxBool g_use3dSound;
+MxBool g_use3dSound = FALSE;

 // GLOBAL: LEGO1 0x101015b0
 MxOmni* MxOmni::g_instance = NULL;
--- a/tools/datacmp.py
+++ b/tools/datacmp.py
@@ -0,0 +1,341 @@
+# (New) Data comparison.
+
+import os
+import argparse
+import logging
+from enum import Enum
+from typing import Iterable, List, NamedTuple, Optional, Tuple
+from struct import unpack
+from isledecomp.compare import Compare as IsleCompare
+from isledecomp.compare.db import MatchInfo
+from isledecomp.cvdump import Cvdump
+from isledecomp.cvdump.types import (
+    CvdumpKeyError,
+    CvdumpIntegrityError,
+)
+from isledecomp.bin import Bin as IsleBin
+import colorama
+
+colorama.init()
+
+
+# Ignore all compare-db messages.
+logging.getLogger("isledecomp.compare").addHandler(logging.NullHandler())
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse and validate the command line arguments.
+
+    Unrecognized arguments are ignored (parse_known_args). If one of the
+    required paths does not exist, the parser reports the problem through
+    `parser.error`.
+
+    Returns the parsed argparse.Namespace."""
+    parser = argparse.ArgumentParser(description="Comparing data values.")
+    parser.add_argument(
+        "original", metavar="original-binary", help="The original binary"
+    )
+    parser.add_argument(
+        "recompiled", metavar="recompiled-binary", help="The recompiled binary"
+    )
+    parser.add_argument(
+        "pdb", metavar="recompiled-pdb", help="The PDB of the recompiled binary"
+    )
+    parser.add_argument(
+        "decomp_dir", metavar="decomp-dir", help="The decompiled source tree"
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Also print variables and values that match",
+    )
+    parser.add_argument(
+        "--no-color", "-n", action="store_true", help="Do not color the output"
+    )
+    parser.add_argument(
+        "--print-rec-addr",
+        action="store_true",
+        help="Print addresses of recompiled variables too",
+    )
+
+    (args, _) = parser.parse_known_args()
+
+    if not os.path.isfile(args.original):
+        parser.error(f"Original binary {args.original} does not exist")
+
+    if not os.path.isfile(args.recompiled):
+        parser.error(f"Recompiled binary {args.recompiled} does not exist")
+
+    if not os.path.isfile(args.pdb):
+        parser.error(f"Symbols PDB {args.pdb} does not exist")
+
+    if not os.path.isdir(args.decomp_dir):
+        parser.error(f"Source directory {args.decomp_dir} does not exist")
+
+    return args
+
+
+class CompareResult(Enum):
+    """Overall outcome of comparing one variable."""
+
+    # Every compared value matched
+    MATCH = 1
+    # At least one value differs and this was not a raw-only comparison
+    DIFF = 2
+    # An error message was recorded for the variable
+    ERROR = 3
+    # At least one value differs, but only raw bytes were compared
+    WARN = 4
+
+
+class ComparedOffset(NamedTuple):
+    """One compared value inside a variable."""
+
+    # Offset of the value from the start of the variable
+    offset: int
+    # name is None for scalar types
+    name: Optional[str]
+    # Whether the original and recompiled values are considered equal
+    match: bool
+    # The (original, recompiled) values used for display
+    values: Tuple[str, str]
+
+
+class ComparisonItem(NamedTuple):
+    """Comparison outcome for a single variable."""
+
+    orig_addr: int
+    recomp_addr: int
+    name: str
+
+    # What was compared: the members of a complex type, or a single
+    # entry for a scalar type. If type information could not be
+    # retrieved, this is a single entry without any specific type.
+    compared: List[ComparedOffset]
+
+    # Error message from the types parser, if there was one.
+    error: Optional[str] = None
+
+    # True when no type is specified for this variable (i.e. non-public),
+    # in which case only the raw bytes can be compared. A type id that
+    # _is_ given but cannot be retrieved is an error instead.
+    raw_only: bool = False
+
+    @property
+    def result(self) -> CompareResult:
+        """Summarize this item as a single CompareResult."""
+        if self.error is not None:
+            return CompareResult.ERROR
+
+        if any(not c.match for c in self.compared):
+            # Prefer WARN for a diff without complete type information.
+            return CompareResult.WARN if self.raw_only else CompareResult.DIFF
+
+        return CompareResult.MATCH
+
+
+def create_comparison_item(
+    var: MatchInfo,
+    compared: Optional[List[ComparedOffset]] = None,
+    error: Optional[str] = None,
+    raw_only: bool = False,
+) -> ComparisonItem:
+    """Build a ComparisonItem, taking the addresses and name from `var`.
+    A missing `compared` list becomes an empty list."""
+    compared_values = [] if compared is None else compared
+
+    return ComparisonItem(
+        var.orig_addr,
+        var.recomp_addr,
+        var.name,
+        compared_values,
+        error,
+        raw_only,
+    )
+
+
+def do_the_comparison(args: argparse.Namespace) -> Iterable[ComparisonItem]:
+    """Walk every variable in the compare DB and yield one ComparisonItem
+    for each, comparing by type where the type is known and by raw bytes
+    otherwise."""
+    with IsleBin(args.original, find_str=True) as origfile, IsleBin(
+        args.recompiled
+    ) as recompfile:
+        isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)
+
+        # TODO: We don't currently retain the type information of each variable
+        # in our compare DB. To get those, we build this mini-lookup table that
+        # maps recomp addresses to their type.
+        # We still need to build the full compare DB though, because we may
+        # need the matched symbols to compare pointers (e.g. on strings)
+        mini_cvdump = Cvdump(args.pdb).globals().types().run()
+
+        recomp_type_reference = {}
+        for g in mini_cvdump.globals:
+            if recompfile.is_valid_section(g.section):
+                abs_addr = recompfile.get_abs_addr(g.section, g.offset)
+                recomp_type_reference[abs_addr] = g.type
+
+        def pointer_display(addr: int, is_orig: bool) -> str:
+            """Helper to streamline pointer textual display."""
+            if addr == 0:
+                return "nullptr"
+
+            if is_orig:
+                ptr_match = isle_compare.get_by_orig(addr)
+            else:
+                ptr_match = isle_compare.get_by_recomp(addr)
+
+            if ptr_match is None:
+                # This variable did not match if we do not have
+                # the pointer target in our DB.
+                return f"Unknown pointer 0x{addr:x}"
+
+            return f"Pointer to {ptr_match.match_name()}"
+
+        for var in isle_compare.get_variables():
+            type_name = recomp_type_reference.get(var.recomp_addr)
+            is_type_aware = type_name is not None
+
+            if is_type_aware:
+                try:
+                    # With a type we can get the precise data size
+                    # for the variable.
+                    data_size = mini_cvdump.types.get(type_name).size
+                except (CvdumpKeyError, CvdumpIntegrityError) as ex:
+                    yield create_comparison_item(var, error=repr(ex))
+                    continue
+            else:
+                # Without a type we can only compare the raw bytes.
+                data_size = var.size
+
+            orig_raw = origfile.read(var.orig_addr, data_size)
+            recomp_raw = recompfile.read(var.recomp_addr, data_size)
+
+            # If either read exceeded the raw data size for the section,
+            # assume the entire variable is uninitialized.
+            # TODO: This is not correct, strictly speaking. However,
+            # it is probably impossible for a variable to exceed
+            # the virtual size of the section, so all that is left is
+            # the uninitialized data.
+            # If the variable falls at the end of the section like this,
+            # it is highly likely to be uninitialized.
+            if orig_raw is not None and len(orig_raw) < data_size:
+                orig_raw = None
+
+            if recomp_raw is not None and len(recomp_raw) < data_size:
+                recomp_raw = None
+
+            # Two uninitialized variables are considered equal. If only one
+            # is uninitialized this is a diff, but there is nothing to compare.
+            if orig_raw is None or recomp_raw is None:
+                init_status = tuple(
+                    "(uninitialized)" if raw is None else "(initialized)"
+                    for raw in (orig_raw, recomp_raw)
+                )
+                uninit_offset = ComparedOffset(
+                    offset=0,
+                    name=None,
+                    match=orig_raw is None and recomp_raw is None,
+                    values=init_status,
+                )
+                yield create_comparison_item(var, compared=[uninit_offset])
+                continue
+
+            if not is_type_aware:
+                # If there is no specific type information available
+                # (i.e. if this is a static or non-public variable)
+                # then we can only compare the raw bytes.
+                raw_offset = ComparedOffset(
+                    offset=0,
+                    name="(raw)",
+                    match=orig_raw == recomp_raw,
+                    values=(orig_raw, recomp_raw),
+                )
+                yield create_comparison_item(
+                    var, compared=[raw_offset], raw_only=True
+                )
+                continue
+
+            # If we are here, we can do the type-aware comparison.
+            compare_items = mini_cvdump.types.get_scalars(type_name)
+            format_str = mini_cvdump.types.get_format_string(type_name)
+
+            orig_data = unpack(format_str, orig_raw)
+            recomp_data = unpack(format_str, recomp_raw)
+
+            compared = []
+            for i, member in enumerate(compare_items):
+                orig_value = orig_data[i]
+                recomp_value = recomp_data[i]
+
+                if member.is_pointer:
+                    match = isle_compare.is_pointer_match(orig_value, recomp_value)
+                    values = (
+                        pointer_display(orig_value, True),
+                        pointer_display(recomp_value, False),
+                    )
+                else:
+                    match = orig_value == recomp_value
+                    values = (orig_value, recomp_value)
+
+                compared.append(
+                    ComparedOffset(
+                        offset=member.offset,
+                        name=member.name,
+                        match=match,
+                        values=values,
+                    )
+                )
+
+            yield create_comparison_item(var, compared=compared)
+
+
+def value_get(value: Optional[str], default: str):
+    """Return `value`, or `default` when `value` is None."""
+    if value is None:
+        return default
+
+    return value
+
+
+def main():
+    """Run the comparison and print a report for each variable.
+    Matching variables and values are only printed in verbose mode."""
+    args = parse_args()
+
+    def display_match(result: CompareResult) -> str:
+        """Helper to return color string or not, depending on user preference"""
+        if args.no_color:
+            return result.name
+
+        if result == CompareResult.MATCH:
+            match_color = colorama.Fore.GREEN
+        elif result == CompareResult.WARN:
+            match_color = colorama.Fore.YELLOW
+        else:
+            match_color = colorama.Fore.RED
+
+        return f"{match_color}{result.name}{colorama.Style.RESET_ALL}"
+
+    for item in do_the_comparison(args):
+        if item.result == CompareResult.MATCH and not args.verbose:
+            continue
+
+        address_display = f"0x{item.orig_addr:x}"
+        if args.print_rec_addr:
+            address_display = f"0x{item.orig_addr:x} / 0x{item.recomp_addr:x}"
+
+        print(f"{item.name[:80]} ({address_display}) ... {display_match(item.result)} ")
+        if item.error is not None:
+            print(f"  {item.error}")
+
+        for c in item.compared:
+            if c.match and not args.verbose:
+                continue
+
+            (value_a, value_b) = c.values
+            line = f"  {c.offset:5} {value_get(c.name, '(value)'):30} {value_a}"
+            if not c.match:
+                # Show the recompiled value only when it differs.
+                line = f"{line} : {value_b}"
+
+            print(line)
+
+        print()
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@@ -95,6 +95,7 @@ def _load_cvdump(self):
            .publics()
            .symbols()
            .section_contributions()
+            .types()
            .run()
        )
        res = CvdumpAnalysis(cv)
@@ -454,6 +455,25 @@ def _compare_match(self, match: MatchInfo) -> Optional[DiffReport]:

    ## Public API

+    def is_pointer_match(self, orig_addr, recomp_addr) -> bool:
+        """Check whether these pointers point at the same thing.
+        Two null pointers match. Otherwise the original address must be
+        in the DB and its entry must have the given recomp address."""
+
+        # Null pointers considered matching
+        if (orig_addr, recomp_addr) == (0, 0):
+            return True
+
+        match = self._db.get_by_orig(orig_addr)
+        return match is not None and match.recomp_addr == recomp_addr
+
+    def get_by_orig(self, addr: int) -> Optional[MatchInfo]:
+        """Return the DB entry for the given original address, if any."""
+        return self._db.get_by_orig(addr)
+
+    def get_by_recomp(self, addr: int) -> Optional[MatchInfo]:
+        """Return the DB entry for the given recompiled address, if any."""
+        return self._db.get_by_recomp(addr)
+
    def get_all(self) -> List[MatchInfo]:
        return self._db.get_all()

@@ -463,6 +483,9 @@ def get_functions(self) -> List[MatchInfo]:
    def get_vtables(self) -> List[MatchInfo]:
        return self._db.get_matches_by_type(SymbolType.VTABLE)

+    def get_variables(self) -> List[MatchInfo]:
+        """Return the DB matches whose symbol type is DATA."""
+        return self._db.get_matches_by_type(SymbolType.DATA)
+
    def compare_address(self, addr: int) -> Optional[DiffReport]:
        match = self._db.get_one_match(addr)
        if match is None:
--- a/tools/isledecomp/isledecomp/cvdump/__init__.py
+++ b/tools/isledecomp/isledecomp/cvdump/__init__.py
@@ -1,3 +1,4 @@
 from .analysis import CvdumpAnalysis
 from .parser import CvdumpParser
 from .runner import Cvdump
+from .types import CvdumpTypesParser
--- a/tools/isledecomp/isledecomp/cvdump/analysis.py
+++ b/tools/isledecomp/isledecomp/cvdump/analysis.py
@@ -1,45 +1,9 @@
 """For collating the results from parsing cvdump.exe into a more directly useful format."""
-from typing import List, Optional, Tuple
+from typing import List, Optional
 from isledecomp.types import SymbolType
 from .parser import CvdumpParser
 from .demangler import demangle_string_const, demangle_vtable
-
-
-def data_type_info(type_name: str) -> Optional[Tuple[int, bool]]:
-    """cvdump type aliases are listed here:
-    https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
-    For the given type, return tuple(size, is_pointer) if possible."""
-    # pylint: disable=too-many-return-statements
-    # TODO: refactor to be as simple as possble
-
-    # Ignore complex types. We can get the size of those from the TYPES section.
-    if not type_name.startswith("T"):
-        return None
-
-    # if 32-bit pointer
-    if type_name.startswith("T_32P"):
-        return (4, True)
-
-    if type_name.endswith("QUAD") or type_name.endswith("64"):
-        return (8, False)
-
-    if (
-        type_name.endswith("LONG")
-        or type_name.endswith("INT4")
-        or type_name.endswith("32")
-    ):
-        return (4, False)
-
-    if type_name.endswith("SHORT") or type_name.endswith("WCHAR"):
-        return (2, False)
-
-    if "CHAR" in type_name:
-        return (1, False)
-
-    if type_name in ("T_NOTYPE", "T_VOID"):
-        return (0, False)
-
-    return None
+from .types import CvdumpKeyError, CvdumpIntegrityError


 class CvdumpNode:
@@ -146,11 +110,21 @@ def __init__(self, parser: CvdumpParser):
            node_dict[key].node_type = SymbolType.DATA
            node_dict[key].friendly_name = glo.name

-            if (g_info := data_type_info(glo.type)) is not None:
-                (size, is_pointer) = g_info
-                node_dict[key].confirmed_size = size
-                if is_pointer:
-                    node_dict[key].node_type = SymbolType.POINTER
+            try:
+                # Check our types database for type information.
+                # If we did not parse the TYPES section, we can only
+                # get information for built-in "T_" types.
+                g_info = parser.types.get(glo.type)
+                node_dict[key].confirmed_size = g_info.size
+                # Previously we set the symbol type to POINTER here if
+                # the variable was known to be a pointer. We can derive this
+                # information later when it's time to compare the variable,
+                # so let's set these to symbol type DATA instead.
+                # POINTER will be reserved for non-variable pointer data.
+                # e.g. thunks, unwind section.
+            except (CvdumpKeyError, CvdumpIntegrityError):
+                # No big deal if we don't have complete type information.
+                pass

        for lin in parser.lines:
            key = (lin.section, lin.offset)
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@@ -1,9 +1,10 @@
 import re
 from typing import Iterable, Tuple
 from collections import namedtuple
+from .types import CvdumpTypesParser

 # e.g. `*** PUBLICS`
-_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)")
+_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)$")

 # e.g. `     27 00034EC0     28 00034EE2     29 00034EE7     30 00034EF4`
 _line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")
@@ -76,6 +77,8 @@ def __init__(self) -> None:
        self.globals = []
        self.modules = []

+        self.types = CvdumpTypesParser()
+
    def _lines_section(self, line: str):
        """Parsing entries from the LINES section. We only care about the pairs of
        line_number and address and the subsection header to indicate which code file
@@ -198,6 +201,9 @@ def read_line(self, line: str):
        elif self._section == "MODULES":
            self._modules_section(line)

+        elif self._section == "TYPES":
+            self.types.read_line(line)
+
    def read_lines(self, lines: Iterable[str]):
        for line in lines:
            self.read_line(line)
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@@ -14,6 +14,7 @@ class DumpOpt(Enum):
    PUBLICS = 3
    SECTION_CONTRIB = 4
    MODULES = 5
+    TYPES = 6


 cvdump_opt_map = {
@@ -23,6 +24,7 @@ class DumpOpt(Enum):
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
    DumpOpt.MODULES: "-m",
+    DumpOpt.TYPES: "-t",
 }


@@ -55,6 +57,10 @@ def modules(self):
        self._options.add(DumpOpt.MODULES)
        return self

+    def types(self):
+        """Add the TYPES option to the dump and return self."""
+        self._options.add(DumpOpt.TYPES)
+        return self
+
    def cmd_line(self) -> List[str]:
        cvdump_exe = lib_path_join("cvdump.exe")
        flags = [cvdump_opt_map[opt] for opt in self._options]
--- a/tools/isledecomp/isledecomp/cvdump/types.py
+++ b/tools/isledecomp/isledecomp/cvdump/types.py
@@ -0,0 +1,433 @@
+import re
+from typing import Dict, Iterator, List, NamedTuple, Optional
+
+
+class CvdumpTypeError(Exception):
+    """Raised when a type entry is not of the kind an operation expects."""
+
+
+class CvdumpKeyError(KeyError):
+    """Raised when a type key is not found among the parsed types."""
+
+
+class CvdumpIntegrityError(Exception):
+    """Raised when a parsed type entry lacks data needed to resolve it."""
+
+
+class FieldListItem(NamedTuple):
+    """Member of a class or structure"""
+
+    # Offset of the member within its parent type
+    offset: int
+    # Member name; array elements are named by index, e.g. "[0]"
+    name: str
+    # Type key of the member
+    type: str
+
+
+class ScalarType(NamedTuple):
+    """A single scalar value found inside a (possibly complex) type."""
+
+    # Offset of the scalar from the start of the outermost type
+    offset: int
+    # Joined member name; None when the type itself is a scalar
+    name: Optional[str]
+    # Type key of the scalar
+    type: str
+
+    @property
+    def size(self) -> int:
+        """Size of the scalar, as given by scalar_type_size."""
+        return scalar_type_size(self.type)
+
+    @property
+    def format_char(self) -> str:
+        """Format character for the scalar, as given by scalar_type_format_char."""
+        return scalar_type_format_char(self.type)
+
+    @property
+    def is_pointer(self) -> bool:
+        """Whether the scalar is a pointer, as given by scalar_type_pointer."""
+        return scalar_type_pointer(self.type)
+
+
+class TypeInfo(NamedTuple):
+    """Description of a type in a consistent format."""
+
+    # Type key: a built-in "T_" name or a hex type id
+    key: str
+    size: int
+    name: Optional[str] = None
+    # None for scalar types; otherwise the members of the type
+    members: Optional[List[FieldListItem]] = None
+
+    def is_scalar(self) -> bool:
+        """True if this type has no member list."""
+        # TODO: distinction between a class with zero members and no vtable?
+        return self.members is None
+
+
+def normalize_type_id(key: str) -> str:
+    """Bring a type id from the TYPES section into a consistent format.
+    Built-in types start with "T_" and are returned without their
+    trailing numeric value in parentheses. Anything else is read as a hex
+    string and returned in lower case with no leading zeroes.
+    (UDT identifier pads to 8 characters.)"""
+    if not key.startswith("T_"):
+        return hex(int(key, 16)).lower()
+
+    # Remove numeric value for "T_" type. We don't use this.
+    type_name, _, _ = key.partition("(")
+    return type_name
+
+
+def scalar_type_pointer(type_name: str) -> bool:
+    """True if the built-in type name denotes a 32-bit pointer."""
+    return type_name.startswith("T_32P")
+
+
+# Built-in types whose width is spelled out in the name. cvinfo.h counts
+# bytes for T_INT*/T_UINT* and bits for T_BOOL*/T_CHAR*, so neither the
+# keyword checks nor the 4-byte default in scalar_type_size fit them.
+_EXPLICIT_WIDTH_SIZES = {
+    "T_INT1": 1,
+    "T_UINT1": 1,
+    "T_BOOL08": 1,
+    "T_CHAR8": 1,
+    "T_INT2": 2,
+    "T_UINT2": 2,
+    "T_BOOL16": 2,
+    "T_CHAR16": 2,
+    "T_CHAR32": 4,
+    "T_INT8": 8,
+    "T_UINT8": 8,
+}
+
+
+def scalar_type_size(type_name: str) -> int:
+    """Return the size in bytes of the given built-in ("T_") type.
+    Types that are not recognized are assumed to be 4 bytes."""
+    if scalar_type_pointer(type_name):
+        return 4
+
+    explicit_size = _EXPLICIT_WIDTH_SIZES.get(type_name)
+    if explicit_size is not None:
+        return explicit_size
+
+    if "CHAR" in type_name:
+        return 2 if "WCHAR" in type_name else 1
+
+    if "SHORT" in type_name:
+        return 2
+
+    if "QUAD" in type_name or "64" in type_name:
+        return 8
+
+    return 4
+
+
+def scalar_type_signed(type_name: str) -> bool:
+    """Whether the built-in type is treated as signed.
+    Pointers and names starting with "T_U" or "T_W" are unsigned."""
+    if scalar_type_pointer(type_name):
+        return False
+
+    # According to cvinfo.h, T_WCHAR is unsigned
+    return not type_name.startswith("T_U") and not type_name.startswith("T_W")
+
+
+def scalar_type_format_char(type_name: str) -> str:
+    """Return the struct format character used to unpack the given
+    built-in type. Integer types use the lower case character when
+    signed and the upper case one when unsigned."""
+    if scalar_type_pointer(type_name):
+        return "L"
+
+    # "Really a char"
+    if type_name.startswith("T_RCHAR"):
+        return "c"
+
+    # floats
+    if type_name.startswith("T_REAL"):
+        return "d" if "64" in type_name else "f"
+
+    size = scalar_type_size(type_name)
+    char = ({1: "b", 2: "h", 4: "l", 8: "q"}).get(size, "l")
+
+    return char if scalar_type_signed(type_name) else char.upper()
+
+
+def member_string_iter(
+    members: List[ScalarType], size: Optional[int] = None
+) -> Iterator[str]:
+    """Yield the pieces of a struct format string for the given scalars.
+    Each member yields its format character, preceded by "x" pad bytes
+    for any gap since the previous member. If `size` is given, pad bytes
+    are yielded at the end to reach that size.
+    With no members, a single run of `size` pad bytes is yielded."""
+    if len(members) == 0:
+        yield "x" * (size or 0)
+        # Stop here, or the trailing padding below is yielded a second time.
+        return
+
+    last_offset = 0
+    last_size = 0
+    for m in members:
+        padding = m.offset - last_offset - last_size
+        if padding > 0:
+            yield "x" * padding
+
+        yield m.format_char
+        last_offset = m.offset
+        last_size = m.size
+
+    if size is not None:
+        padding = size - (last_offset + last_size)
+        if padding > 0:
+            yield "x" * padding
+
+
+def member_list_to_struct_string(
+    members: List[ScalarType], size: Optional[int] = None
+) -> str:
+    """Build the format string for struct.unpack from a list of scalars.
+    If `size` is given, the string is padded to that many bytes.
+    With no members the result is only pad bytes."""
+    if len(members) < 1:
+        return "x" * (size or 0)
+
+    pieces = "".join(member_string_iter(members, size))
+    return "<" + pieces if pieces else ""
+
+
+def join_member_names(parent: str, child: Optional[str]) -> str:
+    """Combine a parent member name with the name of one of its children.
+    The child name is None if the child is a scalar type, in which case
+    the parent name is returned unchanged."""
+
+    if child is None:
+        return parent
+
+    # An array index is appended directly; anything else gets a dot.
+    separator = "" if child.startswith("[") else "."
+    return f"{parent}{separator}{child}"
+
+
+class CvdumpTypesParser:
+    """Parser for cvdump output, TYPES section.
+    Tricky enough that it demands its own parser."""
+
+    # Marks the start of a new type
+    INDEX_RE = re.compile(r"(?P<key>0x\w+) : .* (?P<type>LF_\w+)")
+
+    # LF_FIELDLIST class/struct member (1/2)
+    LIST_RE = re.compile(
+        r"\s+list\[\d+\] = LF_MEMBER, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
+    )
+
+    # LF_FIELDLIST vtable indicator
+    VTABLE_RE = re.compile(r"^\s+list\[\d+\] = LF_VFUNCTAB")
+
+    # LF_FIELDLIST superclass indicator
+    SUPERCLASS_RE = re.compile(
+        r"^\s+list\[\d+\] = LF_BCLASS, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
+    )
+
+    # LF_FIELDLIST member name (2/2)
+    MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")
+
+    # LF_ARRAY element type
+    ARRAY_ELEMENT_RE = re.compile(r"^\s+Element type = (?P<type>.*)")
+
+    # LF_ARRAY total array size
+    ARRAY_LENGTH_RE = re.compile(r"^\s+length = (?P<length>\d+)")
+
+    # LF_CLASS/LF_STRUCTURE field list reference
+    CLASS_FIELD_RE = re.compile(
+        r"^\s+# members = \d+,  field list type (?P<field_type>0x\w+),"
+    )
+
+    # LF_CLASS/LF_STRUCTURE name and other info
+    CLASS_NAME_RE = re.compile(
+        r"^\s+Size = (?P<size>\d+), class name = (?P<name>.+), UDT\((?P<udt>0x\w+)\)"
+    )
+
+    # LF_MODIFIER, type being modified
+    MODIFIES_RE = re.compile(r".*modifies type (?P<type>.*)$")
+
+    def __init__(self) -> None:
+        # The "type" of the type entry being read, e.g. LF_CLASS
+        self.mode = ""
+        # Id of the type entry being read
+        self.last_key = ""
+        # Raw values read for each type, keyed by type id
+        self.keys = {}
+
+    def _new_type(self):
+        """Prepare a new dict for the type we just parsed.
+        The id is self.last_key and the "type" of type is self.mode.
+        e.g. LF_CLASS"""
+        # Any dict already stored under this id is replaced.
+        self.keys[self.last_key] = {"type": self.mode}
+
+    def _set(self, key: str, value):
+        """Store a value on the dict of the type identified by self.last_key."""
+        self.keys[self.last_key][key] = value
+
+    def _add_member(self, offset: int, type_: str):
+        """Append a member to the type identified by self.last_key,
+        creating its member list on first use."""
+        member_list = self.keys[self.last_key].setdefault("members", [])
+        member_list.append({"offset": offset, "type": type_})
+
+    def _set_member_name(self, name: str):
+        """Set name for most recently added member."""
+        latest_member = self.keys[self.last_key]["members"][-1]
+        latest_member["name"] = name
+
+    def _get_field_list(self, type_obj: Dict) -> List[FieldListItem]:
+        """Return the field list for the given LF_CLASS/LF_STRUCTURE reference.
+        Members of the superclass, if any, are included. The result is
+        sorted by offset.
+
+        Raises CvdumpKeyError if the referenced field list is not among
+        the parsed types."""
+
+        if type_obj.get("type") == "LF_FIELDLIST":
+            field_obj = type_obj
+        else:
+            field_list_type = type_obj.get("field_list_type")
+            field_obj = self.keys.get(field_list_type)
+            if field_obj is None:
+                # Use our own KeyError subclass so that callers which
+                # handle CvdumpKeyError also handle a missing field list.
+                raise CvdumpKeyError(field_list_type)
+
+        members: List[FieldListItem] = []
+
+        super_id = field_obj.get("super")
+        if super_id is not None:
+            # May need to resolve forward ref.
+            superclass = self.get(super_id)
+            if superclass.members is not None:
+                # Copy, so the superclass list is not extended in place.
+                members = list(superclass.members)
+
+        raw_members = field_obj.get("members", [])
+        members += [
+            FieldListItem(
+                offset=m["offset"],
+                type=m["type"],
+                name=m["name"],
+            )
+            for m in raw_members
+        ]
+
+        return sorted(members, key=lambda m: m.offset)
+
+    def _mock_array_members(self, type_obj: Dict) -> List[FieldListItem]:
+        """Build the list of "members" for an LF_ARRAY as if it was a struct.
+        The array provides only its element type and its total size, so
+        one member is created per element, named by its index.
+
+        Raises CvdumpTypeError if the type is not an LF_ARRAY and
+        CvdumpIntegrityError if it has no element type."""
+
+        if type_obj.get("type") != "LF_ARRAY":
+            raise CvdumpTypeError("Type is not an LF_ARRAY")
+
+        element_type = type_obj.get("array_type")
+        if element_type is None:
+            raise CvdumpIntegrityError("No array element type")
+
+        element_size = self.get(element_type).size
+        element_count = type_obj["size"] // element_size
+
+        elements = []
+        for index in range(element_count):
+            elements.append(
+                FieldListItem(
+                    offset=index * element_size,
+                    type=element_type,
+                    name=f"[{index}]",
+                )
+            )
+
+        return elements
+
+    def get(self, type_key: str) -> TypeInfo:
+        """Return the TypeInfo for the given type key, built from the
+        dictionary values read from the cvdump output.
+
+        Raises CvdumpKeyError if the key is not known and
+        CvdumpIntegrityError if a forward reference has no target."""
+
+        # Built-in scalar. Handling it here keeps the recursive steps simple.
+        if type_key.startswith("T_"):
+            return TypeInfo(key=type_key, size=scalar_type_size(type_key))
+
+        # Go to our dictionary to find it.
+        obj = self.keys.get(type_key.lower())
+        if obj is None:
+            raise CvdumpKeyError(type_key)
+
+        leaf_type = obj.get("type")
+
+        # These type references are just a wrapper around a scalar
+        if leaf_type == "LF_ENUM":
+            return self.get("T_INT4")
+
+        if leaf_type == "LF_POINTER":
+            return self.get("T_32PVOID")
+
+        if obj.get("is_forward_ref", False):
+            # Follow the forward reference: the UDT value for
+            # LF_CLASS/LF_STRUCTURE, the modified type for LF_MODIFIER.
+            forward_ref = obj.get("udt", None) or obj.get("modifies", None)
+            if forward_ref is None:
+                raise CvdumpIntegrityError(f"Null forward ref for type {type_key}")
+
+            return self.get(forward_ref)
+
+        # Not a forward reference, so build out the object here.
+        if leaf_type == "LF_ARRAY":
+            build_members = self._mock_array_members
+        else:
+            build_members = self._get_field_list
+
+        members = build_members(obj)
+
+        return TypeInfo(
+            key=type_key,
+            size=obj.get("size"),
+            name=obj.get("name"),
+            members=members,
+        )
+
+    def get_by_name(self, name: str) -> TypeInfo:
+        """Find the complex type with the given name.
+        Not implemented yet: always raises NotImplementedError."""
+        # TODO
+        raise NotImplementedError
+
+    def get_scalars(self, type_key: str) -> List[ScalarType]:
+        """Flatten the given type into a list of scalars so that each
+        component value can be compared. Offsets are relative to the
+        start of the given type and names are joined with their parents."""
+
+        type_info = self.get(type_key)
+        if type_info.is_scalar():
+            # Use the resolved key here for alias types like LF_POINTER
+            return [ScalarType(offset=0, type=type_info.key, name=None)]
+
+        # mypy?
+        assert type_info.members is not None
+
+        # Dedupe repeated offsets if this is a union type
+        by_offset = {}
+        for member in type_info.members:
+            by_offset[member.offset] = member
+
+        scalars = []
+        for member in by_offset.values():
+            for child in self.get_scalars(member.type):
+                scalars.append(
+                    ScalarType(
+                        offset=member.offset + child.offset,
+                        type=child.type,
+                        name=join_member_names(member.name, child.name),
+                    )
+                )
+
+        return scalars
+
+    def get_format_string(self, type_key: str) -> str:
+        """Return the struct format string describing the given type.
+
+        The string is built by member_list_to_struct_string from the
+        flattened scalar list and the total size of the type."""
+        type_info = self.get(type_key)
+        scalars = self.get_scalars(type_key)
+        # The total size is passed along with the scalars so that the
+        # data can be padded to size.
+        return member_list_to_struct_string(scalars, type_info.size)
+
+    def read_line(self, line: str):
+        """Process one line of cvdump TYPES output and update the parser state.
+
+        A line matching INDEX_RE begins a new type record. Its leaf type is
+        remembered in self.mode, and that mode decides which of the patterns
+        below are tried against this line and the lines that follow it."""
+        # Start of a new type record: remember its key and leaf type.
+        if (match := self.INDEX_RE.match(line)) is not None:
+            self.last_key = normalize_type_id(match.group("key"))
+            self.mode = match.group("type")
+            self._new_type()
+
+            # We don't need to read anything else from here (for now)
+            # Both of these are treated as 4 bytes wide.
+            if self.mode in ("LF_ENUM", "LF_POINTER"):
+                self._set("size", 4)
+
+        if self.mode == "LF_MODIFIER":
+            if (match := self.MODIFIES_RE.match(line)) is not None:
+                # For convenience, because this is essentially the same thing
+                # as an LF_CLASS forward ref.
+                self._set("is_forward_ref", True)
+                self._set("modifies", normalize_type_id(match.group("type")))
+
+        if self.mode == "LF_ARRAY":
+            # Type of a single array element.
+            if (match := self.ARRAY_ELEMENT_RE.match(line)) is not None:
+                self._set("array_type", normalize_type_id(match.group("type")))
+
+            # The "length" value from the dump is stored as the size of the array.
+            if (match := self.ARRAY_LENGTH_RE.match(line)) is not None:
+                self._set("size", int(match.group("length")))
+
+        if self.mode == "LF_FIELDLIST":
+            # If this class has a vtable, create a mock member at offset 0
+            if (match := self.VTABLE_RE.match(line)) is not None:
+                # For our purposes, any pointer type will do
+                self._add_member(0, "T_32PVOID")
+                self._set_member_name("vftable")
+
+            # Superclass is set here in the fieldlist rather than in LF_CLASS
+            if (match := self.SUPERCLASS_RE.match(line)) is not None:
+                self._set("super", normalize_type_id(match.group("type")))
+
+            # Member offset and type given on the first of two lines.
+            if (match := self.LIST_RE.match(line)) is not None:
+                self._add_member(
+                    int(match.group("offset")), normalize_type_id(match.group("type"))
+                )
+
+            # Name of the member read on the second of two lines.
+            if (match := self.MEMBER_RE.match(line)) is not None:
+                self._set_member_name(match.group("name"))
+
+        if self.mode in ("LF_STRUCTURE", "LF_CLASS"):
+            # Match the reference to the associated LF_FIELDLIST
+            if (match := self.CLASS_FIELD_RE.match(line)) is not None:
+                # A field list type of 0x0000 marks this record as a forward ref.
+                if match.group("field_type") == "0x0000":
+                    # Not redundant. UDT might not match the key.
+                    # These cases get reported as UDT mismatch.
+                    self._set("is_forward_ref", True)
+                else:
+                    field_list_type = normalize_type_id(match.group("field_type"))
+                    self._set("field_list_type", field_list_type)
+
+            # Last line has the vital information.
+            # If this is a FORWARD REF, we need to follow the UDT pointer
+            # to get the actual class details.
+            if (match := self.CLASS_NAME_RE.match(line)) is not None:
+                self._set("name", match.group("name"))
+                self._set("udt", normalize_type_id(match.group("udt")))
+                self._set("size", int(match.group("size")))
--- a/tools/isledecomp/tests/test_cvdump.py
+++ b/tools/isledecomp/tests/test_cvdump.py
@ -1,39 +1,59 @@
 import pytest
-from isledecomp.cvdump.analysis import data_type_info
+from isledecomp.cvdump.types import (
+    scalar_type_size,
+    scalar_type_pointer,
+    scalar_type_signed,
+)
+
+# These are all the types seen in the cvdump.
+# We have char, short, int, long, and long long represented in both signed
+# and unsigned forms, plus float and double (which are always signed).
+# We can also identify a 4 byte pointer with the T_32 prefix.
+# The type T_VOID is used to designate a function's return type.
+# T_NOTYPE is specified as the type of "this" for a static function in a class.
+
+# For reference: https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h

 # fmt: off
-type_check_cases = [
-    ("T_32PINT4",      4, True),
-    ("T_32PLONG",      4, True),
-    ("T_32PRCHAR",     4, True),
-    ("T_32PREAL32",    4, True),
-    ("T_32PUCHAR",     4, True),
-    ("T_32PUINT4",     4, True),
-    ("T_32PULONG",     4, True),
-    ("T_32PUSHORT",    4, True),
-    ("T_32PVOID",      4, True),
-    ("T_CHAR",         1, False),
-    ("T_INT4",         4, False),
-    ("T_LONG",         4, False),
-    ("T_NOTYPE",       0, False),  # ?
-    ("T_QUAD",         8, False),
-    ("T_RCHAR",        1, False),
-    ("T_REAL32",       4, False),
-    ("T_REAL64",       8, False),
-    ("T_SHORT",        2, False),
-    ("T_UCHAR",        1, False),
-    ("T_UINT4",        4, False),
-    ("T_ULONG",        4, False),
-    ("T_UQUAD",        8, False),
-    ("T_USHORT",       2, False),
-    ("T_VOID",         0, False),  # ?
-    ("T_WCHAR",        2, False),
-]
+# Fields are: type_name, size, is_signed, is_pointer
+type_check_cases = (
+    ("T_32PINT4",      4,  False,  True),
+    ("T_32PLONG",      4,  False,  True),
+    ("T_32PRCHAR",     4,  False,  True),
+    ("T_32PREAL32",    4,  False,  True),
+    ("T_32PUCHAR",     4,  False,  True),
+    ("T_32PUINT4",     4,  False,  True),
+    ("T_32PULONG",     4,  False,  True),
+    ("T_32PUSHORT",    4,  False,  True),
+    ("T_32PVOID",      4,  False,  True),
+    ("T_CHAR",         1,  True,   False),
+    ("T_INT4",         4,  True,   False),
+    ("T_LONG",         4,  True,   False),
+    ("T_QUAD",         8,  True,   False),
+    ("T_RCHAR",        1,  True,   False),
+    ("T_REAL32",       4,  True,   False),
+    ("T_REAL64",       8,  True,   False),
+    ("T_SHORT",        2,  True,   False),
+    ("T_UCHAR",        1,  False,  False),
+    ("T_UINT4",        4,  False,  False),
+    ("T_ULONG",        4,  False,  False),
+    ("T_UQUAD",        8,  False,  False),
+    ("T_USHORT",       2,  False,  False),
+    ("T_WCHAR",        2,  False,  False),
+)
 # fmt: on


-@pytest.mark.parametrize("type_name, size, is_pointer", type_check_cases)
-def test_type_check(type_name: str, size: int, is_pointer: bool):
-    assert (info := data_type_info(type_name)) is not None
-    assert info[0] == size
-    assert info[1] == is_pointer
+@pytest.mark.parametrize("type_name, size, _, __", type_check_cases)
+def test_scalar_size(type_name: str, size: int, _, __):
+    """Each scalar type name should report the size listed in the table."""
+    reported_size = scalar_type_size(type_name)
+    assert reported_size == size
+
+
+@pytest.mark.parametrize("type_name, _, is_signed, __", type_check_cases)
+def test_scalar_signed(type_name: str, _, is_signed: bool, __):
+    """Each scalar type name should report the signedness listed in the table."""
+    reported_signed = scalar_type_signed(type_name)
+    assert reported_signed == is_signed
+
+
+@pytest.mark.parametrize("type_name, _, __, is_pointer", type_check_cases)
+def test_scalar_pointer(type_name: str, _, __, is_pointer: bool):
+    """Each scalar type name should report the pointer flag listed in the table."""
+    reported_pointer = scalar_type_pointer(type_name)
+    assert reported_pointer == is_pointer
--- a/tools/isledecomp/tests/test_cvdump_types.py
+++ b/tools/isledecomp/tests/test_cvdump_types.py
@ -0,0 +1,452 @@
+"""Specifically testing the Cvdump TYPES parser
+and type dependency tree walker."""
+
+import pytest
+from isledecomp.cvdump.types import (
+    CvdumpTypesParser,
+    CvdumpKeyError,
+    CvdumpIntegrityError,
+)
+
+TEST_LINES = """
+0x1028 : Length = 10, Leaf = 0x1001 LF_MODIFIER
+    const, modifies type T_REAL32(0040)
+
+0x103b : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = T_REAL32(0040)
+    Index type = T_SHORT(0011)
+    length = 16
+    Name =
+
+0x103c : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = 0x103B
+    Index type = T_SHORT(0011)
+    length = 64
+    Name =
+
+0x10e0 : Length = 86, Leaf = 0x1203 LF_FIELDLIST
+    list[0] = LF_MEMBER, public, type = T_REAL32(0040), offset = 0
+        member name = 'x'
+    list[1] = LF_MEMBER, public, type = T_REAL32(0040), offset = 0
+        member name = 'dvX'
+    list[2] = LF_MEMBER, public, type = T_REAL32(0040), offset = 4
+        member name = 'y'
+    list[3] = LF_MEMBER, public, type = T_REAL32(0040), offset = 4
+        member name = 'dvY'
+    list[4] = LF_MEMBER, public, type = T_REAL32(0040), offset = 8
+        member name = 'z'
+    list[5] = LF_MEMBER, public, type = T_REAL32(0040), offset = 8
+        member name = 'dvZ'
+
+0x10e1 : Length = 34, Leaf = 0x1505 LF_STRUCTURE
+    # members = 6,  field list type 0x10e0,
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 12, class name = _D3DVECTOR, UDT(0x000010e1)
+
+0x10e4 : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = T_UCHAR(0020)
+    Index type = T_SHORT(0011)
+    length = 8
+    Name = 
+
+0x10ea : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = 0x1028
+    Index type = T_SHORT(0011)
+    length = 12
+    Name = 
+
+0x11f0 : Length = 30, Leaf = 0x1504 LF_CLASS
+    # members = 0,  field list type 0x0000, FORWARD REF, 
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 0, class name = MxRect32, UDT(0x00001214)
+
+0x11f2 : Length = 10, Leaf = 0x1001 LF_MODIFIER
+    const, modifies type 0x11F0
+
+0x1213 : Length = 530, Leaf = 0x1203 LF_FIELDLIST
+    list[0] = LF_METHOD, count = 5, list = 0x1203, name = 'MxRect32'
+    list[1] = LF_ONEMETHOD, public, VANILLA, index = 0x1205, name = 'operator='
+    list[2] = LF_ONEMETHOD, public, VANILLA, index = 0x11F5, name = 'Intersect'
+    list[3] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'SetPoint'
+    list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'AddPoint'
+    list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x1207, name = 'SubtractPoint'
+    list[6] = LF_ONEMETHOD, public, VANILLA, index = 0x11F5, name = 'UpdateBounds'
+    list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x1209, name = 'IsValid'
+    list[8] = LF_ONEMETHOD, public, VANILLA, index = 0x120A, name = 'IntersectsWith'
+    list[9] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetWidth'
+    list[10] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetHeight'
+    list[11] = LF_ONEMETHOD, public, VANILLA, index = 0x120C, name = 'GetPoint'
+    list[12] = LF_ONEMETHOD, public, VANILLA, index = 0x120D, name = 'GetSize'
+    list[13] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetLeft'
+    list[14] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetTop'
+    list[15] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetRight'
+    list[16] = LF_ONEMETHOD, public, VANILLA, index = 0x120B, name = 'GetBottom'
+    list[17] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetLeft'
+    list[18] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetTop'
+    list[19] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetRight'
+    list[20] = LF_ONEMETHOD, public, VANILLA, index = 0x120E, name = 'SetBottom'
+    list[21] = LF_METHOD, count = 3, list = 0x1211, name = 'CopyFrom'
+    list[22] = LF_ONEMETHOD, private, STATIC, index = 0x1212, name = 'Min'
+    list[23] = LF_ONEMETHOD, private, STATIC, index = 0x1212, name = 'Max'
+    list[24] = LF_MEMBER, private, type = T_INT4(0074), offset = 0
+        member name = 'm_left'
+    list[25] = LF_MEMBER, private, type = T_INT4(0074), offset = 4
+        member name = 'm_top'
+    list[26] = LF_MEMBER, private, type = T_INT4(0074), offset = 8
+        member name = 'm_right'
+    list[27] = LF_MEMBER, private, type = T_INT4(0074), offset = 12
+        member name = 'm_bottom'
+
+0x1214 : Length = 30, Leaf = 0x1504 LF_CLASS
+    # members = 34,  field list type 0x1213, CONSTRUCTOR, OVERLOAD, 
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 16, class name = MxRect32, UDT(0x00001214)
+
+0x1220 : Length = 30, Leaf = 0x1504 LF_CLASS
+    # members = 0,  field list type 0x0000, FORWARD REF, 
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 0, class name = MxCore, UDT(0x00004060)
+
+0x14db : Length = 30, Leaf = 0x1504 LF_CLASS
+    # members = 0,  field list type 0x0000, FORWARD REF, 
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 0, class name = MxString, UDT(0x00004db6)
+
+0x19b0 : Length = 34, Leaf = 0x1505 LF_STRUCTURE
+    # members = 0,  field list type 0x0000, FORWARD REF, 
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 0, class name = ROIColorAlias, UDT(0x00002a76)
+
+0x19b1 : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = 0x19B0
+    Index type = T_SHORT(0011)
+    length = 440
+    Name =
+
+0x2a75 : Length = 98, Leaf = 0x1203 LF_FIELDLIST
+    list[0] = LF_MEMBER, public, type = T_32PRCHAR(0470), offset = 0
+        member name = 'm_name'
+    list[1] = LF_MEMBER, public, type = T_INT4(0074), offset = 4
+        member name = 'm_red'
+    list[2] = LF_MEMBER, public, type = T_INT4(0074), offset = 8
+        member name = 'm_green'
+    list[3] = LF_MEMBER, public, type = T_INT4(0074), offset = 12
+        member name = 'm_blue'
+    list[4] = LF_MEMBER, public, type = T_INT4(0074), offset = 16
+        member name = 'm_unk0x10'
+
+0x2a76 : Length = 34, Leaf = 0x1505 LF_STRUCTURE
+    # members = 5,  field list type 0x2a75, 
+    Derivation list type 0x0000, VT shape type 0x0000
+    Size = 20, class name = ROIColorAlias, UDT(0x00002a76)
+
+0x22d4 : Length = 154, Leaf = 0x1203 LF_FIELDLIST
+    list[0] = LF_VFUNCTAB, type = 0x20FC
+    list[1] = LF_METHOD, count = 3, list = 0x22D0, name = 'MxVariable'
+    list[2] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F0F, 
+        vfptr offset = 0, name = 'GetValue'
+    list[3] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F10, 
+        vfptr offset = 4, name = 'SetValue'
+    list[4] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1F11, 
+        vfptr offset = 8, name = '~MxVariable'
+    list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x22D3, name = 'GetKey'
+    list[6] = LF_MEMBER, protected, type = 0x14DB, offset = 4
+        member name = 'm_key'
+    list[7] = LF_MEMBER, protected, type = 0x14DB, offset = 20
+        member name = 'm_value'
+
+0x22d5 : Length = 34, Leaf = 0x1504 LF_CLASS
+    # members = 10,  field list type 0x22d4, CONSTRUCTOR, 
+    Derivation list type 0x0000, VT shape type 0x20fb
+    Size = 36, class name = MxVariable, UDT(0x00004041)
+
+0x3cc2 : Length = 38, Leaf = 0x1507 LF_ENUM
+    # members = 64,  type = T_INT4(0074) field list type 0x3cc1
+NESTED,     enum name = JukeBox::JukeBoxScript, UDT(0x00003cc2)
+
+0x3fab : Length = 10, Leaf = 0x1002 LF_POINTER
+    Pointer (NEAR32), Size: 0
+    Element type : 0x3FAA
+
+0x405f : Length = 158, Leaf = 0x1203 LF_FIELDLIST
+    list[0] = LF_VFUNCTAB, type = 0x2090
+    list[1] = LF_ONEMETHOD, public, VANILLA, index = 0x176A, name = 'MxCore'
+    list[2] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x176A, 
+        vfptr offset = 0, name = '~MxCore'
+    list[3] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x176B, 
+        vfptr offset = 4, name = 'Notify'
+    list[4] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x2087, 
+        vfptr offset = 8, name = 'Tickle'
+    list[5] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x202F, 
+        vfptr offset = 12, name = 'ClassName'
+    list[6] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x2030, 
+        vfptr offset = 16, name = 'IsA'
+    list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x2091, name = 'GetId'
+    list[8] = LF_MEMBER, private, type = T_UINT4(0075), offset = 4
+        member name = 'm_id'
+
+0x4060 : Length = 30, Leaf = 0x1504 LF_CLASS
+    # members = 9,  field list type 0x405f, CONSTRUCTOR, 
+    Derivation list type 0x0000, VT shape type 0x1266
+    Size = 8, class name = MxCore, UDT(0x00004060)
+
+0x4262 : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = 0x3CC2
+    Index type = T_SHORT(0011)
+    length = 24
+    Name = 
+
+0x432f : Length = 14, Leaf = 0x1503 LF_ARRAY
+    Element type = T_INT4(0074)
+    Index type = T_SHORT(0011)
+    length = 12
+    Name =
+
+0x4db5 : Length = 246, Leaf = 0x1203 LF_FIELDLIST
+    list[0] = LF_BCLASS, public, type = 0x1220, offset = 0
+    list[1] = LF_METHOD, count = 3, list = 0x14E3, name = 'MxString'
+    list[2] = LF_ONEMETHOD, public, VIRTUAL, index = 0x14DE, name = '~MxString'
+    list[3] = LF_METHOD, count = 2, list = 0x14E7, name = 'operator='
+    list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x14DE, name = 'ToUpperCase'
+    list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x14DE, name = 'ToLowerCase'
+    list[6] = LF_ONEMETHOD, public, VANILLA, index = 0x14E8, name = 'operator+'
+    list[7] = LF_ONEMETHOD, public, VANILLA, index = 0x14E9, name = 'operator+='
+    list[8] = LF_ONEMETHOD, public, VANILLA, index = 0x14EB, name = 'Compare'
+    list[9] = LF_ONEMETHOD, public, VANILLA, index = 0x14EC, name = 'GetData'
+    list[10] = LF_ONEMETHOD, public, VANILLA, index = 0x4DB4, name = 'GetLength'
+    list[11] = LF_MEMBER, private, type = T_32PRCHAR(0470), offset = 8
+        member name = 'm_data'
+    list[12] = LF_MEMBER, private, type = T_USHORT(0021), offset = 12
+        member name = 'm_length'
+
+0x4db6 : Length = 30, Leaf = 0x1504 LF_CLASS
+    # members = 16,  field list type 0x4db5, CONSTRUCTOR, OVERLOAD, 
+    Derivation list type 0x0000, VT shape type 0x1266
+    Size = 16, class name = MxString, UDT(0x00004db6)
+"""
+
+
+@pytest.fixture(name="parser")
+def types_parser_fixture():
+    """Provide a CvdumpTypesParser that has read every line of TEST_LINES."""
+    types_parser = CvdumpTypesParser()
+    sample_lines = TEST_LINES.split("\n")
+    for sample_line in sample_lines:
+        types_parser.read_line(sample_line)
+
+    return types_parser
+
+
+def test_basic_parsing(parser):
+    """Check the raw records stored for the MxString class and field list."""
+    mxstring_class = parser.keys["0x4db6"]
+    assert mxstring_class["type"] == "LF_CLASS"
+    assert mxstring_class["name"] == "MxString"
+    assert mxstring_class["udt"] == "0x4db6"
+
+    mxstring_fields = parser.keys["0x4db5"]
+    assert len(mxstring_fields["members"]) == 2
+
+
+def test_scalar_types(parser):
+    """Full tests on the scalar_* methods are in another file.
+    Here we are just testing the passthrough of the "T_" types."""
+    char_name = parser.get("T_CHAR").name
+    char_size = parser.get("T_CHAR").size
+    assert char_name is None
+    assert char_size == 1
+
+    pointer_name = parser.get("T_32PVOID").name
+    pointer_size = parser.get("T_32PVOID").size
+    assert pointer_name is None
+    assert pointer_size == 4
+
+
+def test_resolve_forward_ref(parser):
+    """A forward reference should resolve to the details of the full class."""
+    # Non-forward ref
+    mxvariable = parser.get("0x22d5")
+    assert mxvariable.name == "MxVariable"
+
+    # Forward ref
+    mxstring_name = parser.get("0x14db").name
+    mxstring_size = parser.get("0x14db").size
+    assert mxstring_name == "MxString"
+    assert mxstring_size == 16
+
+
+def test_members(parser):
+    """Return the list of items to compare for a given complex type.
+    If the class has a superclass, add those members too."""
+    expected_mxcore = [
+        (0, "vftable", "T_32PVOID"),
+        (4, "m_id", "T_UINT4"),
+    ]
+
+    # MxCore field list
+    mxcore_members = parser.get_scalars("0x405f")
+    assert mxcore_members == expected_mxcore
+
+    # MxCore class id. Should be the same members
+    mxcore_class_members = parser.get_scalars("0x4060")
+    assert mxcore_members == mxcore_class_members
+
+    # MxString field list. Should add inherited members from MxCore
+    expected_mxstring = [
+        (0, "vftable", "T_32PVOID"),
+        (4, "m_id", "T_UINT4"),
+        (8, "m_data", "T_32PRCHAR"),
+        (12, "m_length", "T_USHORT"),
+    ]
+    assert parser.get_scalars("0x4db5") == expected_mxstring
+
+
+def test_members_recursive(parser):
+    """Make sure that we unwrap the dependency tree correctly."""
+    # Scalars of one MxString. m_length is followed by padding.
+    mxstring_layout = [
+        (0, "vftable", "T_32PVOID"),
+        (4, "m_id", "T_UINT4"),
+        (8, "m_data", "T_32PRCHAR"),
+        (12, "m_length", "T_USHORT"),
+    ]
+
+    # MxVariable: its own vftable, then two MxString members.
+    expected = [(0, "vftable", "T_32PVOID")]
+    for base_offset, member_name in ((4, "m_key"), (20, "m_value")):
+        for offset, name, type_name in mxstring_layout:
+            expected.append((base_offset + offset, f"{member_name}.{name}", type_name))
+
+    # MxVariable field list
+    assert parser.get_scalars("0x22d4") == expected
+
+
+def test_struct(parser):
+    """Basic test for converting type into struct.unpack format string."""
+    cases = (
+        # MxCore: vftable and uint32. The vftable pointer is read as uint32.
+        ("0x4060", "<LL"),
+        # _D3DVECTOR, three floats. Union types should already be removed.
+        ("0x10e1", "<fff"),
+        # MxRect32, four signed ints.
+        ("0x1214", "<llll"),
+    )
+
+    for type_key, expected_format in cases:
+        assert parser.get_format_string(type_key) == expected_format
+
+
+def test_struct_padding(parser):
+    """Struct format string should insert padding characters 'x'
+    where a value is padded to alignment size (probably 4 bytes)"""
+
+    # MxString, padded to 16 bytes.
+    mxstring_format = parser.get_format_string("0x4db6")
+    assert mxstring_format == "<LLLHxx"
+
+    # MxVariable, with two MxString members.
+    mxvariable_format = parser.get_format_string("0x22d5")
+    assert mxvariable_format == "<LLLLHxxLLLHxx"
+
+
+def test_array(parser):
+    """LF_ARRAY members are created dynamically based on the
+    total array size and the size of one element."""
+    # unsigned char[8]: one byte per element
+    expected_uchars = [(index, f"[{index}]", "T_UCHAR") for index in range(8)]
+    assert parser.get_scalars("0x10e4") == expected_uchars
+
+    # float[4]: four bytes per element
+    expected_floats = [(4 * index, f"[{index}]", "T_REAL32") for index in range(4)]
+    assert parser.get_scalars("0x103b") == expected_floats
+
+
+def test_2d_array(parser):
+    """Make sure 2d array elements are named as we expect."""
+    # float[4][4]
+    float_array = parser.get_scalars("0x103c")
+    assert len(float_array) == 16
+
+    # Spot check the start of the first two rows and the final element.
+    spot_checks = (
+        (0, (0, "[0][0]", "T_REAL32")),
+        (1, (4, "[0][1]", "T_REAL32")),
+        (4, (16, "[1][0]", "T_REAL32")),
+        (-1, (60, "[3][3]", "T_REAL32")),
+    )
+    for index, expected in spot_checks:
+        assert float_array[index] == expected
+
+
+def test_enum(parser):
+    """LF_ENUM should equal 4-byte int"""
+    enum_info = parser.get("0x3cc2")
+    assert enum_info.size == 4
+
+    enum_scalars = parser.get_scalars("0x3cc2")
+    assert enum_scalars == [(0, None, "T_INT4")]
+
+    # Now look at an array of enum, 24 bytes
+    enum_array = parser.get_scalars("0x4262")
+    first_element = enum_array[0] if len(enum_array) == 6 else None  # 24 / 4
+    assert len(enum_array) == 6
+    assert first_element.size == 4
+
+
+def test_lf_pointer(parser):
+    """LF_POINTER is just a wrapper for scalar pointer type"""
+    pointer_info = parser.get("0x3fab")
+    assert pointer_info.size == 4
+    # assert parser.get("0x3fab").is_pointer is True  # TODO: ?
+
+    pointer_scalars = parser.get_scalars("0x3fab")
+    assert pointer_scalars == [(0, None, "T_32PVOID")]
+
+
+def test_key_not_exist(parser):
+    """Accessing a non-existent type id should raise our exception"""
+    missing_key = "0xbeef"
+
+    for lookup in (parser.get, parser.get_scalars):
+        with pytest.raises(CvdumpKeyError):
+            lookup(missing_key)
+
+
+def test_broken_forward_ref(parser):
+    """Raise an exception if we cannot follow a forward reference"""
+    forward_ref_key = "0x1220"
+    mxcore_class_key = "0x4060"
+
+    # Verify forward reference on MxCore
+    parser.get(forward_ref_key)
+
+    # Delete the MxCore LF_CLASS
+    del parser.keys[mxcore_class_key]
+
+    # Forward ref via 0x1220 will fail
+    with pytest.raises(CvdumpKeyError):
+        parser.get(forward_ref_key)
+
+
+def test_null_forward_ref(parser):
+    """If the forward ref object is invalid and has no forward ref id,
+    raise an exception."""
+    mxstring_ref_key = "0x14db"
+
+    # Test MxString forward reference
+    parser.get(mxstring_ref_key)
+
+    # Delete the UDT for MxString
+    forward_ref_record = parser.keys[mxstring_ref_key]
+    del forward_ref_record["udt"]
+
+    # Cannot complete the forward reference lookup
+    with pytest.raises(CvdumpIntegrityError):
+        parser.get(mxstring_ref_key)
+
+
+def test_broken_array_element_ref(parser):
+    """Raise an exception if the element type of an array cannot be found"""
+    array_key = "0x19b1"
+    element_key = "0x19b0"
+
+    # Test LF_ARRAY of ROIColorAlias
+    parser.get(array_key)
+
+    # Delete ROIColorAlias
+    del parser.keys[element_key]
+
+    # Type reference lookup will fail
+    with pytest.raises(CvdumpKeyError):
+        parser.get(array_key)
+
+
+def test_lf_modifier(parser):
+    """LF_MODIFIER should resolve to the same details as the type it modifies"""
+    # Modifies float
+    const_float = parser.get("0x1028")
+    assert const_float.size == 4
+    assert parser.get_scalars("0x1028") == [(0, None, "T_REAL32")]
+
+    # Modifies MxRect32 via forward ref
+    mxrect = parser.get_scalars("0x1214")
+    const_mxrect = parser.get_scalars("0x11f2")
+    assert mxrect == const_mxrect
+
+
+def test_union_members(parser):
+    """If there is a union somewhere in our dependency list, we can
+    expect to see duplicated member offsets and names. This is ok for
+    the TypeInfo tuple, but the list of ScalarType items should have
+    unique offset to simplify comparison."""
+
+    # D3DVector type with duplicated offsets
+    d3dvector = parser.get("0x10e1")
+    assert len(d3dvector.members) == 6
+    members_at_zero = sum(1 for member in d3dvector.members if member.offset == 0)
+    assert members_at_zero == 2
+
+    # Deduplicated comparison list
+    vector_items = parser.get_scalars("0x10e1")
+    assert len(vector_items) == 3