#ifndef GEODE_CORE_META_OPTCALL_HPP #define GEODE_CORE_META_OPTCALL_HPP #include "callconv.hpp" #include "tuple.hpp" #include "x86.hpp" namespace geode::core::meta::x86 { template class Optcall : public CallConv { private: // Metaprogramming / typedefs we need for the rest of the class. using MyConv = CallConv; private: // These go in a class to not pollute the namespace. class Sequences { private: // These are required for proper reordering. static constexpr size_t length = sizeof...(Args); static constexpr size_t SSES = 4; static constexpr bool is_sse[length] = { sse_passable... }; static constexpr size_t GPRS = 2; static constexpr bool is_gpr[length] = { gpr_passable... }; static constexpr auto reordered_arr = reorder_pack(); // Setup call from our caller, to "foreign" function static constexpr auto filter_to() { /* The size of our output may be longer than the input. * Also, annoyingly, we must make a lambda to calculate this. */ constexpr auto arr_size = []() -> size_t { // Magic constant 2 is to pad XMM4 and XMM5: can't be used. size_t size = length + SSES + 2; /* We assume there are no SSES initially. * Any SSES we encounter, we have to remove a "duplicate". */ for (size_t i = 0; i < SSES; ++i) { if (i < length && is_sse[reordered_arr[i]]) { --size; } } return size; }; std::array to = {}; // This is the index of the placeholder float, for clobbering SSEs. constexpr size_t CLOBBER_SSE = length; // Put the SSEs into the right XMM registers, if they exist. for (size_t i = 0; i < SSES; ++i) { if (i < length && is_sse[reordered_arr[i]]) { to[i] = reordered_arr[i]; } else { to[i] = CLOBBER_SSE; } } // Clobber XMM4 and XMM5. to[4] = CLOBBER_SSE; to[5] = CLOBBER_SSE; for (size_t in = 0, out = SSES + 2; in < length; ++in) { // Put all the non SSEs in their correct places. Skip SSEs. size_t current = reordered_arr[in]; if (!(is_sse[current] && in < SSES)) { to[out] = current; ++out; } } return to; } // Setup call from "foreign" function, to one of ours. static constexpr auto filter_from() { std::array from = {}; for (size_t i = 0, gprs = 0, offset = 0; i < length; ++i) { size_t current = reordered_arr[i]; if (is_sse[current] && i < SSES) { // If in SSE, retain index from[current] = i; } else if (is_gpr[current] && gprs < GPRS) { // If in GPR, offset by 4 (4 SSE registers available) from[current] = gprs + SSES; ++gprs; } else { // If on stack, offset by 6 (4 SSE + 2 GPR registers available) from[current] = offset + SSES + GPRS; ++offset; } } return from; } // Grab only the stack values. For determining stack fixup. static constexpr auto filter_stack() { /* The size of our output may be shorter than the input. * Again, annoyingly, we must make a lambda to calculate this. */ constexpr auto arr_size = []() -> size_t { size_t size = length; for (size_t i = 0, gprs = 0; i < length; ++i) { size_t current = reordered_arr[i]; if (is_sse[current] && i < SSES) { --size; } else if (is_gpr[current] && gprs < GPRS) { --size; ++gprs; } } return size; }; std::array stack = {}; for (size_t in = 0, out = 0, gprs = 0; in < length; ++in) { size_t current = reordered_arr[in]; if ((!is_sse[current] || in >= SSES) && (!is_gpr[current] || gprs >= GPRS)) { stack[out] = current; ++out; } if (is_gpr[current]) { ++gprs; } } return stack; } // Annoyingly, unless we're using C++20, we can't eliminate these intermediates. (afaik) static constexpr auto to_arr = filter_to(); static constexpr auto from_arr = filter_from(); static constexpr auto stack_arr = filter_stack(); public: using to = typename MyConv::template arr_to_seq; using from = typename MyConv::template arr_to_seq; using stack = typename MyConv::template arr_to_seq; }; private: // Where all the logic is actually implemented. template class Impl { static_assert( always_false, "Please report a bug to the Geode developers! This should never be reached.\n" "SFINAE didn't reach the right overload!" ); }; template class Impl< std::index_sequence, std::index_sequence, std::index_sequence> { private: static constexpr size_t fix = (std::is_class_v ? stack_fix : 0) + stack_fix::template type_at...>; public: static Ret invoke(void* address, const Tuple& all) { if constexpr (!std::is_same_v) { Ret ret = reinterpret_cast())...)>( address )(all.template at()...); if constexpr (fix != 0) { __asm add esp, [fix] } return ret; } else { reinterpret_cast())...)>(address )(all.template at()...); if constexpr (fix != 0) { __asm add esp, [fix] } } } template static Ret __cdecl wrapper( /* It's wrapped to stop MSVC from giving me error messages with internal compiler * info. WTF. */ typename Tuple::template type_at_wrap... rest ) { Register f0, f1, f2, f3; Register i0, i1; __asm { movlpd f0, xmm0 movlpd f1, xmm1 movlpd f2, xmm2 movlpd f3, xmm3 mov i0, ecx mov i1, edx } auto all = Tuple<>::make(f0, f1, f2, f3, i0, i1, rest...); // Register has a explicit conversion operator we can take advantage of. return detour(static_cast(all.template at())...); } }; private: // Putting it all together: instantiating Impl with our filters. using MyImpl = Impl; public: // Just wrapping MyImpl. static Ret invoke(void* address, Args... all) { /* The extra float here is so that we can grab placeholders. * If we don't have anything in XMM0 - 5, we will use * this placeholder instead. The value is 314 to avoid unintentional * bugs (since 0 may work coincidentally). */ return MyImpl::invoke(address, { all..., 314.0f }); } template static constexpr decltype(auto) get_wrapper() { return &MyImpl::template wrapper; } }; template class Optcall : public Cdecl { public: using Cdecl::invoke; using Cdecl::get_wrapper; }; } #endif /* GEODE_CORE_META_OPTCALL_HPP */