diff --git a/3rdparty/remotery/lib/Remotery.c b/3rdparty/remotery/lib/Remotery.c
index a3e2ed23..17bbeb20 100644
--- a/3rdparty/remotery/lib/Remotery.c
+++ b/3rdparty/remotery/lib/Remotery.c
@@ -113,7 +113,7 @@ error:
 }
 #endif // __ANDROID__
 
-#ifdef RMT_ENABLED
+#if RMT_ENABLED
 
 // Global settings
 static rmtSettings g_Settings;
@@ -133,7 +133,7 @@ static rmtBool g_SettingsInitialized = RMT_FALSE;
 //
 // Required CRT dependencies
 //
-#ifdef RMT_USE_TINYCRT
+#if RMT_USE_TINYCRT
 
     #include <TinyCRT/TinyCRT.h>
     #include <TinyCRT/TinyWinsock.h>
@@ -189,7 +189,7 @@ static rmtBool g_SettingsInitialized = RMT_FALSE;
 #define RMT_UNREFERENCED_PARAMETER(i) (void)(1 ? (void)0 : ((void)i))
 
 
-#ifdef RMT_USE_CUDA
+#if RMT_USE_CUDA
     #include <cuda.h>
 #endif
 
@@ -488,7 +488,15 @@ static void AtomicSub(rmtS32 volatile* value, rmtS32 sub)
 }
 
 
-// Compiler write fences (windows implementation)
+// Compiler read/write fences (windows implementation)
+static void ReadFence()
+{
+#if defined(RMT_PLATFORM_WINDOWS) && !defined(__MINGW32__)   // same guard as WriteFence: MinGW takes the GCC-style path
+    _ReadBarrier();                     // MSVC compiler intrinsic
+#else
+    asm volatile ("" : : : "memory");   // empty asm with a "memory" clobber: compiler-level barrier only
+#endif
+}
 static void WriteFence()
 {
 #if defined(RMT_PLATFORM_WINDOWS) && !defined(__MINGW32__)
@@ -3719,14 +3727,14 @@ static rmtBool ThreadSampler_Pop(ThreadSampler* ts, MessageQueue* queue, Sample*
 
 
 
-#ifdef RMT_USE_D3D11
+#if RMT_USE_D3D11
 typedef struct D3D11 D3D11;
 static rmtError D3D11_Create(D3D11** d3d11);
 static void D3D11_Destructor(D3D11* d3d11);
 #endif
 
 
-#ifdef RMT_USE_OPENGL
+#if RMT_USE_OPENGL
 typedef struct OpenGL OpenGL;
 static rmtError OpenGL_Create(OpenGL** opengl);
 static void OpenGL_Destructor(OpenGL* opengl);
@@ -3754,15 +3762,15 @@ struct Remotery
     // The main server thread
     Thread* thread;
 
-#ifdef RMT_USE_CUDA
+#if RMT_USE_CUDA
     rmtCUDABind cuda;
 #endif
 
-#ifdef RMT_USE_D3D11
+#if RMT_USE_D3D11
     D3D11* d3d11;
 #endif
 
-#ifdef RMT_USE_OPENGL
+#if RMT_USE_OPENGL
     OpenGL* opengl;
 #endif
 };
@@ -3892,7 +3900,7 @@ static rmtError json_SampleTree(Buffer* buffer, Msg_SampleTree* msg)
 }
 
 
-#ifdef RMT_USE_CUDA
+#if RMT_USE_CUDA
 static rmtBool AreCUDASamplesReady(Sample* sample);
 static rmtBool GetCUDASampleTimes(Sample* root_sample, Sample* sample);
 #endif
@@ -3912,7 +3920,7 @@ static rmtError Remotery_SendSampleTreeMessage(Remotery* rmt, Message* message)
     sample = sample_tree->root_sample;
     assert(sample != NULL);
 
-    #ifdef RMT_USE_CUDA
+    #if RMT_USE_CUDA
     if (sample->type == SampleType_CUDA)
     {
         // If these CUDA samples aren't ready yet, stick them to the back of the queue and continue
@@ -4102,7 +4110,7 @@ static rmtError Remotery_Constructor(Remotery* rmt)
     if (error != RMT_ERROR_NONE)
         return error;
 
-    #ifdef RMT_USE_CUDA
+    #if RMT_USE_CUDA
 
         rmt->cuda.CtxSetCurrent = NULL;
         rmt->cuda.EventCreate = NULL;
@@ -4113,14 +4121,14 @@ static rmtError Remotery_Constructor(Remotery* rmt)
 
     #endif
 
-    #ifdef RMT_USE_D3D11
+    #if RMT_USE_D3D11
         rmt->d3d11 = NULL;
         error = D3D11_Create(&rmt->d3d11);
         if (error != RMT_ERROR_NONE)
             return error;
     #endif
 
-    #ifdef RMT_USE_OPENGL
+    #if RMT_USE_OPENGL
         rmt->opengl = NULL;
         error = OpenGL_Create(&rmt->opengl);
         if (error != RMT_ERROR_NONE)
@@ -4154,11 +4162,11 @@ static void Remotery_Destructor(Remotery* rmt)
     g_Remotery = NULL;
     g_RemoteryCreated = RMT_FALSE;
 
-    #ifdef RMT_USE_D3D11
+    #if RMT_USE_D3D11
         Delete(D3D11, rmt->d3d11);
     #endif
 
-    #ifdef RMT_USE_OPENGL
+    #if RMT_USE_OPENGL
         Delete(OpenGL, rmt->opengl);
     #endif
 
@@ -4548,7 +4556,7 @@ RMT_API void _rmt_EndCPUSample(void)
 
 
 
-#ifdef RMT_USE_CUDA
+#if RMT_USE_CUDA
 
 
 typedef struct CUDASample
@@ -4828,7 +4836,7 @@ RMT_API void _rmt_EndCUDASample(void* stream)
 
 
 
-#ifdef RMT_USE_D3D11
+#if RMT_USE_D3D11
 
 
 // As clReflect has no way of disabling C++ compile mode, this forces C interfaces everywhere...
@@ -5314,7 +5322,7 @@ RMT_API void _rmt_EndD3D11Sample(void)
 
 
 
-#ifdef RMT_USE_OPENGL
+#if RMT_USE_OPENGL
 
 
 #ifndef APIENTRY
diff --git a/3rdparty/remotery/lib/Remotery.h b/3rdparty/remotery/lib/Remotery.h
index 624eb7cb..d6c692b7 100644
--- a/3rdparty/remotery/lib/Remotery.h
+++ b/3rdparty/remotery/lib/Remotery.h
@@ -42,20 +42,30 @@ documented just below this comment.
 #define RMT_INCLUDED_H
 
 
-// Disable this to not include any bits of Remotery in your build
-#define RMT_ENABLED
+// Set to 0 to not include any bits of Remotery in your build
+#ifndef RMT_ENABLED
+#define RMT_ENABLED 1
+#endif
 
 // Used by the Celtoys TinyCRT library (not released yet)
-//#define RMT_USE_TINYCRT
+#ifndef RMT_USE_TINYCRT
+#define RMT_USE_TINYCRT 0
+#endif
 
 // Assuming CUDA headers/libs are setup, allow CUDA profiling
-//#define RMT_USE_CUDA
+#ifndef RMT_USE_CUDA
+#define RMT_USE_CUDA 0
+#endif
 
 // Assuming Direct3D 11 headers/libs are setup, allow D3D11 profiling
-//#define RMT_USE_D3D11
+#ifndef RMT_USE_D3D11
+#define RMT_USE_D3D11 0
+#endif
 
 // Allow OpenGL profiling
-//#define RMT_USE_OPENGL
+#ifndef RMT_USE_OPENGL
+#define RMT_USE_OPENGL 0
+#endif
 
 
 /*
@@ -109,22 +119,22 @@ documented just below this comment.
 
 // Allows macros to be written that can work around the inability to do: #define(x) #ifdef x
 // with the C preprocessor.
-#ifdef RMT_ENABLED
+#if RMT_ENABLED
     #define IFDEF_RMT_ENABLED(t, f) t
 #else
     #define IFDEF_RMT_ENABLED(t, f) f
 #endif
-#if defined(RMT_ENABLED) && defined(RMT_USE_CUDA)
+#if RMT_ENABLED && RMT_USE_CUDA
     #define IFDEF_RMT_USE_CUDA(t, f) t
 #else
     #define IFDEF_RMT_USE_CUDA(t, f) f
 #endif
-#if defined(RMT_ENABLED) && defined(RMT_USE_D3D11)
+#if RMT_ENABLED && RMT_USE_D3D11
     #define IFDEF_RMT_USE_D3D11(t, f) t
 #else
     #define IFDEF_RMT_USE_D3D11(t, f) f
 #endif
-#if defined(RMT_ENABLED) && defined(RMT_USE_OPENGL)
+#if RMT_ENABLED && RMT_USE_OPENGL
 #define IFDEF_RMT_USE_OPENGL(t, f) t
 #else
 #define IFDEF_RMT_USE_OPENGL(t, f) f
@@ -409,7 +419,7 @@ typedef struct rmtCUDABind
 #ifdef __cplusplus
 
 
-#ifdef RMT_ENABLED
+#if RMT_ENABLED
 
 // Types that end samples in their destructors
 extern "C" RMT_API void _rmt_EndCPUSample(void);
@@ -420,7 +430,7 @@ struct rmt_EndCPUSampleOnScopeExit
         _rmt_EndCPUSample();
     }
 };
-#ifdef RMT_USE_CUDA
+#if RMT_USE_CUDA
 extern "C" RMT_API void _rmt_EndCUDASample(void* stream);
 struct rmt_EndCUDASampleOnScopeExit
 {
@@ -434,7 +444,7 @@ struct rmt_EndCUDASampleOnScopeExit
     void* stream;
 };
 #endif
-#ifdef RMT_USE_D3D11
+#if RMT_USE_D3D11
 extern "C" RMT_API void _rmt_EndD3D11Sample(void);
 struct rmt_EndD3D11SampleOnScopeExit
 {
@@ -445,7 +455,7 @@ struct rmt_EndD3D11SampleOnScopeExit
 };
 #endif
 
-#ifdef RMT_USE_OPENGL
+#if RMT_USE_OPENGL
 extern "C" RMT_API void _rmt_EndOpenGLSample(void);
 struct rmt_EndOpenGLSampleOnScopeExit
 {
@@ -488,7 +498,7 @@ struct rmt_EndOpenGLSampleOnScopeExit
 
 
 
-#ifdef RMT_ENABLED
+#if RMT_ENABLED
 
 #ifdef __cplusplus
 extern "C" {
@@ -504,20 +514,20 @@ RMT_API void _rmt_LogText(rmtPStr text);
 RMT_API void _rmt_BeginCPUSample(rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndCPUSample(void);
 
-#ifdef RMT_USE_CUDA
+#if RMT_USE_CUDA
 RMT_API void _rmt_BindCUDA(const rmtCUDABind* bind);
 RMT_API void _rmt_BeginCUDASample(rmtPStr name, rmtU32* hash_cache, void* stream);
 RMT_API void _rmt_EndCUDASample(void* stream);
 #endif
 
-#ifdef RMT_USE_D3D11
+#if RMT_USE_D3D11
 RMT_API void _rmt_BindD3D11(void* device, void* context);
 RMT_API void _rmt_UnbindD3D11(void);
 RMT_API void _rmt_BeginD3D11Sample(rmtPStr name, rmtU32* hash_cache);
 RMT_API void _rmt_EndD3D11Sample(void);
 #endif
 
-#ifdef RMT_USE_OPENGL
+#if RMT_USE_OPENGL
 RMT_API void _rmt_BindOpenGL();
 RMT_API void _rmt_UnbindOpenGL(void);
 RMT_API void _rmt_BeginOpenGLSample(rmtPStr name, rmtU32* hash_cache);
diff --git a/3rdparty/remotery/readme.md b/3rdparty/remotery/readme.md
index 4426f266..99cbdf9a 100644
--- a/3rdparty/remotery/readme.md
+++ b/3rdparty/remotery/readme.md
@@ -11,7 +11,7 @@ Supported features:
 * Web viewer that runs in Chrome, Firefox and Safari. Custom WebSockets server
   transmits sample data to the browser on a latent thread.
 * Profiles itself and shows how it's performing in the viewer.
-* Can optionally sample CUDA/D3D11 GPU activity.
+* Can optionally sample CUDA/D3D11/OpenGL GPU activity.
 * Console output for logging text.
 * Console input for sending commands to your game.
 
@@ -31,13 +31,13 @@ Compiling
 
 You can define some extra macros to modify what features are compiled into Remotery:
 
-    Macro               Default             Description
+    Macro               Default     Description
 
-    RMT_ENABLED         <defined>           Disable this to not include any bits of Remotery in your build
-    RMT_USE_TINYCRT     <not defined>       Used by the Celtoys TinyCRT library (not released yet)
-    RMT_USE_CUDA        <not defined>       Assuming CUDA headers/libs are setup, allow CUDA profiling
-    RMT_USE_D3D11       <not defined>       Assuming Direct3D 11 headers/libs are setup, allow D3D11 GPU profiling
-    RMT_USE_OPENGL      <not defined>       Allow OpenGL GPU profiling (standalone except you must link to OpenGL which you already do if you use it)
+    RMT_ENABLED         1           Set to 0 to not include any bits of Remotery in your build
+    RMT_USE_TINYCRT     0           Used by the Celtoys TinyCRT library (not released yet)
+    RMT_USE_CUDA        0           Assuming CUDA headers/libs are setup, allow CUDA profiling
+    RMT_USE_D3D11       0           Assuming Direct3D 11 headers/libs are setup, allow D3D11 GPU profiling
+    RMT_USE_OPENGL      0           Allow OpenGL GPU profiling (standalone except you must link to OpenGL which you already do if you use it)
 
 
 Basic Use
diff --git a/examples/common/entry/entry.cpp b/examples/common/entry/entry.cpp
index 76d2b00e..14bf7540 100644
--- a/examples/common/entry/entry.cpp
+++ b/examples/common/entry/entry.cpp
@@ -3,9 +3,12 @@
  * License: http://www.opensource.org/licenses/BSD-2-Clause
  */
 
+#include <bx/bx.h>
+#if BX_PLATFORM_WINDOWS
 // BK - Remotery needs WinSock, but on VS2015/Win10 build
 //      fails if WinSock2 is included after Windows.h?!
-#include <WinSock2.h>
+#	include <WinSock2.h>
+#endif // BX_PLATFORM_WINDOWS
 
 #include <bgfx/bgfx.h>
 #include <bx/string.h>
@@ -21,10 +24,7 @@
 #include "cmd.h"
 #include "input.h"
 
-#if ENTRY_CONFIG_PROFILER
-#	define RMT_ENABLED
-#endif // ENTRY_CONFIG_PROFILER
-
+#define RMT_ENABLED ENTRY_CONFIG_PROFILER
 #include <remotery/lib/Remotery.c>
 
 extern "C" int _main_(int _argc, char** _argv);
diff --git a/src/bgfx_p.h b/src/bgfx_p.h
index 12c58113..12a18cab 100644
--- a/src/bgfx_p.h
+++ b/src/bgfx_p.h
@@ -54,6 +54,7 @@
 #		include <microprofile.h>
 #		define BGFX_PROFILER_SCOPE(_group, _name, _color) MICROPROFILE_SCOPEI(#_group, #_name, _color)
 #	elif BGFX_CONFIG_PROFILER_REMOTERY
+#		define RMT_ENABLED BGFX_CONFIG_PROFILER_REMOTERY
 #		include <remotery/lib/Remotery.h>
 #		define BGFX_PROFILER_SCOPE(_group, _name, _color) rmt_ScopedCPUSample(_group##_##_name)
 #	else