Added ETC2 encoding.

2024-11-28 10:35:43 -05:00 · 2016-01-28 21:09:03 -08:00 · 2016-01-28 21:09:03 -08:00 · 6c68a79526
commit 6c68a79526
parent 8ab0a4eea5
11 changed files with 1274 additions and 0 deletions
--- a/3rdparty/etc2/LICENSE.txt
+++ b/3rdparty/etc2/LICENSE.txt
@ -0,0 +1,24 @@
 Copyright (c) 2013, Bartosz Taudul <wolf.pld@gmail.com>
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/3rdparty/etc2/Math.hpp
+++ b/3rdparty/etc2/Math.hpp
@ -0,0 +1,89 @@
 #ifndef __DARKRL__MATH_HPP__
 #define __DARKRL__MATH_HPP__
 #include <algorithm>
 #include <cmath>
 #include "Types.hpp"
 // Returns 1 for 0; otherwise decrements val, smears its highest set bit into
 // every lower bit position and adds one.  For positive values this yields the
 // smallest power of two that is >= val (val itself if it already is one).
 template<typename T>
 inline T AlignPOT( T val )
 {
    if( val == 0 )
    {
        return 1;
    }
    const unsigned int bitCount = sizeof( T ) * 8;
    T smeared = val;
    smeared--;
    // Shifts of 1, 2, 4, ... cover every bit position of T.
    unsigned int shift = 1;
    while( shift < bitCount )
    {
        smeared |= smeared >> shift;
        shift <<= 1;
    }
    return smeared + 1;
 }
 // Returns the number of bits set in val, computed by summing adjacent bit
 // fields of doubling width.
 inline int CountSetBits( uint32 val )
 {
    // Each 2-bit field holds the count of its two source bits.
    const uint32 pairs = val - ( ( val >> 1 ) & 0x55555555 );
    // Each 4-bit field holds the count of its four source bits.
    const uint32 nibbles = ( ( pairs >> 2 ) & 0x33333333 ) + ( pairs & 0x33333333 );
    // Each byte holds the count of its eight source bits.
    uint32 total = ( ( nibbles >> 4 ) + nibbles ) & 0x0f0f0f0f;
    // Fold the four byte counts into the low byte.
    total += total >> 8;
    total += total >> 16;
    return total & 0x0000003f;
 }
 // Returns the number of zero bits above the highest set bit of val
 // (32 when val is 0).
 inline int CountLeadingZeros( uint32 val )
 {
    // Copy the highest set bit into every lower position, so that the
    // population count equals the position of that bit plus one.
    for( int shift = 1; shift <= 16; shift *= 2 )
    {
        val |= val >> shift;
    }
    return 32 - CountSetBits( val );
 }
 // Converts an sRGB-encoded component v to its linear value.
 //
 // Inputs up to 0.04045 use the linear segment v / 12.92; larger inputs use
 // the power segment ((v + 0.055) / 1.055) ^ 2.4.
 //
 // std::pow is called explicitly so the float overload from <cmath> is always
 // selected, independent of which math headers the including file pulled in.
 inline float sRGB2linear( float v )
 {
    const float a = 0.055f;
    if( v <= 0.04045f )
    {
        return v / 12.92f;
    }
    return std::pow( ( v + a ) / ( 1 + a ), 2.4f );
 }
 // Converts a linear component v to its sRGB-encoded value.
 //
 // Inputs up to 0.0031308 use the linear segment 12.92 * v; larger inputs use
 // the power segment 1.055 * v ^ (1 / 2.4) - 0.055.
 //
 // std::pow is called explicitly so the float overload from <cmath> is always
 // selected, independent of which math headers the including file pulled in.
 inline float linear2sRGB( float v )
 {
    const float a = 0.055f;
    if( v <= 0.0031308f )
    {
        return 12.92f * v;
    }
    return ( 1 + a ) * std::pow( v, 1/2.4f ) - a;
 }
 // Evaluates the cubic 3x^2 - 2x^3, written as x*x*(3 - 2x).
 // The input is not clamped.
 template<class T>
 inline T SmoothStep( T x )
 {
    // auto keeps the promoted type of the sub-expressions, so small integer
    // types are only converted back to T on return.
    const auto squared = x * x;
    const auto weight = 3 - 2 * x;
    return squared * weight;
 }
 // Clamps val to the range [0, 255] and returns it as an 8-bit value.
 //
 // The template argument of std::min/std::max is spelled out because the
 // literals are plain int while val is int32; on toolchains where int32_t is
 // not an alias of int, deduction from mixed argument types would fail.
 inline uint8 clampu8( int32 val )
 {
    return static_cast<uint8>( std::min<int32>( std::max<int32>( 0, val ), 255 ) );
 }
 // Returns val multiplied by itself, converted back to T.
 template<class T>
 inline T sq( T val )
 {
    const auto squared = val * val;
    return squared;
 }
 // Returns ( t + ( t >> 8 ) ) >> 8 with t = a*b + 128.  For 8-bit operands
 // this is a*b / 255 rounded to nearest, computed with shifts instead of a
 // division.
 static inline int mul8bit( int a, int b )
 {
    const int biased = a * b + 128;
    const int corrected = biased + ( biased >> 8 );
    return corrected >> 8;
 }
 #endif
--- a/3rdparty/etc2/ProcessCommon.hpp
+++ b/3rdparty/etc2/ProcessCommon.hpp
@ -0,0 +1,51 @@
 #ifndef __PROCESSCOMMON_HPP__
 #define __PROCESSCOMMON_HPP__
 #include <assert.h>
 #include <stddef.h>
 #include "Types.hpp"
 // Returns the index of the smallest of the first num entries of err.
 // The earliest entry wins on ties; 0 is returned when num is 0 or 1.
 template<class T>
 static size_t GetLeastError( const T* err, size_t num )
 {
    size_t best = 0;
    size_t candidate = 1;
    while( candidate < num )
    {
        if( err[candidate] < err[best] ) best = candidate;
        ++candidate;
    }
    return best;
 }
 // Reverses the byte order of the upper 32 bits of d and leaves the lower
 // 32 bits unchanged.
 static uint64 FixByteOrder( uint64 d )
 {
    const uint64 low = d & 0x00000000FFFFFFFF;
    const uint64 high = d >> 32;
    // Byte swap of the upper word: 0 <-> 3 and 1 <-> 2.
    const uint64 swapped =
        ( ( high & 0x000000FF ) << 24 ) |
        ( ( high & 0x0000FF00 ) << 8 ) |
        ( ( high & 0x00FF0000 ) >> 8 ) |
        ( ( high & 0xFF000000 ) >> 24 );
    return low | ( swapped << 32 );
 }
 // Picks, for each of the two error rows in terr, the table index with the
 // least accumulated error and ORs the result into d:
 //   bits 26..28  index chosen from terr[0]
 //   bits 29..31  index chosen from terr[1]
 //   bit  32 + i  bit 0 of the selector of pixel i
 //   bit  48 + i  bit 1 of the selector of pixel i
 // The selector of pixel i is tsel[i][chosen index], where the parity of
 // id[i] decides which of the two chosen indices applies.
 template<class T, class S>
 static uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id )
 {
    const size_t best0 = GetLeastError( terr[0], 8 );
    const size_t best1 = GetLeastError( terr[1], 8 );
    d |= best0 << 26;
    d |= best1 << 29;
    for( int px = 0; px < 16; ++px )
    {
        const size_t table = ( id[px] % 2 ) ? best1 : best0;
        const uint64 sel = tsel[px][table];
        d |= ( sel & 0x1 ) << ( px + 32 );
        d |= ( sel & 0x2 ) << ( px + 47 );
    }
    return d;
 }
 #endif
--- a/3rdparty/etc2/ProcessRGB.cpp
+++ b/3rdparty/etc2/ProcessRGB.cpp
@ -0,0 +1,708 @@
 #include <array>
 #include <string.h>
 #include "Math.hpp"
 #include "ProcessCommon.hpp"
 #include "ProcessRGB.hpp"
 #include "Tables.hpp"
 #include "Types.hpp"
 #include "Vector.hpp"
 #include <bx/endian.h>
 #ifdef __SSE4_1__
 #  ifdef _MSC_VER
 #    include <intrin.h>
 #    include <Windows.h>
 #  else
 #    include <x86intrin.h>
 #  endif
 #endif
 namespace
 {
 // Four 16-bit lanes; the code in this file reads lanes 0..2 as r, g, b.
 using v4i = std::array<uint16, 4>;
 // Computes four average colours from a 64-byte block and writes them to
 // a[0]..a[3].
 //
 // data is read as 16 consecutive 4-byte pixels; the scalar path names the
 // bytes of each pixel b, g, r and skips the fourth.  Pixels are summed into
 // four quadrants (quadrant = (j & 2) + (i >> 1) for pixel i of group j) and
 // the quadrant sums are combined pairwise:
 //   a[0] = quadrants 2 + 3      a[1] = quadrants 0 + 1
 //   a[2] = quadrants 1 + 3      a[3] = quadrants 0 + 2
 // Every lane is (sum + 4) / 8, i.e. the rounded mean of 8 pixels, stored in
 // r, g, b order.  Lane 3 is 0 in the scalar path; the SSE path stores the
 // mean of the fourth byte there instead.
 void Average( const uint8* data, v4i* a )
 {
 #ifdef __SSE4_1__
    // Load the 16 pixels as four 128-bit rows of four pixels each.
    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
    // Widen every byte to 16 bits (low/high half of each row = two pixels).
    __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
    __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
    __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
    __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
    __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
    __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
    __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
    __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
    // Add rows 0+1 and rows 2+3, keeping the two halves of a row separate.
    __m128i sum0 = _mm_add_epi16(d0l, d1l);
    __m128i sum1 = _mm_add_epi16(d0h, d1h);
    __m128i sum2 = _mm_add_epi16(d2l, d3l);
    __m128i sum3 = _mm_add_epi16(d2h, d3h);
    // Widen to 32 bits and add the two pixels of each half: b0..b3 are the
    // per-channel sums of quadrants 0..3.
    __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
    __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
    __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
    __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
    __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
    __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
    __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
    __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
    __m128i b0 = _mm_add_epi32(sum0l, sum0h);
    __m128i b1 = _mm_add_epi32(sum1l, sum1h);
    __m128i b2 = _mm_add_epi32(sum2l, sum2h);
    __m128i b3 = _mm_add_epi32(sum3l, sum3h);
    // Pairwise quadrant sums, rounded division by 8.
    __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
    __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
    __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
    __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
    // The shuffle reverses the first three lanes (b, g, r -> r, g, b); the
    // pack narrows to 16 bits so each store writes two v4i entries.
    _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
    _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
 #else
    // Per-quadrant channel sums.
    uint32 r[4];
    uint32 g[4];
    uint32 b[4];
    memset(r, 0, sizeof(r));
    memset(g, 0, sizeof(g));
    memset(b, 0, sizeof(b));
    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            int index = (j & 2) + (i >> 1);
            b[index] += *data++;
            g[index] += *data++;
            r[index] += *data++;
            // Skip the fourth byte of the pixel.
            data++;
        }
    }
    a[0] = v4i{ uint16( (r[2] + r[3] + 4) / 8 ), uint16( (g[2] + g[3] + 4) / 8 ), uint16( (b[2] + b[3] + 4) / 8 ), 0};
    a[1] = v4i{ uint16( (r[0] + r[1] + 4) / 8 ), uint16( (g[0] + g[1] + 4) / 8 ), uint16( (b[0] + b[1] + 4) / 8 ), 0};
    a[2] = v4i{ uint16( (r[1] + r[3] + 4) / 8 ), uint16( (g[1] + g[3] + 4) / 8 ), uint16( (b[1] + b[3] + 4) / 8 ), 0};
    a[3] = v4i{ uint16( (r[0] + r[2] + 4) / 8 ), uint16( (g[0] + g[2] + 4) / 8 ), uint16( (b[0] + b[2] + 4) / 8 ), 0};
 #endif
 }
 // Computes per-channel byte sums for the same four pixel groupings that
 // Average() uses.
 //
 // data is read as 16 consecutive 4-byte pixels.  err[k][c] receives the sum
 // of byte c (c = 0..2, in memory order) over the 8 pixels of grouping k:
 //   err[0] = quadrants 2 + 3      err[1] = quadrants 0 + 1
 //   err[2] = quadrants 1 + 3      err[3] = quadrants 0 + 2
 // err[k][3] is always 0: the SSE path masks the fourth byte of every pixel
 // and the scalar path stores 0 explicitly.
 void CalcErrorBlock( const uint8* data, uint err[4][4] )
 {
 #ifdef __SSE4_1__
    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
    // Zero the fourth byte of every pixel so it does not contribute.
    __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
    __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
    __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
    __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
    // Widen bytes to 16 bits.
    __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
    __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
    __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
    __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
    __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
    __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
    __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
    __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
    // Add rows 0+1 and rows 2+3.
    __m128i sum0 = _mm_add_epi16(d0l, d1l);
    __m128i sum1 = _mm_add_epi16(d0h, d1h);
    __m128i sum2 = _mm_add_epi16(d2l, d3l);
    __m128i sum3 = _mm_add_epi16(d2h, d3h);
    // Widen to 32 bits and add the two pixels of each half: b0..b3 are the
    // per-channel sums of quadrants 0..3.
    __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
    __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
    __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
    __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
    __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
    __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
    __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
    __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
    __m128i b0 = _mm_add_epi32(sum0l, sum0h);
    __m128i b1 = _mm_add_epi32(sum1l, sum1h);
    __m128i b2 = _mm_add_epi32(sum2l, sum2h);
    __m128i b3 = _mm_add_epi32(sum3l, sum3h);
    // Pairwise quadrant sums (no division here, unlike Average()).
    __m128i a0 = _mm_add_epi32(b2, b3);
    __m128i a1 = _mm_add_epi32(b0, b1);
    __m128i a2 = _mm_add_epi32(b1, b3);
    __m128i a3 = _mm_add_epi32(b0, b2);
    _mm_storeu_si128((__m128i*)&err[0], a0);
    _mm_storeu_si128((__m128i*)&err[1], a1);
    _mm_storeu_si128((__m128i*)&err[2], a2);
    _mm_storeu_si128((__m128i*)&err[3], a3);
 #else
    // terr[quadrant][channel]; only channels 0..2 are accumulated.
    uint terr[4][4];
    memset(terr, 0, 16 * sizeof(uint));
    for( int j=0; j<4; j++ )
    {
        for( int i=0; i<4; i++ )
        {
            int index = (j & 2) + (i >> 1);
            uint d = *data++;
            terr[index][0] += d;
            d = *data++;
            terr[index][1] += d;
            d = *data++;
            terr[index][2] += d;
            // Skip the fourth byte of the pixel.
            data++;
        }
    }
    for( int i=0; i<3; i++ )
    {
        err[0][i] = terr[2][i] + terr[3][i];
        err[1][i] = terr[0][i] + terr[1][i];
        err[2][i] = terr[1][i] + terr[3][i];
        err[3][i] = terr[0][i] + terr[2][i];
    }
    for( int i=0; i<4; i++ )
    {
        err[i][3] = 0;
    }
 #endif
 }
 // Returns a score for how well `average` fits the 8 pixels summarised by
 // `block` (lower is better):
 //   0x3FFFFFFF - 2 * sum( block[c] * average[2 - c] ) + 8 * sum( average[c]^2 )
 // for c = 0..2.  block holds channel sums in the order written by
 // CalcErrorBlock(), average is in the reverse (r, g, b) order.
 uint CalcError( const uint block[4], const v4i& average )
 {
    // Large enough to keep the result from going negative, small enough to
    // leave headroom against overflow.
    const uint bias = 0x3FFFFFFF;
    const uint cross =
        block[0] * 2 * average[2] +
        block[1] * 2 * average[1] +
        block[2] * 2 * average[0];
    const uint squares = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
    return bias - cross + squares;
 }
 // Quantises the four averages in a[0..3] in two ways; a must have room for
 // eight entries.
 //
 //  * a[4..7]: 5-bit quantisation (scale by 31 as in mul8bit).  For each pair
 //    (a[2i], a[2i+1]) the second entry is quantised directly and the first
 //    is expressed as second + clamp( first - second, -4, 3 ).  Both are then
 //    expanded back to 8 bits as ( c << 3 ) | ( c >> 2 ).
 //  * a[0..3]: overwritten in place with their 4-bit quantisation (scale by
 //    15) expanded back to 8 bits (nibble replicated into both halves).
 //
 // The a[4..7] values are derived from the original a[0..3], so the first
 // loop must run before the second.
 void ProcessAverages( v4i* a )
 {
 #ifdef __SSE4_1__
    for( int i=0; i<2; i++ )
    {
        // One load covers two v4i entries: a[i*2] in the low half and
        // a[i*2+1] in the high half.
        __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
        // Same arithmetic as mul8bit( x, 31 ), on eight lanes.
        __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
        __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
        // Broadcast the high half (second entry of the pair) to both halves.
        __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
        __m128i diff = _mm_sub_epi16(c, c1);
        diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
        diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
        __m128i co = _mm_add_epi16(c1, diff);
        // Low half: clamped first entry; high half: second entry unchanged.
        c = _mm_blend_epi16(co, c, 0xF0);
        // Expand 5 bits to 8 bits.
        __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
        _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0);
    }
    for( int i=0; i<2; i++ )
    {
        __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data());
        // Same arithmetic as mul8bit( x, 15 ), then replicate the nibble.
        __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
        __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
        __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
        _mm_storeu_si128((__m128i*)a[i*2].data(), t2);
    }
 #else
    for( int i=0; i<2; i++ )
    {
        for( int j=0; j<3; j++ )
        {
            int32 c1 = mul8bit( a[i*2+1][j], 31 );
            int32 c2 = mul8bit( a[i*2][j], 31 );
            // Keep the difference within [-4, 3].
            int32 diff = c2 - c1;
            if( diff > 3 ) diff = 3;
            else if( diff < -4 ) diff = -4;
            int32 co = c1 + diff;
            // Expand 5 bits to 8 bits.
            a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
            a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
        }
    }
    for( int i=0; i<4; i++ )
    {
        // g_avg2 maps a 4-bit value n to n * 0x11.
        a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
        a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
        a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
    }
 #endif
 }
 // ORs the mode index and the two base colours selected by idx into _d.
 //
 // idx is stored at bit 24 and selects the colour pair a[2*idx], a[2*idx+1].
 // For idx 0 and 1 each channel contributes the top 4 bits of both colours
 // (first colour in the low nibble, second in the high nibble of byte i).
 // For idx 2 and 3 each channel contributes the top 5 bits of the second
 // colour plus the 3-bit two's-complement difference (first - second) of the
 // 5-bit values in the low bits of byte i.
 void EncodeAverages( uint64& _d, const v4i* a, size_t idx )
 {
    uint64 bits = _d;
    bits |= ( idx << 24 );
    const v4i& first = a[( idx << 1 ) + 0];
    const v4i& second = a[( idx << 1 ) + 1];
    const bool fourBit = ( idx & 0x2 ) == 0;
    for( int ch = 0; ch < 3; ch++ )
    {
        const int shift = ch * 8;
        if( fourBit )
        {
            bits |= uint64( first[ch] >> 4 ) << shift;
            bits |= uint64( second[ch] >> 4 ) << ( shift + 4 );
        }
        else
        {
            bits |= uint64( second[ch] & 0xF8 ) << shift;
            int32 delta = ( ( first[ch] & 0xF8 ) - ( second[ch] & 0xF8 ) ) >> 3;
            // Keep only the low three bits of the signed difference.
            delta &= ~0xFFFFFFF8;
            bits |= ((uint64)delta) << shift;
        }
    }
    _d = bits;
 }
 // Detects blocks in which all 16 pixels are byte-for-byte identical.
 //
 // src is read as 16 consecutive 4-byte pixels and all four bytes of every
 // pixel take part in the comparison.  Returns 0 when any pixel differs from
 // the first one.  Otherwise returns the block value
 //   0x02000000 | (src[0] & 0xF8) << 16 | (src[1] & 0xF8) << 8 | (src[2] & 0xF8)
 // which is never 0, so callers can use 0 as the "not solid" marker.
 uint64 CheckSolid( const uint8* src )
 {
 #ifdef __SSE4_1__
    __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
    __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
    __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
    __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
    // Broadcast the first pixel to all four lanes.
    __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
    __m128i c0 = _mm_cmpeq_epi8(d0, c);
    __m128i c1 = _mm_cmpeq_epi8(d1, c);
    __m128i c2 = _mm_cmpeq_epi8(d2, c);
    __m128i c3 = _mm_cmpeq_epi8(d3, c);
    __m128i m0 = _mm_and_si128(c0, c1);
    __m128i m1 = _mm_and_si128(c2, c3);
    __m128i m = _mm_and_si128(m0, m1);
    // m is all ones only if every byte of every pixel matched.
    if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
    {
        return 0;
    }
 #else
    // Compare pixels 1..15 against pixel 0.
    const uint8* ptr = src + 4;
    for( int i=1; i<16; i++ )
    {
        if( memcmp( src, ptr, 4 ) != 0 )
        {
            return 0;
        }
        ptr += 4;
    }
 #endif
    return 0x02000000 |
        ( uint( src[0] & 0xF8 ) << 16 ) |
        ( uint( src[1] & 0xF8 ) << 8 ) |
        ( uint( src[2] & 0xF8 ) );
 }
 // Fills a[0..7] with the quantised average colours of the block at src and
 // accumulates a fit score for each of the four candidate modes into err.
 //
 // err is only added to, so the caller must zero it first.  err[0] and err[1]
 // are scored against a[0..3], err[2] and err[3] against a[4..7]; averages
 // 0,1 feed err[0]/err[2] and averages 2,3 feed err[1]/err[3].
 void PrepareAverages( v4i a[8], const uint8* src, uint err[4] )
 {
    Average( src, a );
    ProcessAverages( a );
    uint channelSums[4][4];
    CalcErrorBlock( src, channelSums );
    for( int half = 0; half < 4; half++ )
    {
        const int split = half / 2;
        err[split] += CalcError( channelSums[half], a[half] );
        err[2 + split] += CalcError( channelSums[half], a[half + 4] );
    }
 }
 // For every pixel and every one of the 8 modifier tables, finds the selector
 // (0..3) whose table entry best matches the pixel.
 //
 //   terr  accumulated squared error per table; the row is chosen by the
 //         parity of the pixel's id.  Only added to, so it must be zeroed by
 //         the caller.
 //   tsel  tsel[i][t] receives the selector chosen for pixel i with table t.
 //   a     base colours; pixel i is compared against a[id[i]].
 //   id    16 entries, one base-colour index per pixel.
 //   data  16 consecutive 4-byte pixels, bytes read as b, g, r, ignored.
 //
 // The pixel is reduced to a single weighted difference
 // dr * 77 + dg * 151 + db * 28, which is compared against the table entries
 // scaled by 256 (g_table256 / g_table256_SIMD).
 void FindBestFit( uint64 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
 {
    for( size_t i=0; i<16; i++ )
    {
        uint16* sel = tsel[i];
        uint bid = id[i];
        uint64* ter = terr[bid%2];
        uint8 b = *data++;
        uint8 g = *data++;
        uint8 r = *data++;
        // Skip the fourth byte of the pixel.
        data++;
        int dr = a[bid][0] - r;
        int dg = a[bid][1] - g;
        int db = a[bid][2] - b;
 #ifdef __SSE4_1__
        // Reference implementation
        __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
        // Tables 0..3.  error0/error1 test the two positive table entries
        // (selectors 0 and 1), error2/error3 the negated ones (selectors 2 and 3).
        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
        __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
        __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
        __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
        __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
        // index0 is 0 or 1; index1 is 2 or 3 (the compare yields -1 when true).
        __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
        __m128i minError0 = _mm_min_epi32(error0, error1);
        __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
        __m128i minError1 = _mm_min_epi32(error2, error3);
        __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
        __m128i minError = _mm_min_epi32(minError0, minError1);
        // Squaring the minimum error to produce correct values when adding
        __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
        __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
        squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
        _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
        __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
        __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
        squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
        _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
        // Tables 4..7, same procedure.
        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
        error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
        error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
        error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
        error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
        index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
        minError0 = _mm_min_epi32(error0, error1);
        index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
        minError1 = _mm_min_epi32(error2, error3);
        __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
        minError = _mm_min_epi32(minError0, minError1);
        // Squaring the minimum error to produce correct values when adding
        minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
        squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
        squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
        _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
        minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
        squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
        squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
        _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
        // Narrow the eight 32-bit selectors to 16 bits and store them for this pixel.
        __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
        _mm_storeu_si128((__m128i*)sel, minIndex);
 #else
        int pix = dr * 77 + dg * 151 + db * 28;
        for( int t=0; t<8; t++ )
        {
            const int64* tab = g_table256[t];
            // Linear search for the entry with the smallest squared error;
            // the earliest entry wins on ties.
            uint idx = 0;
            uint64 err = sq( tab[0] + pix );
            for( int j=1; j<4; j++ )
            {
                uint64 local = sq( tab[j] + pix );
                if( local < err )
                {
                    err = local;
                    idx = j;
                }
            }
            *sel++ = idx;
            *ter++ += err;
        }
 #endif
    }
 }
 #ifdef __SSE4_1__
 // Non-reference implementation, but faster. Produces same results as the AVX2 version
 //
 // Same contract as the uint64 overload, but all 8 tables are evaluated at
 // once in 16-bit lanes and the squared errors are accumulated into 32-bit
 // entries of terr.  terr is only added to, so it must be zeroed by the
 // caller.
 void FindBestFit( uint32 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
 {
    for( size_t i=0; i<16; i++ )
    {
        uint16* sel = tsel[i];
        uint bid = id[i];
        uint32* ter = terr[bid%2];
        uint8 b = *data++;
        uint8 g = *data++;
        uint8 r = *data++;
        // Skip the fourth byte of the pixel.
        data++;
        int dr = a[bid][0] - r;
        int dg = a[bid][1] - g;
        int db = a[bid][2] - b;
        // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
        // This produces slightly different results, but is significantly faster
        __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
        __m128i pix = _mm_abs_epi16(pixel);
        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
        // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
        __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
        __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
        // Bit 0 of the selector: 1 when the larger table entry fits better.
        __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
        __m128i minError = _mm_min_epi16(error0, error1);
        // Exploiting symmetry of the selector table and use the sign bit
        // This produces slightly different results, but is needed to produce same results as AVX2 implementation
        // Bit 1 of the selector is set when `pixel` is non-negative.
        __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
        __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
        // Squaring the minimum error to produce correct values when adding
        // (the low and high 16-bit halves of each product are interleaved into 32-bit lanes).
        __m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
        __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
        __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
        __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
        squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
        _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
        squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
        _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
        _mm_storeu_si128((__m128i*)sel, minIndex);
    }
 }
 #endif
 // Converts f to a 6-bit value (0..63).  f is truncated to int and clamped to
 // [0, 1023] before the integer rounding below is applied.
 uint8_t convert6(float f)
 {
    const int clamped = std::max(0, std::min(static_cast<int>(f), 1023));
    const int i = (clamped - 15) >> 1;
    const int biased = i + 11;
    const int corrected = biased - (biased >> 7) - ((i + 4) >> 7);
    return corrected >> 3;
 }
 // Converts f to a 7-bit value (0..127).  f is truncated to int and clamped to
 // [0, 1023] before the integer rounding below is applied.
 uint8_t convert7(float f)
 {
    const int clamped = std::max(0, std::min(static_cast<int>(f), 1023));
    const int i = (clamped - 15) >> 1;
    const int biased = i + 9;
    const int corrected = biased - (biased >> 8) - ((i + 6) >> 8);
    return corrected >> 2;
 }
 // Fits a colour plane to the block and encodes it.
 //
 // src is read as 16 consecutive 4-byte pixels with bytes b, g, r, ignored.
 // Returns { encoded block, error }: `first` is the 64-bit block (low word
 // from `lo`, high word from `hi`, each passed through bx::endianSwap) and
 // `second` is the sum over all pixels of the squared weighted difference
 // between the source pixel and the colour reconstructed from the quantised
 // plane.
 std::pair<uint64, uint64> Planar(const uint8* src)
 {
    // Channel sums over the 16 pixels.
    int32 r = 0;
    int32 g = 0;
    int32 b = 0;
    for (int i = 0; i < 16; ++i)
    {
        b += src[i * 4 + 0];
        g += src[i * 4 + 1];
        r += src[i * 4 + 2];
    }
    // Correlate each pixel's deviation from the mean (both scaled by 16) with
    // its position along the two block axes.
    int32 difRyz = 0;
    int32 difGyz = 0;
    int32 difByz = 0;
    int32 difRxz = 0;
    int32 difGxz = 0;
    int32 difBxz = 0;
    const int32 scaling[] = { -255, -85, 85, 255 };
    for (int i = 0; i < 16; ++i)
    {
        int32 difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
        int32 difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
        int32 difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
        difRyz += difR * scaling[i % 4];
        difGyz += difG * scaling[i % 4];
        difByz += difB * scaling[i % 4];
        difRxz += difR * scaling[i / 4];
        difGxz += difG * scaling[i / 4];
        difBxz += difB * scaling[i / 4];
    }
    // Plane slopes (a*, b*) and the offset term (d*) per channel.
    const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
    float aR = difRxz * scale;
    float aG = difGxz * scale;
    float aB = difBxz * scale;
    float bR = difRyz * scale;
    float bG = difGyz * scale;
    float bB = difByz * scale;
    float dR = r * (4.0f / 16.0f);
    float dG = g * (4.0f / 16.0f);
    float dB = b * (4.0f / 16.0f);
    // calculating the three colors RGBO, RGBH, and RGBV.  RGB = df - af * x - bf * y;
    float cofR = std::fma(aR,  255.0f, std::fma(bR,  255.0f, dR));
    float cofG = std::fma(aG,  255.0f, std::fma(bG,  255.0f, dG));
    float cofB = std::fma(aB,  255.0f, std::fma(bB,  255.0f, dB));
    float chfR = std::fma(aR, -425.0f, std::fma(bR,  255.0f, dR));
    float chfG = std::fma(aG, -425.0f, std::fma(bG,  255.0f, dG));
    float chfB = std::fma(aB, -425.0f, std::fma(bB,  255.0f, dB));
    float cvfR = std::fma(aR,  255.0f, std::fma(bR, -425.0f, dR));
    float cvfG = std::fma(aG,  255.0f, std::fma(bG, -425.0f, dG));
    float cvfB = std::fma(aB,  255.0f, std::fma(bB, -425.0f, dB));
    // convert to r6g7b6
    int32 coR = convert6(cofR);
    int32 coG = convert7(cofG);
    int32 coB = convert6(cofB);
    int32 chR = convert6(chfR);
    int32 chG = convert7(chfG);
    int32 chB = convert6(chfB);
    int32 cvR = convert6(cvfR);
    int32 cvG = convert7(cvfG);
    int32 cvB = convert6(cvfB);
    // Error calculation
    // *1 values expand the quantised 6/7-bit colours back to 8 bits by
    // replicating their top bits; *o2 is the origin scaled by 4 plus a
    // rounding term, *h2 / *v2 are the per-step deltas from the origin.
    int32 ro0 = coR;
    int32 go0 = coG;
    int32 bo0 = coB;
    int32 ro1 = (ro0 >> 4) | (ro0 << 2);
    int32 go1 = (go0 >> 6) | (go0 << 1);
    int32 bo1 = (bo0 >> 4) | (bo0 << 2);
    int32 ro2 = (ro1 << 2) + 2;
    int32 go2 = (go1 << 2) + 2;
    int32 bo2 = (bo1 << 2) + 2;
    int32 rh0 = chR;
    int32 gh0 = chG;
    int32 bh0 = chB;
    int32 rh1 = (rh0 >> 4) | (rh0 << 2);
    int32 gh1 = (gh0 >> 6) | (gh0 << 1);
    int32 bh1 = (bh0 >> 4) | (bh0 << 2);
    int32 rh2 = rh1 - ro1;
    int32 gh2 = gh1 - go1;
    int32 bh2 = bh1 - bo1;
    int32 rv0 = cvR;
    int32 gv0 = cvG;
    int32 bv0 = cvB;
    int32 rv1 = (rv0 >> 4) | (rv0 << 2);
    int32 gv1 = (gv0 >> 6) | (gv0 << 1);
    int32 bv1 = (bv0 >> 4) | (bv0 << 2);
    int32 rv2 = rv1 - ro1;
    int32 gv2 = gv1 - go1;
    int32 bv2 = bv1 - bo1;
    uint64 error = 0;
    for (int i = 0; i < 16; ++i)
    {
        // Reconstruct the pixel from the quantised plane and compare it with
        // the source using the weights 38 / 76 / 14 for r / g / b.
        int32 cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2);
        int32 cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2);
        int32 cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2);
        int32 difB = static_cast<int>(src[i * 4 + 0]) - cB;
        int32 difG = static_cast<int>(src[i * 4 + 1]) - cG;
        int32 difR = static_cast<int>(src[i * 4 + 2]) - cR;
        int32 dif = difR * 38 + difG * 76 + difB * 14;
        error += dif * dif;
    }
    /**/
    // Pack the three colours into the two 32-bit words of the block.
    uint32 rgbv = cvB | (cvG << 6) | (cvR << 13);
    uint32 rgbh = chB | (chG << 6) | (chR << 13);
    uint32 hi = rgbv | ((rgbh & 0x1FFF) << 19);
    uint32 lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C);
    lo |= ((coB & 0x07) <<  7) | ((coB & 0x18) <<  8) | ((coB & 0x20) << 11);
    lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18);
    lo |= coR << 25;
    // g_flags supplies additional bits of the low word, selected by a 6-bit
    // index built from bits of the origin colour.
    const int32 idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1);
    lo |= g_flags[idx];
    uint64 result = static_cast<uint32>(bx::endianSwap(lo));
    result |= static_cast<uint64>(static_cast<uint32>(bx::endianSwap(hi))) << 32;
    return std::make_pair(result, error);
 }
 // Variant of EncodeSelectors that competes against an alternative encoding.
 //
 // If the summed least error of the two terr rows is not strictly smaller
 // than `error`, `value` is returned unchanged.  Otherwise the table indices
 // and per-pixel selectors are ORed into d (same bit layout as the four
 // argument overload) and the result is passed through FixByteOrder().
 template<class T, class S>
 uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id, const uint64 value, const uint64 error)
 {
    const size_t best0 = GetLeastError( terr[0], 8 );
    const size_t best1 = GetLeastError( terr[1], 8 );
    if( error <= ( terr[0][best0] + terr[1][best1] ) )
    {
        return value;
    }
    d |= best0 << 26;
    d |= best1 << 29;
    for( int px = 0; px < 16; ++px )
    {
        const size_t table = ( id[px] % 2 ) ? best1 : best0;
        const uint64 sel = tsel[px][table];
        d |= ( sel & 0x1 ) << ( px + 32 );
        d |= ( sel & 0x2 ) << ( px + 47 );
    }
    return FixByteOrder( d );
 }
 }
 // Encodes the 64-byte pixel block at src into one 64-bit block.
 //
 // Blocks in which every pixel is identical are returned straight from
 // CheckSolid().  Otherwise the best of the four average-colour modes is
 // chosen, its base colours are encoded, and the per-pixel selectors are
 // added; that result is passed through FixByteOrder().
 uint64 ProcessRGB( const uint8* src )
 {
    const uint64 solid = CheckSolid( src );
    if( solid != 0 )
    {
        return solid;
    }
    v4i averages[8];
    uint modeError[4] = {};
    PrepareAverages( averages, src, modeError );
    const size_t mode = GetLeastError( modeError, 4 );
    uint64 block = 0;
    EncodeAverages( block, averages, mode );
    // The element type of the error table selects the FindBestFit overload.
 #if defined __SSE4_1__ && !defined REFERENCE_IMPLEMENTATION
    uint32 tableError[2][8] = {};
 #else
    uint64 tableError[2][8] = {};
 #endif
    uint16 selectors[16][8];
    const uint32* pixelIds = g_id[mode];
    FindBestFit( tableError, selectors, averages, pixelIds, src );
    return FixByteOrder( EncodeSelectors( block, tableError, selectors, pixelIds ) );
 }
 // Encodes the 64-byte pixel block at src into one 64-bit block, choosing
 // between the planar encoding and the average-colour encoding.
 //
 // The planar candidate and its error come from Planar(); the average-colour
 // candidate is built as in ProcessRGB() (always with 64-bit error sums and
 // without the solid-block shortcut).  The six argument EncodeSelectors()
 // returns the planar block unless the average-colour error is strictly
 // smaller.
 uint64 ProcessRGB_ETC2( const uint8* src )
 {
    const std::pair<uint64, uint64> planar = Planar( src );
    v4i averages[8];
    uint modeError[4] = {};
    PrepareAverages( averages, src, modeError );
    const size_t mode = GetLeastError( modeError, 4 );
    uint64 block = 0;
    EncodeAverages( block, averages, mode );
    uint64 tableError[2][8] = {};
    uint16 selectors[16][8];
    const uint32* pixelIds = g_id[mode];
    FindBestFit( tableError, selectors, averages, pixelIds, src );
    return EncodeSelectors( block, tableError, selectors, pixelIds, planar.first, planar.second );
 }
--- a/3rdparty/etc2/ProcessRGB.hpp
+++ b/3rdparty/etc2/ProcessRGB.hpp
@ -0,0 +1,9 @@
 #ifndef __PROCESSRGB_HPP__
 #define __PROCESSRGB_HPP__
 #include "Types.hpp"
 // Both functions read 64 bytes from src (16 pixels of 4 bytes each) and
 // return one encoded 64-bit block.
 // ProcessRGB uses the average-colour modes only.
 uint64 ProcessRGB( const uint8* src );
 // ProcessRGB_ETC2 additionally evaluates a planar encoding and returns
 // whichever candidate has the lower error.
 uint64 ProcessRGB_ETC2( const uint8* src );
 #endif
--- a/3rdparty/etc2/Tables.cpp
+++ b/3rdparty/etc2/Tables.cpp
@ -0,0 +1,109 @@
 #include "Tables.hpp"
 // Modifier tables: 8 rows of { small, large, -small, -large }.  The column
 // index is the selector value (0..3).
 const int32 g_table[8][4] = {
    {  2,  8,   -2,   -8 },
    {  5, 17,   -5,  -17 },
    {  9, 29,   -9,  -29 },
    { 13, 42,  -13,  -42 },
    { 18, 60,  -18,  -60 },
    { 24, 80,  -24,  -80 },
    { 33, 106, -33, -106 },
    { 47, 183, -47, -183 }
 };
 // g_table with every entry multiplied by 256, matching the scale of the
 // weighted pixel difference (weights 77 + 151 + 28 = 256) in FindBestFit.
 const int64 g_table256[8][4] = {
    {  2*256,  8*256,   -2*256,   -8*256 },
    {  5*256, 17*256,   -5*256,  -17*256 },
    {  9*256, 29*256,   -9*256,  -29*256 },
    { 13*256, 42*256,  -13*256,  -42*256 },
    { 18*256, 60*256,  -18*256,  -60*256 },
    { 24*256, 80*256,  -24*256,  -80*256 },
    { 33*256, 106*256, -33*256, -106*256 },
    { 47*256, 183*256, -47*256, -183*256 }
 };
 // Per-mode pixel-to-colour map.  The row is the mode index chosen by
 // GetLeastError; entry i is the index into the 8-entry average array used
 // for pixel i.  Its parity also selects which of the two error rows and
 // table indices the pixel belongs to.
 const uint32 g_id[4][16] = {
    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
    { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
    { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
 };
 // Expands a 4-bit value n to 8 bits by replicating the nibble (n * 0x11).
 const uint32 g_avg2[16] = {
    0x00,
    0x11,
    0x22,
    0x33,
    0x44,
    0x55,
    0x66,
    0x77,
    0x88,
    0x99,
    0xAA,
    0xBB,
    0xCC,
    0xDD,
    0xEE,
    0xFF
 };
 // Extra bits ORed into the low word of a planar block.  Planar() indexes
 // this table with a 6-bit value built from bits of the quantised origin
 // colour.
 const uint32 g_flags[64] = {
    0x80800402, 0x80800402, 0x80800402, 0x80800402,
    0x80800402, 0x80800402, 0x80800402, 0x8080E002,
    0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
    0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
    0x80000402, 0x80000402, 0x80000402, 0x80000402,
    0x80000402, 0x80000402, 0x80000402, 0x8000E002,
    0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
    0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
    0x00800402, 0x00800402, 0x00800402, 0x00800402,
    0x00800402, 0x00800402, 0x00800402, 0x0080E002,
    0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
    0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
    0x00000402, 0x00000402, 0x00000402, 0x00000402,
    0x00000402, 0x00000402, 0x00000402, 0x0000E002,
    0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
    0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
 };
 #ifdef __SSE4_1__
 // 8-bit counterpart of g_flags with the same 64-entry layout.
 // NOTE(review): not referenced by the code visible here; presumably consumed
 // by an AVX2 code path - confirm before changing.
 const uint8 g_flags_AVX2[64] =
 {
    0x63, 0x63, 0x63, 0x63,
    0x63, 0x63, 0x63, 0x7D,
    0x63, 0x63, 0x7D, 0x7D,
    0x63, 0x7D, 0x7D, 0x7D,
    0x43, 0x43, 0x43, 0x43,
    0x43, 0x43, 0x43, 0x5D,
    0x43, 0x43, 0x5D, 0x5D,
    0x43, 0x5D, 0x5D, 0x5D,
    0x23, 0x23, 0x23, 0x23,
    0x23, 0x23, 0x23, 0x3D,
    0x23, 0x23, 0x3D, 0x3D,
    0x23, 0x3D, 0x3D, 0x3D,
    0x03, 0x03, 0x03, 0x03,
    0x03, 0x03, 0x03, 0x1D,
    0x03, 0x03, 0x1D, 0x1D,
    0x03, 0x1D, 0x1D, 0x1D,
 };
 // The two positive columns of g_table, one 16-bit lane per table row:
 // [0] holds the small entries, [1] the large entries.
 const __m128i g_table_SIMD[2] =
 {
    _mm_setr_epi16(   2,   5,   9,  13,  18,  24,  33,  47),
    _mm_setr_epi16(   8,  17,  29,  42,  60,  80, 106, 183)
 };
 // g_table_SIMD with every entry multiplied by 128, matching the halved pixel
 // weights (38 + 76 + 14 = 128) of the 16-bit FindBestFit overload.
 const __m128i g_table128_SIMD[2] =
 {
    _mm_setr_epi16(   2*128,   5*128,   9*128,  13*128,  18*128,  24*128,  33*128,  47*128),
    _mm_setr_epi16(   8*128,  17*128,  29*128,  42*128,  60*128,  80*128, 106*128, 183*128)
 };
 // The positive columns of g_table256 in 32-bit lanes, four table rows per
 // vector: [0] small entries of rows 0..3, [1] large entries of rows 0..3,
 // [2] small entries of rows 4..7, [3] large entries of rows 4..7.
 const __m128i g_table256_SIMD[4] =
 {
    _mm_setr_epi32(  2*256,   5*256,   9*256,  13*256),
    _mm_setr_epi32(  8*256,  17*256,  29*256,  42*256),
    _mm_setr_epi32( 18*256,  24*256,  33*256,  47*256),
    _mm_setr_epi32( 60*256,  80*256, 106*256, 183*256)
 };
 #endif
--- a/3rdparty/etc2/Tables.hpp
+++ b/3rdparty/etc2/Tables.hpp
@ -0,0 +1,25 @@
 #ifndef __TABLES_HPP__
 #define __TABLES_HPP__
 #include "Types.hpp"
 #ifdef __SSE4_1__
 #include <smmintrin.h>
 #endif
 // Scalar lookup tables. They are only declared here; the definitions are
 // provided out of line.
 extern const int32 g_table[8][4];
 extern const int64 g_table256[8][4];
 extern const uint32 g_id[4][16];
 extern const uint32 g_avg2[16];
 extern const uint32 g_flags[64];
 #ifdef __SSE4_1__
 // SIMD-friendly variants of the tables, declared only when the compiler
 // targets SSE4.1 (the __m128i type comes from <smmintrin.h> included above).
 extern const uint8 g_flags_AVX2[64];
 extern const __m128i g_table_SIMD[2];
 extern const __m128i g_table128_SIMD[2];
 extern const __m128i g_table256_SIMD[4];
 #endif
 #endif
--- a/3rdparty/etc2/Types.hpp
+++ b/3rdparty/etc2/Types.hpp
@ -0,0 +1,17 @@
 #ifndef __DARKRL__TYPES_HPP__
 #define __DARKRL__TYPES_HPP__
 #include <cstdint>
 // Short spellings for the fixed-width integer types from <cstdint>.
 using int8   = int8_t;
 using uint8  = uint8_t;
 using int16  = int16_t;
 using uint16 = uint16_t;
 using int32  = int32_t;
 using uint32 = uint32_t;
 using int64  = int64_t;
 using uint64 = uint64_t;
 // Plain unsigned int, used for indices and counts.
 using uint   = unsigned int;
 #endif
--- a/3rdparty/etc2/Vector.hpp
+++ b/3rdparty/etc2/Vector.hpp
@ -0,0 +1,222 @@
 #ifndef __DARKRL__VECTOR_HPP__
 #define __DARKRL__VECTOR_HPP__
 #include <assert.h>
 #include <algorithm>
 #include <math.h>
 #include "Math.hpp"
 #include "Types.hpp"
 // Two-component vector with component-wise arithmetic.
 template<class T>
 struct Vector2
 {
    // Zero vector.
    Vector2() : x( 0 ), y( 0 ) {}
    // Splat constructor: both components receive the same value.
    Vector2( T value ) : x( value ), y( value ) {}
    Vector2( T xValue, T yValue ) : x( xValue ), y( yValue ) {}

    // Two vectors are equal when both components compare equal.
    bool operator==( const Vector2<T>& other ) const
    {
        if( !( x == other.x ) )
        {
            return false;
        }
        return y == other.y;
    }
    bool operator!=( const Vector2<T>& other ) const
    {
        return !operator==( other );
    }

    // Component-wise compound assignments; each returns *this for chaining.
    Vector2<T>& operator+=( const Vector2<T>& other )
    {
        x += other.x;
        y += other.y;
        return *this;
    }
    Vector2<T>& operator-=( const Vector2<T>& other )
    {
        x -= other.x;
        y -= other.y;
        return *this;
    }
    Vector2<T>& operator*=( const Vector2<T>& other )
    {
        x *= other.x;
        y *= other.y;
        return *this;
    }

    T x, y;
 };

 // Component-wise sum.
 template<class T>
 Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs )
 {
    Vector2<T> sum( lhs );
    sum += rhs;
    return sum;
 }

 // Component-wise difference.
 template<class T>
 Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs )
 {
    Vector2<T> difference( lhs );
    difference -= rhs;
    return difference;
 }

 // Scales both components by a float; the products are converted back to T.
 template<class T>
 Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs )
 {
    const T scaledX = T( lhs.x * rhs );
    const T scaledY = T( lhs.y * rhs );
    return Vector2<T>( scaledX, scaledY );
 }

 // Divides both components by a scalar of the component type.
 template<class T>
 Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs )
 {
    Vector2<T> quotient( lhs );
    quotient.x /= rhs;
    quotient.y /= rhs;
    return quotient;
 }
 // Convenience names for the two Vector2 instantiations: 32-bit integer and float.
 typedef Vector2<int32> v2i;
 typedef Vector2<float> v2f;
 // Three-component vector (x, y, z) with component-wise arithmetic.
 // A fourth, unused `padding` member makes the object four T's wide.
 template<class T>
 struct Vector3
 {
    // Zero vector.
    Vector3() : x( 0 ), y( 0 ), z( 0 ), padding( 0 ) {}
    // Splat constructor: all three components receive the same value.
    Vector3( T v ) : x( v ), y( v ), z( v ), padding( 0 ) {}
    Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ), padding( 0 ) {}
    // Converts from a vector of another component type, casting each component.
    template<class Y>
    Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ), padding( 0 ) {}

    // Weighted sum 0.3*x + 0.59*y + 0.11*z, converted back to T.
    T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); }

    // Clamps every component to the range [0, 1].
    void Clamp()
    {
        x = std::min( T(1), std::max( T(0), x ) );
        y = std::min( T(1), std::max( T(0), y ) );
        z = std::min( T(1), std::max( T(0), z ) );
    }

    // Equality looks at x, y and z only; padding is ignored.
    bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
    // Takes a Vector3 (not a Vector2) so that `a != b` is usable on this type.
    bool operator!=( const Vector3<T>& rhs ) const { return !( *this == rhs ); }

    // Indexed access: 0 -> x, 1 -> y, 2 -> z. Relies on the three members being
    // laid out back to back in declaration order.
    T& operator[]( unsigned int idx ) { assert( idx < 3 ); return ((T*)this)[idx]; }
    const T& operator[]( unsigned int idx ) const { assert( idx < 3 ); return ((const T*)this)[idx]; }

    // Compound assignments return a reference to *this so that chained
    // expressions operate on the object itself rather than on a copy.
    Vector3<T>& operator+=( const Vector3<T>& rhs )
    {
        x += rhs.x;
        y += rhs.y;
        z += rhs.z;
        return *this;
    }
    Vector3<T>& operator*=( const Vector3<T>& rhs )
    {
        x *= rhs.x;
        y *= rhs.y;
        z *= rhs.z;
        return *this;
    }
    Vector3<T>& operator*=( const float& rhs )
    {
        x *= rhs;
        y *= rhs;
        z *= rhs;
        return *this;
    }

    T x, y, z;
    // Never read by any operation here; zero-initialised so that copying the
    // object does not read an indeterminate value.
    T padding;
 };

 // Component-wise sum.
 template<class T>
 Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs )
 {
    return Vector3<T>( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z );
 }

 // Component-wise difference.
 template<class T>
 Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs )
 {
    return Vector3<T>( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z );
 }

 // Component-wise product.
 template<class T>
 Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs )
 {
    return Vector3<T>( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z );
 }

 // Scales every component by a float; the products are cast back to T.
 template<class T>
 Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs )
 {
    return Vector3<T>( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) );
 }

 // Divides every component by a scalar of the component type.
 template<class T>
 Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs )
 {
    return Vector3<T>( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs );
 }

 // Orders vectors by Luminance(), not lexicographically.
 template<class T>
 bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs )
 {
    return lhs.Luminance() < rhs.Luminance();
 }
 // Convenience names for the Vector3 instantiations: 32-bit integer, float
 // and 8-bit unsigned components.
 typedef Vector3<int32> v3i;
 typedef Vector3<float> v3f;
 typedef Vector3<uint8> v3b;
 // Converts a float vector to 8-bit components by scaling with 255 and
 // truncating. Each component is first clamped to [0, 1]: values above 1 map
 // to 255 as before, and negative values now map to 0 instead of being
 // converted to uint8 out of range, which is undefined behaviour.
 static inline v3b v3f_to_v3b( const v3f& v )
 {
    const float r = std::max( 0.f, std::min( 1.f, v.x ) );
    const float g = std::max( 0.f, std::min( 1.f, v.y ) );
    const float b = std::max( 0.f, std::min( 1.f, v.z ) );
    return v3b( uint8( r * 255 ), uint8( g * 255 ), uint8( b * 255 ) );
 }
 // Linear interpolation from v1 (amount == 0) towards v2 (amount == 1).
 template<class T>
 Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount )
 {
    const Vector3<T> delta = v2 - v1;
    return v1 + delta * amount;
 }

 // Byte-vector specialisation: interpolate in float, then convert the result
 // back to 8-bit components.
 template<>
 inline v3b Mix( const v3b& v1, const v3b& v2, float amount )
 {
    const v3f from( v1 );
    const v3f to( v2 );
    const v3f blended = from + ( to - from ) * amount;
    return v3b( blended );
 }
 // Returns a grey vector whose three components all equal v's luminance.
 template<class T>
 Vector3<T> Desaturate( const Vector3<T>& v )
 {
    const T grey = v.Luminance();
    return Vector3<T>( grey );
 }

 // As above, with the luminance scaled by `mul` before it is converted to T.
 template<class T>
 Vector3<T> Desaturate( const Vector3<T>& v, float mul )
 {
    const T grey = T( v.Luminance() * mul );
    return Vector3<T>( grey );
 }
 // Raises each component of `base` to `exponent` using the scalar pow().
 template<class T>
 Vector3<T> pow( const Vector3<T>& base, float exponent )
 {
    const T px = pow( base.x, exponent );
    const T py = pow( base.y, exponent );
    const T pz = pow( base.z, exponent );
    return Vector3<T>( px, py, pz );
 }
 // Applies the scalar sRGB2linear() to each component independently.
 template<class T>
 Vector3<T> sRGB2linear( const Vector3<T>& v )
 {
    const T lx = sRGB2linear( v.x );
    const T ly = sRGB2linear( v.y );
    const T lz = sRGB2linear( v.z );
    return Vector3<T>( lx, ly, lz );
 }
 // Applies the scalar linear2sRGB() to each component independently.
 template<class T>
 Vector3<T> linear2sRGB( const Vector3<T>& v )
 {
    const T sx = linear2sRGB( v.x );
    const T sy = linear2sRGB( v.y );
    const T sz = linear2sRGB( v.z );
    return Vector3<T>( sx, sy, sz );
 }
 #endif
--- a/scripts/texturec.lua
+++ b/scripts/texturec.lua
@ -21,6 +21,8 @@ project "texturec"
 		path.join(BGFX_DIR, "3rdparty/libsquish/**.h"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.cpp"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.h"),
 		path.join(BGFX_DIR, "3rdparty/etc2/**.cpp"),
 		path.join(BGFX_DIR, "3rdparty/etc2/**.hpp"),
 		path.join(BGFX_DIR, "3rdparty/nvtt/**.cpp"),
 		path.join(BGFX_DIR, "3rdparty/nvtt/**.h"),
 		path.join(BGFX_DIR, "3rdparty/pvrtc/**.cpp"),
--- a/tools/texturec/texturec.cpp
+++ b/tools/texturec/texturec.cpp
@ -13,6 +13,7 @@
 #include "image.h"
 #include <libsquish/squish.h>
 #include <etc1/etc1.h>
 #include <etc2/ProcessRGB.hpp>
 #include <nvtt/nvtt.h>
 #include <pvrtc/PvrTcEncoder.h>
 #include <tinyexr/tinyexr.h>
@ -86,6 +87,23 @@ namespace bgfx
 			etc1_encode_image( (const uint8_t*)_src, _width, _height, 4, _width*4, (uint8_t*)_dst);
 			return true;
 		// ETC2: emits one 64-bit value per tile, walking _width/4 by _height/4 tiles.
 		case TextureFormat::ETC2:
 			{
 				// NOTE(review): presumably bytes per source row at 4 bytes per pixel -- confirm.
 				const uint32_t pitch  = _width*4;
 				// Integer division: trailing columns/rows are not visited when the
 				// dimensions are not multiples of 4.
 				const uint32_t width  = _width/4;
 				const uint32_t height = _height/4;
 				const uint8_t* src = (const uint8_t*)_src;
 				uint64_t* dst = (uint64_t*)_dst;
 				for (uint32_t yy = 0; yy < height; ++yy)
 				{
 					for (uint32_t xx = 0; xx < width; ++xx)
 					{
 						// NOTE(review): the byte offset is yy*pitch*4 + xx*4, so each step in
 						// xx advances only 4 bytes. If pitch is bytes-per-row of 4-byte pixels,
 						// horizontally adjacent 4-pixel tiles start 16 bytes apart, which would
 						// need xx*4 inside the parentheses -- verify.
 						// NOTE(review): also confirm whether ProcessRGB_ETC2 expects a contiguous
 						// 16-pixel buffer rather than a pointer into the pitched image.
 						*dst++ = ProcessRGB_ETC2(&src[(yy*pitch+xx)*4]);
 					}
 				}
 			}
 			return true;
 		case TextureFormat::PTC14:
 			{
 				using namespace Javelin;