From 8ab70bd8cfa2016cc418322a31a95151e45a0031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Branimir=20Karad=C5=BEi=C4=87?= Date: Sun, 20 Dec 2015 20:40:35 -0800 Subject: [PATCH] Added stripped down NVTT library. --- .../nvtt/NVIDIA_Texture_Tools_LICENSE.txt | 24 + 3rdparty/nvtt/bc6h/bits.h | 76 + 3rdparty/nvtt/bc6h/shapes_two.h | 133 ++ 3rdparty/nvtt/bc6h/tile.h | 83 ++ 3rdparty/nvtt/bc6h/zoh.cpp | 197 +++ 3rdparty/nvtt/bc6h/zoh.h | 65 + 3rdparty/nvtt/bc6h/zoh_utils.cpp | 324 +++++ 3rdparty/nvtt/bc6h/zoh_utils.h | 73 + 3rdparty/nvtt/bc6h/zohone.cpp | 799 +++++++++++ 3rdparty/nvtt/bc6h/zohtwo.cpp | 883 ++++++++++++ 3rdparty/nvtt/bc7/avpcl.cpp | 264 ++++ 3rdparty/nvtt/bc7/avpcl.h | 99 ++ 3rdparty/nvtt/bc7/avpcl_mode0.cpp | 1066 ++++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode1.cpp | 1047 ++++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode2.cpp | 1004 +++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode3.cpp | 1059 ++++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode4.cpp | 1214 ++++++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode5.cpp | 1216 ++++++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode6.cpp | 1055 ++++++++++++++ 3rdparty/nvtt/bc7/avpcl_mode7.cpp | 1094 ++++++++++++++ 3rdparty/nvtt/bc7/avpcl_utils.cpp | 389 +++++ 3rdparty/nvtt/bc7/avpcl_utils.h | 61 + 3rdparty/nvtt/bc7/bits.h | 76 + 3rdparty/nvtt/bc7/endpts.h | 81 ++ 3rdparty/nvtt/bc7/shapes_three.h | 132 ++ 3rdparty/nvtt/bc7/shapes_two.h | 133 ++ 3rdparty/nvtt/bc7/tile.h | 41 + 3rdparty/nvtt/nvcore/Array.inl | 437 ++++++ 3rdparty/nvtt/nvcore/Debug.h | 216 +++ 3rdparty/nvtt/nvcore/array.h | 181 +++ 3rdparty/nvtt/nvcore/defsgnucdarwin.h | 53 + 3rdparty/nvtt/nvcore/defsgnuclinux.h | 59 + 3rdparty/nvtt/nvcore/defsgnucwin32.h | 65 + 3rdparty/nvtt/nvcore/defsvcwin32.h | 94 ++ 3rdparty/nvtt/nvcore/foreach.h | 68 + 3rdparty/nvtt/nvcore/hash.h | 83 ++ 3rdparty/nvtt/nvcore/memory.h | 29 + 3rdparty/nvtt/nvcore/nvcore.h | 299 ++++ 3rdparty/nvtt/nvcore/posh.h | 1030 +++++++++++++ 3rdparty/nvtt/nvcore/stdstream.h | 459 ++++++ 3rdparty/nvtt/nvcore/stream.h | 163 +++ 3rdparty/nvtt/nvcore/strlib.h | 429 ++++++ 3rdparty/nvtt/nvcore/utils.h | 281 ++++ 3rdparty/nvtt/nvmath/Vector.inl | 921 ++++++++++++ 3rdparty/nvtt/nvmath/fitting.cpp | 1200 ++++++++++++++++ 3rdparty/nvtt/nvmath/fitting.h | 49 + 3rdparty/nvtt/nvmath/matrix.h | 112 ++ 3rdparty/nvtt/nvmath/matrix.inl | 1274 +++++++++++++++++ 3rdparty/nvtt/nvmath/nvmath.h | 56 + 3rdparty/nvtt/nvmath/plane.h | 40 + 3rdparty/nvtt/nvmath/plane.inl | 49 + 3rdparty/nvtt/nvmath/vector.h | 148 ++ 3rdparty/nvtt/nvtt.cpp | 95 ++ 3rdparty/nvtt/nvtt.h | 13 + scripts/texturec.lua | 3 + tools/texturec/texturec.cpp | 32 + 56 files changed, 20626 insertions(+) create mode 100644 3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt create mode 100644 3rdparty/nvtt/bc6h/bits.h create mode 100644 3rdparty/nvtt/bc6h/shapes_two.h create mode 100644 3rdparty/nvtt/bc6h/tile.h create mode 100644 3rdparty/nvtt/bc6h/zoh.cpp create mode 100644 3rdparty/nvtt/bc6h/zoh.h create mode 100644 3rdparty/nvtt/bc6h/zoh_utils.cpp create mode 100644 3rdparty/nvtt/bc6h/zoh_utils.h create mode 100644 3rdparty/nvtt/bc6h/zohone.cpp create mode 100644 3rdparty/nvtt/bc6h/zohtwo.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl.h create mode 100644 3rdparty/nvtt/bc7/avpcl_mode0.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_mode1.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_mode2.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_mode3.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_mode4.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_mode5.cpp create mode 100644 
3rdparty/nvtt/bc7/avpcl_mode6.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_mode7.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_utils.cpp create mode 100644 3rdparty/nvtt/bc7/avpcl_utils.h create mode 100644 3rdparty/nvtt/bc7/bits.h create mode 100644 3rdparty/nvtt/bc7/endpts.h create mode 100644 3rdparty/nvtt/bc7/shapes_three.h create mode 100644 3rdparty/nvtt/bc7/shapes_two.h create mode 100644 3rdparty/nvtt/bc7/tile.h create mode 100644 3rdparty/nvtt/nvcore/Array.inl create mode 100644 3rdparty/nvtt/nvcore/Debug.h create mode 100644 3rdparty/nvtt/nvcore/array.h create mode 100644 3rdparty/nvtt/nvcore/defsgnucdarwin.h create mode 100644 3rdparty/nvtt/nvcore/defsgnuclinux.h create mode 100644 3rdparty/nvtt/nvcore/defsgnucwin32.h create mode 100644 3rdparty/nvtt/nvcore/defsvcwin32.h create mode 100644 3rdparty/nvtt/nvcore/foreach.h create mode 100644 3rdparty/nvtt/nvcore/hash.h create mode 100644 3rdparty/nvtt/nvcore/memory.h create mode 100644 3rdparty/nvtt/nvcore/nvcore.h create mode 100644 3rdparty/nvtt/nvcore/posh.h create mode 100644 3rdparty/nvtt/nvcore/stdstream.h create mode 100644 3rdparty/nvtt/nvcore/stream.h create mode 100644 3rdparty/nvtt/nvcore/strlib.h create mode 100644 3rdparty/nvtt/nvcore/utils.h create mode 100644 3rdparty/nvtt/nvmath/Vector.inl create mode 100644 3rdparty/nvtt/nvmath/fitting.cpp create mode 100644 3rdparty/nvtt/nvmath/fitting.h create mode 100644 3rdparty/nvtt/nvmath/matrix.h create mode 100644 3rdparty/nvtt/nvmath/matrix.inl create mode 100644 3rdparty/nvtt/nvmath/nvmath.h create mode 100644 3rdparty/nvtt/nvmath/plane.h create mode 100644 3rdparty/nvtt/nvmath/plane.inl create mode 100644 3rdparty/nvtt/nvmath/vector.h create mode 100644 3rdparty/nvtt/nvtt.cpp create mode 100644 3rdparty/nvtt/nvtt.h diff --git a/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt b/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt new file mode 100644 index 00000000..c422f717 --- /dev/null +++ b/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt @@ -0,0 +1,24 @@ +NVIDIA Texture Tools 2.0 is licensed under the MIT license. + +Copyright (c) 2007 NVIDIA Corporation + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. diff --git a/3rdparty/nvtt/bc6h/bits.h b/3rdparty/nvtt/bc6h/bits.h new file mode 100644 index 00000000..67261351 --- /dev/null +++ b/3rdparty/nvtt/bc6h/bits.h @@ -0,0 +1,76 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+ +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ +#pragma once +#ifndef _ZOH_BITS_H +#define _ZOH_BITS_H + +// read/write a bitstream + +#include "nvcore/Debug.h" + +namespace ZOH { + +class Bits +{ +public: + + Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;} + Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;} + + void write(int value, int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + for (int i=0; i<nbits; ++i) + writeone(value>>i); + } + int read(int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + int out = 0; + for (int i=0; i<nbits; ++i) + out |= readone() << i; + return out; + } + int getptr() { return bptr; } + void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; } + int getsize() { return bend; } + +private: + int bptr; // next bit to read + int bend; // last written bit + 1 + char *bits; // ptr to user bit stream + const char *cbits; // ptr to const user bit stream + int maxbits; // max size of user bit stream + char readonly; // 1 if this is a read-only stream + + int readone() { + nvAssert (bptr < bend); + if (bptr >= bend) return 0; + int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7)); + ++bptr; + return bit != 0; + } + void writeone(int bit) { + nvAssert (!readonly); // "Writing a read-only bit stream" + nvAssert (bptr < maxbits); + if (bptr >= maxbits) return; + if (bit&1) + bits[bptr>>3] |= 1 << (bptr & 7); + else + bits[bptr>>3] &= ~(1 << (bptr & 7)); + if (bptr++ >= bend) bend = bptr; + } +}; + +} + +#endif diff --git a/3rdparty/nvtt/bc6h/shapes_two.h b/3rdparty/nvtt/bc6h/shapes_two.h new file mode 100644 index 00000000..2fc55599 --- /dev/null +++ b/3rdparty/nvtt/bc6h/shapes_two.h @@ -0,0 +1,133 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ +#pragma once +#ifndef _ZOH_SHAPES_TWO_H +#define _ZOH_SHAPES_TWO_H + +// shapes for two regions + +#define NREGIONS 2 +#define NSHAPES 64 +#define SHAPEBITS 6 + +static const int shapes[NSHAPES*16] = +{ +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, +0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, +1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, +1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, +1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, +0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, +1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, +1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, + +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, + +0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, +1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, +0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, +1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, + +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, +1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, +1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, +1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, +0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, + +0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, +1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, +1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, +0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, + +0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, +1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, +1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, +1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, +0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, + +}; + +#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16] + +static const int shapeindex_to_compressed_indices[NSHAPES*2] = +{ + 0,15, 0,15, 0,15, 0,15, + 0,15, 
0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + + 0,15, 0, 2, 0, 8, 0, 2, + 0, 2, 0, 8, 0, 8, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 8, 0, 8, 0, 2, 0, 2, + + 0,15, 0,15, 0, 6, 0, 8, + 0, 2, 0, 8, 0,15, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 2, 0,15, 0,15, 0, 6, + + 0, 6, 0, 2, 0, 6, 0, 8, + 0,15, 0,15, 0, 2, 0, 2, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0, 2, 0, 2, 0,15 + +}; +#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*2+(region)] + +#endif diff --git a/3rdparty/nvtt/bc6h/tile.h b/3rdparty/nvtt/bc6h/tile.h new file mode 100644 index 00000000..d1ee9bdf --- /dev/null +++ b/3rdparty/nvtt/bc6h/tile.h @@ -0,0 +1,83 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ +#pragma once +#ifndef _ZOH_TILE_H +#define _ZOH_TILE_H + +#include "zoh_utils.h" +#include "nvmath/Vector.h" +#include <math.h> + +namespace ZOH { + +//#define USE_IMPORTANCE_MAP 1 // define this if you want to increase importance of some pixels in tile +class Tile +{ +public: + // NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value + static float half2float(uint16 h) + { + return (float) Utils::ushort_to_format(h); + } + // NOTE: this is the inverse of the above operation + static uint16 float2half(float f) + { + return Utils::format_to_ushort((int)f); + } + + // look for adjacent pixels that are identical. if there are enough of them, increase their importance + void generate_importance_map() + { + // initialize + for (int y=0; y<size_y; ++y) + for (int x=0; x<size_x; ++x) + { + importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f; + } + } + bool is_equal(int x, int y, int xn, int yn) + { + if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y) + return false; + return( (data[y][x].x == data[yn][xn].x) && + (data[y][x].y == data[yn][xn].y) && + (data[y][x].z == data[yn][xn].z) ); + } + +#ifdef USE_IMPORTANCE_MAP + bool match_4_neighbor(int x, int y) + { + return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1); + } +#else + bool match_4_neighbor(int, int) + { + return false; + } +#endif + + Tile() {}; + ~Tile(){}; + Tile(int xs, int ys) {size_x = xs; size_y = ys;} + + static const int TILE_H = 4; + static const int TILE_W = 4; + static const int TILE_TOTAL = TILE_H * TILE_W; + nv::Vector3 data[TILE_H][TILE_W]; + float importance_map[TILE_H][TILE_W]; + int size_x, size_y; // actual size of tile +}; + +} + +#endif // _ZOH_TILE_H diff --git a/3rdparty/nvtt/bc6h/zoh.cpp b/3rdparty/nvtt/bc6h/zoh.cpp new file mode 100644 index 00000000..3053ea15 --- /dev/null +++ b/3rdparty/nvtt/bc6h/zoh.cpp @@ -0,0 +1,197 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// the zoh compressor and decompressor + +#include "tile.h" +#include "zoh.h" + +#include // memcpy + +using namespace ZOH; + + +bool ZOH::isone(const char *block) +{ + char code = block[0] & 0x1F; + + return (code == 0x03 || code == 0x07 || code == 0x0b || code == 0x0f); +} + +void ZOH::compress(const Tile &t, char *block) +{ + char oneblock[ZOH::BLOCKSIZE], twoblock[ZOH::BLOCKSIZE]; + + float mseone = ZOH::compressone(t, oneblock); + float msetwo = ZOH::compresstwo(t, twoblock); + + if (mseone <= msetwo) + memcpy(block, oneblock, ZOH::BLOCKSIZE); + else + memcpy(block, twoblock, ZOH::BLOCKSIZE); +} + +void ZOH::decompress(const char *block, Tile &t) +{ + if (ZOH::isone(block)) + ZOH::decompressone(block, t); + else + ZOH::decompresstwo(block, t); +} + +/* +void ZOH::compress(string inf, string zohf) +{ + Array2D pixels; + int w, h; + char block[ZOH::BLOCKSIZE]; + + Exr::readRgba(inf, pixels, w, h); + FILE *zohfile = fopen(zohf.c_str(), "wb"); + if (zohfile == NULL) throw "Unable to open .zoh file for write"; + + // stuff for progress bar O.o + int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W); + int tilecnt = 0; + int ndots = 25; + int dotcnt = 0; + printf("Progress ["); + for (int i=0; i (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; } + } + } + + printf("]\n"); // advance to next line finally + + if (fclose(zohfile)) throw "Close failed on .zoh file"; +} + +static int str2int(std::string s) +{ + int thing; + std::stringstream str (stringstream::in | stringstream::out); + str << s; + str >> thing; + return thing; +} + +// zoh file name is ...-w-h.zoh, extract width and height +static void extract(string zohf, int &w, int &h) +{ + size_t n = zohf.rfind('.', zohf.length()-1); + size_t n1 = zohf.rfind('-', n-1); + size_t n2 = zohf.rfind('-', n1-1); + string width = zohf.substr(n2+1, n1-n2-1); + w = str2int(width); + string height = zohf.substr(n1+1, n-n1-1); + h = str2int(height); +} + +static int mode_to_prec[] = { + 10,7,11,10, + 10,7,11,11, + 10,7,11,12, + 10,7,9,16, + 10,7,8,-1, + 10,7,8,-1, + 10,7,8,-1, + 10,7,6,-1, +}; + +static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions; + +static void stats(char block[ZOH::BLOCKSIZE]) +{ + char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++; + int prec = mode_to_prec[mode]; + nvAssert (prec != -1); + if (!ZOH::isone(block)) + { + tworegions++; + prechisttwo[prec]++; + int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3); + shapeindexhist[shapeindex]++; + } + else + { + oneregion++; + prechistone[prec]++; + } +} + +static void printstats() +{ + printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]); + printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]); + printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]); + printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]); + printf("\nOne region %5.2f%% Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions)); + printf("\n"); +} + +void ZOH::decompress(string zohf, string outf) +{ + Array2D pixels; + int w, h; + char block[ZOH::BLOCKSIZE]; + + extract(zohf, w, h); + FILE *zohfile = fopen(zohf.c_str(), "rb"); + if (zohfile == NULL) throw "Unable to open .zoh file for read"; + 
pixels.resizeErase(h, w); + + // convert to tiles and decompress each tile + for (int y=0; y + +using namespace nv; +using namespace ZOH; + +static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64}; // divided by 64 +static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; // divided by 64 + +/*static*/ Format Utils::FORMAT; + +int Utils::lerp(int a, int b, int i, int denom) +{ + nvDebugCheck (denom == 3 || denom == 7 || denom == 15); + nvDebugCheck (i >= 0 && i <= denom); + + int round = 32, shift = 6; + const int *weights; + + switch(denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15: weights = denom15_weights_64; break; + case 7: weights = denom7_weights_64; break; + default: nvDebugCheck(0); + } + + return (a*weights[denom-i] +b*weights[i] + round) >> shift; +} + +Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom) +{ + nvDebugCheck (denom == 3 || denom == 7 || denom == 15); + nvDebugCheck (i >= 0 && i <= denom); + + int shift = 6; + const int *weights; + + switch(denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15: weights = denom15_weights_64; break; + case 7: weights = denom7_weights_64; break; + default: nvUnreachable(); + } + + // no need to round these as this is an exact division + return (a*float(weights[denom-i]) +b*float(weights[i])) / float(1 << shift); +} + + +/* + For unsigned f16, clamp the input to [0,F16MAX]. Thus u15. + For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16. + + The conversions proceed as follows: + + unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX. + signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value + unsigned int: get bits. return as a positive value. + signed int. get bits. return as a value in -32768..32767. + + The inverse conversions are just the inverse of the above. +*/ + +// clamp the 3 channels of the input vector to the allowable range based on FORMAT +// note that each channel is a float storing the allowable range as a bit pattern converted to float +// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX] + +void Utils::clamp(Vector3 &v) +{ + for (int i=0; i<3; ++i) + { + switch(Utils::FORMAT) + { + case UNSIGNED_F16: + if (v.component[i] < 0.0) v.component[i] = 0; + else if (v.component[i] > F16MAX) v.component[i] = F16MAX; + break; + + case SIGNED_F16: + if (v.component[i] < -F16MAX) v.component[i] = -F16MAX; + else if (v.component[i] > F16MAX) v.component[i] = F16MAX; + break; + + default: + nvUnreachable(); + } + } +} + +// convert a u16 value to s17 (represented as an int) based on the format expected +int Utils::ushort_to_format(unsigned short input) +{ + int out, s; + + // clamp to the valid range we are expecting + switch (Utils::FORMAT) + { + case UNSIGNED_F16: + if (input & F16S_MASK) out = 0; + else if (input > F16MAX) out = F16MAX; + else out = input; + break; + + case SIGNED_F16: + s = input & F16S_MASK; + input &= F16EM_MASK; + if (input > F16MAX) out = F16MAX; + else out = input; + out = s ? 
-out : out; + break; + } + return out; +} + +// convert a s17 value to u16 based on the format expected +unsigned short Utils::format_to_ushort(int input) +{ + unsigned short out; + + // clamp to the valid range we are expecting + switch (Utils::FORMAT) + { + case UNSIGNED_F16: + nvDebugCheck (input >= 0 && input <= F16MAX); + out = input; + break; + + case SIGNED_F16: + nvDebugCheck (input >= -F16MAX && input <= F16MAX); + // convert to sign-magnitude + int s; + if (input < 0) { s = F16S_MASK; input = -input; } + else { s = 0; } + out = s | input; + break; + } + return out; +} + +// quantize the input range into equal-sized bins +int Utils::quantize(float value, int prec) +{ + int q, ivalue, s; + + nvDebugCheck (prec > 1); // didn't bother to make it work for 1 + + value = (float)floor(value + 0.5); + + int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0; // bias precisions 11..16 to get a more accurate quantization + + switch (Utils::FORMAT) + { + case UNSIGNED_F16: + nvDebugCheck (value >= 0 && value <= F16MAX); + ivalue = (int)value; + q = ((ivalue << prec) + bias) / (F16MAX+1); + nvDebugCheck (q >= 0 && q < (1 << prec)); + break; + + case SIGNED_F16: + nvDebugCheck (value >= -F16MAX && value <= F16MAX); + // convert to sign-magnitude + ivalue = (int)value; + if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0; + + q = ((ivalue << (prec-1)) + bias) / (F16MAX+1); + if (s) + q = -q; + nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1))); + break; + } + + return q; +} + +int Utils::finish_unquantize(int q, int prec) +{ + if (Utils::FORMAT == UNSIGNED_F16) + return (q * 31) >> 6; // scale the magnitude by 31/64 + else if (Utils::FORMAT == SIGNED_F16) + return (q < 0) ? -(((-q) * 31) >> 5) : (q * 31) >> 5; // scale the magnitude by 31/32 + else + return q; +} + +// unquantize each bin to midpoint of original bin range, except +// for the end bins which we push to an endpoint of the bin range. +// we do this to ensure we can represent all possible original values. +// the asymmetric end bins do not affect PSNR for the test images. +// +// code this function assuming an arbitrary bit pattern as the encoded block +int Utils::unquantize(int q, int prec) +{ + int unq, s; + + nvDebugCheck (prec > 1); // not implemented for prec 1 + + switch (Utils::FORMAT) + { + // modify this case to move the multiplication by 31 after interpolation. + // Need to use finish_unquantize. + + // since we have 16 bits available, let's unquantize this to 16 bits unsigned + // thus the scale factor is [0-7c00)/[0-10000) = 31/64 + case UNSIGNED_F16: + if (prec >= 15) + unq = q; + else if (q == 0) + unq = 0; + else if (q == ((1<<prec)-1)) + unq = U16MAX; + else + unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec; + break; + + // here, let's stick with S16 (no apparent quality benefit from going to S17) + // range is (-7c00..7c00)/(-8000..8000) = 31/32 + case SIGNED_F16: + // don't remove this test even though it appears equivalent to the code below + // as it isn't -- the code below can overflow for prec = 16 + if (prec >= 16) + unq = q; + else + { + if (q < 0) { s = 1; q = -q; } else s = 0; + + if (q == 0) + unq = 0; + else if (q >= ((1<<(prec-1))-1)) + unq = s ? -S16MAX : S16MAX; + else + { + unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1); + if (s) + unq = -unq; + } + } + break; + } + return unq; +} + + + +// pick a norm! 
+#define NORM_EUCLIDEAN 1 + +float Utils::norm(const Vector3 &a, const Vector3 &b) +{ +#ifdef NORM_EUCLIDEAN + return lengthSquared(a - b); +#endif +#ifdef NORM_ABS + Vector3 err = a - b; + return fabs(err.x) + fabs(err.y) + fabs(err.z); +#endif +} + +// parse [{:}]{,} +// the pointer starts here ^ +// name is 1 or 2 chars and matches field names. start and end are decimal numbers +void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len) +{ + if (ptr <= 0) return; + --ptr; + if (encoding[ptr] == ',') --ptr; + nvDebugCheck (encoding[ptr] == ']'); + --ptr; + endbit = 0; + int scale = 1; + while (encoding[ptr] != ':' && encoding[ptr] != '[') + { + nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9'); + endbit += (encoding[ptr--] - '0') * scale; + scale *= 10; + } + int startbit = 0; scale = 1; + if (encoding[ptr] == '[') + startbit = endbit; + else + { + ptr--; + while (encoding[ptr] != '[') + { + nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9'); + startbit += (encoding[ptr--] - '0') * scale; + scale *= 10; + } + } + len = startbit - endbit + 1; // startbit>=endbit note + --ptr; + if (encoding[ptr] == 'm') field = FIELD_M; + else if (encoding[ptr] == 'd') field = FIELD_D; + else { + // it's wxyz + nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z'); + int foo = encoding[ptr--] - 'w'; + // now it is r g or b + if (encoding[ptr] == 'r') foo += 10; + else if (encoding[ptr] == 'g') foo += 20; + else if (encoding[ptr] == 'b') foo += 30; + else nvDebugCheck(0); + field = (Field) foo; + } +} + + diff --git a/3rdparty/nvtt/bc6h/zoh_utils.h b/3rdparty/nvtt/bc6h/zoh_utils.h new file mode 100644 index 00000000..b047a835 --- /dev/null +++ b/3rdparty/nvtt/bc6h/zoh_utils.h @@ -0,0 +1,73 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// utility class holding common routines +#pragma once +#ifndef _ZOH_UTILS_H +#define _ZOH_UTILS_H + +#include "nvmath/Vector.h" + +namespace ZOH { + +inline int SIGN_EXTEND(int x, int nb) { return ((((signed(x))&(1<<((nb)-1)))?((~0)<<(nb)):0)|(signed(x))); } + +enum Field { + FIELD_M = 1, // mode + FIELD_D = 2, // distribution/shape + FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3, // red channel endpoints or deltas + FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3, // green channel endpoints or deltas + FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3, // blue channel endpoints or deltas +}; + +// some constants +static const int F16S_MASK = 0x8000; // f16 sign mask +static const int F16EM_MASK = 0x7fff; // f16 exp & mantissa mask +static const int U16MAX = 0xffff; +static const int S16MIN = -0x8000; +static const int S16MAX = 0x7fff; +static const int INT16_MASK = 0xffff; +static const int F16MAX = 0x7bff; // MAXFLT bit pattern for halfs + +enum Format { UNSIGNED_F16, SIGNED_F16 }; + +class Utils +{ +public: + static Format FORMAT; // this is a global -- we're either handling unsigned or unsigned half values + + // error metrics + static float norm(const nv::Vector3 &a, const nv::Vector3 &b); + static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b); + + // conversion & clamp + static int ushort_to_format(unsigned short input); + static unsigned short format_to_ushort(int input); + + // clamp to format + static void clamp(nv::Vector3 &v); + + // quantization and unquantization + static int finish_unquantize(int q, int prec); + static int unquantize(int q, int prec); + static int quantize(float value, int prec); + + static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len); + + // lerping + static int lerp(int a, int b, int i, int denom); + static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom); +}; + +} + +#endif // _ZOH_UTILS_H diff --git a/3rdparty/nvtt/bc6h/zohone.cpp b/3rdparty/nvtt/bc6h/zohone.cpp new file mode 100644 index 00000000..c32c5010 --- /dev/null +++ b/3rdparty/nvtt/bc6h/zohone.cpp @@ -0,0 +1,799 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// one region zoh compress/decompress code +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +#include "bits.h" +#include "tile.h" +#include "zoh.h" +#include "zoh_utils.h" + +#include "nvmath/Vector.inl" +#include "nvmath/Fitting.h" + +#include // strlen +#include // FLT_MAX + +using namespace nv; +using namespace ZOH; + +#define NINDICES 16 +#define INDEXBITS 4 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) + +#define NSHAPES 1 + +static const int shapes[NSHAPES] = +{ + 0x0000 +}; // only 1 shape + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NDELTA 2 + +struct Chanpat +{ + int prec[NDELTA]; // precision pattern for one channel +}; + +struct Pattern +{ + Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define MAXMODEBITS 5 +#define MAXMODES (1<> 2) & 3 and x = index & 3 +static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex) +{ + int index_positions[NREGIONS_ONE]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS_ONE; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvDebugCheck(REGION(x,y,shapeindex) == region); // double check the table + if (indices[y][x] & HIGH_INDEXBIT) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=0; i> endbit, len); break; + case FIELD_RW: out.write(rw >> endbit, len); break; + case FIELD_RX: out.write(rx >> endbit, len); break; + case FIELD_GW: out.write(gw >> endbit, len); break; + case FIELD_GX: out.write(gx >> endbit, len); break; + case FIELD_BW: out.write(bw >> endbit, len); break; + case FIELD_BX: out.write(bx >> endbit, len); break; + + case FIELD_D: + case FIELD_RY: + case FIELD_RZ: + case FIELD_GY: + case FIELD_GZ: + case FIELD_BY: + case FIELD_BZ: + default: nvUnreachable(); + } + } +} + +static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p) +{ + // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode + int mode = in.read(2); + if (mode != 0x00 && mode != 0x01) + mode = (in.read(3) << 2) | mode; + + int pat_index = mode_to_pat[mode]; + + nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS); + nvDebugCheck (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + int d; + int rw, rx; + int gw, gx; + int bw, bx; + + d = 0; + rw = rx = 0; + gw = gx = 0; + bw = bx = 0; + + int ptr = int(strlen(p.encoding)); + + while (ptr) + { + Field field; + int endbit, len; + + // !!!UNDONE: get rid of string parsing!!! 
+ Utils::parse(p.encoding, ptr, field, endbit, len); + + switch(field) + { + case FIELD_M: break; // already processed so ignore + case FIELD_RW: rw |= in.read(len) << endbit; break; + case FIELD_RX: rx |= in.read(len) << endbit; break; + case FIELD_GW: gw |= in.read(len) << endbit; break; + case FIELD_GX: gx |= in.read(len) << endbit; break; + case FIELD_BW: bw |= in.read(len) << endbit; break; + case FIELD_BX: bx |= in.read(len) << endbit; break; + + case FIELD_D: + case FIELD_RY: + case FIELD_RZ: + case FIELD_GY: + case FIELD_GZ: + case FIELD_BY: + case FIELD_BZ: + default: nvUnreachable(); + } + } + + nvDebugCheck (in.getptr() == 128 - 63); + + endpts[0].A[0] = rw; endpts[0].B[0] = rx; + endpts[0].A[1] = gw; endpts[0].B[1] = gx; + endpts[0].A[2] = bw; endpts[0].B[2] = bx; +} + +// compress index 0 +static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out) +{ + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + out.write(indices[y][x], INDEXBITS - ((pos == 0) ? 1 : 0)); + } +} + +static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block) +{ + Bits out(block, ZOH::BITSIZE); + + write_header(endpts, p, out); + + write_indices(indices, shapeindex, out); + + nvDebugCheck(out.getptr() == ZOH::BITSIZE); +} + +static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES]) +{ + // scale endpoints + int a, b; // really need a IntVector3... + + a = Utils::unquantize(endpts.A[0], prec); + b = Utils::unquantize(endpts.B[0], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[1], prec); + b = Utils::unquantize(endpts.B[1], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[2], prec); + b = Utils::unquantize(endpts.B[2], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); +} + +// position 0 was compressed +static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W]) +{ + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + indices[y][x]= in.read(INDEXBITS - ((pos == 0) ? 
1 : 0)); + } +} + +void ZOH::decompressone(const char *block, Tile &t) +{ + Bits in(block, ZOH::BITSIZE); + + Pattern p; + IntEndpts endpts[NREGIONS_ONE]; + ComprEndpts compr_endpts[NREGIONS_ONE]; + + read_header(in, compr_endpts, p); + int shapeindex = 0; // only one shape + + decompress_endpts(compr_endpts, endpts, p); + + Vector3 palette[NREGIONS_ONE][NINDICES]; + for (int r = 0; r < NREGIONS_ONE; ++r) + generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]); + + // read indices + int indices[Tile::TILE_H][Tile::TILE_W]; + + read_indices(in, shapeindex, indices); + + nvDebugCheck(in.getptr() == ZOH::BITSIZE); + + // lookup + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]]; +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec) +{ + Vector3 palette[NINDICES]; + float toterr = 0; + Vector3 err; + + generate_palette_quantized(endpts, prec, palette); + + for (int i = 0; i < np; ++i) + { + float err, besterr; + + besterr = Utils::norm(colors[i], palette[0]) * importance[i]; + + for (int j = 1; j < NINDICES && besterr > 0; ++j) + { + err = Utils::norm(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE]) +{ + // build list of possibles + Vector3 palette[NREGIONS_ONE][NINDICES]; + + for (int region = 0; region < NREGIONS_ONE; ++region) + { + generate_palette_quantized(endpts[region], prec, &palette[region][0]); + toterr[region] = 0; + } + + Vector3 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + + besterr = Utils::norm(tile.data[y][x], palette[region][0]); + indices[y][x] = 0; + + for (int i = 1; i < NINDICES && besterr > 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts, + float old_err, int do_b) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndpts temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + + // copy real endpoints so we can perturb them + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, 
importance, np, temp_endpts, prec); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + } + } + // if this was an improvement, move the endpoint and continue search from there + if (improved) + { + if (do_b == 0) + new_endpts.A[ch] += beststep; + else + new_endpts.B[ch] += beststep; + } + } + return min_err; +} + +static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts) +{ + float opt_err = orig_err; + for (int ch = 0; ch < NCHANNELS; ++ch) + { + opt_endpts.A[ch] = orig_endpts.A[ch]; + opt_endpts.B[ch] = orig_endpts.B[ch]; + } + /* + err0 = perturb(rgb0, delta0) + err1 = perturb(rgb1, delta1) + if (err0 < err1) + if (err0 >= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndpts new_a, new_b; + IntEndpts new_endpt; + int do_b; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + opt_endpts.A[ch] = new_a.A[ch]; + opt_err = err0; + do_b = 1; // do B next + } + else + { + if (err1 >= opt_err) + continue; + opt_endpts.B[ch] = new_b.B[ch]; + opt_err = err1; + do_b = 0; // do A next + } + + // now alternate endpoints and keep trying until there is no improvement + for (;;) + { + float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b); + if (err >= opt_err) + break; + if (do_b == 0) + opt_endpts.A[ch] = new_endpt.A[ch]; + else + opt_endpts.B[ch] = new_endpt.B[ch]; + opt_err = err; + do_b = 1 - do_b; // now move the other endpoint + } + } +} + +static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE], + const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE]) +{ + Vector3 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; + float err = 0; + + for (int region=0; region 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + Utils::clamp(endpts[region].A); + Utils::clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +float ZOH::compressone(const Tile &t, char *block) +{ + int 
shapeindex_best = 0; + FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE]; + float msebest = FLT_MAX; + + /* + collect the mse values that are within 5% of the best values + optimize each one and choose the best + */ + // hack for now -- just use the best value WORK + for (int i=0; i0.0; ++i) + { + float mse = roughone(t, i, tempendpts); + if (mse < msebest) + { + msebest = mse; + shapeindex_best = i; + memcpy(endptsbest, tempendpts, sizeof(endptsbest)); + } + + } + return refineone(t, shapeindex_best, endptsbest, block); +} diff --git a/3rdparty/nvtt/bc6h/zohtwo.cpp b/3rdparty/nvtt/bc6h/zohtwo.cpp new file mode 100644 index 00000000..7172793b --- /dev/null +++ b/3rdparty/nvtt/bc6h/zohtwo.cpp @@ -0,0 +1,883 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// two regions zoh compress/decompress code +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +/* optimization algorithm + + get initial float endpoints + convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates. + note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible. + for each EC candidate in order from max precision to smaller precision + convert endpoints using the appropriate precision. + optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well. + (thus the endpoints and indices are in final form.) + transform and get bit delta. + if the bit delta fits, exit + if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever. + add a state variable to nvDebugCheck we only do this once. + convert to bit stream. + return the error. + + Global optimization + order all tiles based on their errors + do something special for high-error tiles + the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image... + + display an image that shows partitioning and precision selected for each tile +*/ + +#include "bits.h" +#include "tile.h" +#include "zoh.h" +#include "zoh_utils.h" + +#include "nvmath/Fitting.h" +#include "nvmath/Vector.inl" + +#include // strlen +#include // FLT_MAX + +using namespace nv; +using namespace ZOH; + +#define NINDICES 8 +#define INDEXBITS 3 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? 
+ +#include "shapes_two.h" +// use only the first 32 available shapes +#undef NSHAPES +#undef SHAPEBITS +#define NSHAPES 32 +#define SHAPEBITS 5 + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NDELTA 4 + +struct Chanpat +{ + int prec[NDELTA]; // precision pattern for one channel +}; + +struct Pattern +{ + Chanpat chan[NCHANNELS]; // allow different bit patterns per channel -- but we still want constant precision per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define MAXMODEBITS 5 +#define MAXMODES (1<> endbit, len); break; + case FIELD_D: out.write( d >> endbit, len); break; + case FIELD_RW: out.write(rw >> endbit, len); break; + case FIELD_RX: out.write(rx >> endbit, len); break; + case FIELD_RY: out.write(ry >> endbit, len); break; + case FIELD_RZ: out.write(rz >> endbit, len); break; + case FIELD_GW: out.write(gw >> endbit, len); break; + case FIELD_GX: out.write(gx >> endbit, len); break; + case FIELD_GY: out.write(gy >> endbit, len); break; + case FIELD_GZ: out.write(gz >> endbit, len); break; + case FIELD_BW: out.write(bw >> endbit, len); break; + case FIELD_BX: out.write(bx >> endbit, len); break; + case FIELD_BY: out.write(by >> endbit, len); break; + case FIELD_BZ: out.write(bz >> endbit, len); break; + default: nvUnreachable(); + } + } +} + +static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p) +{ + // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode + int mode = in.read(2); + if (mode != 0x00 && mode != 0x01) + mode = (in.read(3) << 2) | mode; + + int pat_index = mode_to_pat[mode]; + + if (pat_index == -2) + return false; // reserved mode found + + nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS); + nvDebugCheck (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + int d; + int rw, rx, ry, rz; + int gw, gx, gy, gz; + int bw, bx, by, bz; + + d = 0; + rw = rx = ry = rz = 0; + gw = gx = gy = gz = 0; + bw = bx = by = bz = 0; + + int ptr = int(strlen(p.encoding)); + + while (ptr) + { + Field field; + int endbit, len; + + // !!!UNDONE: get rid of string parsing!!! 
+ Utils::parse(p.encoding, ptr, field, endbit, len); + + switch(field) + { + case FIELD_M: break; // already processed so ignore + case FIELD_D: d |= in.read(len) << endbit; break; + case FIELD_RW: rw |= in.read(len) << endbit; break; + case FIELD_RX: rx |= in.read(len) << endbit; break; + case FIELD_RY: ry |= in.read(len) << endbit; break; + case FIELD_RZ: rz |= in.read(len) << endbit; break; + case FIELD_GW: gw |= in.read(len) << endbit; break; + case FIELD_GX: gx |= in.read(len) << endbit; break; + case FIELD_GY: gy |= in.read(len) << endbit; break; + case FIELD_GZ: gz |= in.read(len) << endbit; break; + case FIELD_BW: bw |= in.read(len) << endbit; break; + case FIELD_BX: bx |= in.read(len) << endbit; break; + case FIELD_BY: by |= in.read(len) << endbit; break; + case FIELD_BZ: bz |= in.read(len) << endbit; break; + default: nvUnreachable(); + } + } + + nvDebugCheck (in.getptr() == 128 - 46); + + shapeindex = d; + endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz; + endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz; + endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz; + + return true; +} + +static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out) +{ + int positions[NREGIONS_TWO]; + + for (int r = 0; r < NREGIONS_TWO; ++r) + positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r); + + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + bool match = false; + + for (int r = 0; r < NREGIONS_TWO; ++r) + if (positions[r] == pos) { match = true; break; } + + out.write(indices[y][x], INDEXBITS - (match ? 1 : 0)); + } +} + +static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block) +{ + Bits out(block, ZOH::BITSIZE); + + write_header(compr_endpts, shapeindex, p, out); + + write_indices(indices, shapeindex, out); + + nvDebugCheck(out.getptr() == ZOH::BITSIZE); +} + +static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES]) +{ + // scale endpoints + int a, b; // really need a IntVector3... + + a = Utils::unquantize(endpts.A[0], prec); + b = Utils::unquantize(endpts.B[0], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[1], prec); + b = Utils::unquantize(endpts.B[1], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[2], prec); + b = Utils::unquantize(endpts.B[2], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); +} + +static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W]) +{ + int positions[NREGIONS_TWO]; + + for (int r = 0; r < NREGIONS_TWO; ++r) + positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r); + + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + bool match = false; + + for (int r = 0; r < NREGIONS_TWO; ++r) + if (positions[r] == pos) { match = true; break; } + + indices[y][x]= in.read(INDEXBITS - (match ? 
1 : 0)); + } +} + +void ZOH::decompresstwo(const char *block, Tile &t) +{ + Bits in(block, ZOH::BITSIZE); + + Pattern p; + IntEndpts endpts[NREGIONS_TWO]; + ComprEndpts compr_endpts[NREGIONS_TWO]; + int shapeindex; + + if (!read_header(in, compr_endpts, shapeindex, p)) + { + // reserved mode, return all zeroes + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = Vector3(0.0f); + + return; + } + + decompress_endpts(compr_endpts, endpts, p); + + Vector3 palette[NREGIONS_TWO][NINDICES]; + for (int r = 0; r < NREGIONS_TWO; ++r) + generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]); + + int indices[Tile::TILE_H][Tile::TILE_W]; + + read_indices(in, shapeindex, indices); + + nvDebugCheck(in.getptr() == ZOH::BITSIZE); + + // lookup + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]]; +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec) +{ + Vector3 palette[NINDICES]; + float toterr = 0; + Vector3 err; + + generate_palette_quantized(endpts, prec, palette); + + for (int i = 0; i < np; ++i) + { + float err, besterr; + + besterr = Utils::norm(colors[i], palette[0]) * importance[i]; + + for (int j = 1; j < NINDICES && besterr > 0; ++j) + { + err = Utils::norm(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO]) +{ + // build list of possibles + Vector3 palette[NREGIONS_TWO][NINDICES]; + + for (int region = 0; region < NREGIONS_TWO; ++region) + { + generate_palette_quantized(endpts[region], prec, &palette[region][0]); + toterr[region] = 0; + } + + Vector3 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + + besterr = Utils::norm(tile.data[y][x], palette[region][0]); + indices[y][x] = 0; + + for (int i = 1; i < NINDICES && besterr > 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts, + float old_err, int do_b) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndpts temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + + // copy real endpoints so we can perturb them + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = 
new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, prec); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + } + } + // if this was an improvement, move the endpoint and continue search from there + if (improved) + { + if (do_b == 0) + new_endpts.A[ch] += beststep; + else + new_endpts.B[ch] += beststep; + } + } + return min_err; +} + +static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts) +{ + float opt_err = orig_err; + for (int ch = 0; ch < NCHANNELS; ++ch) + { + opt_endpts.A[ch] = orig_endpts.A[ch]; + opt_endpts.B[ch] = orig_endpts.B[ch]; + } + /* + err0 = perturb(rgb0, delta0) + err1 = perturb(rgb1, delta1) + if (err0 < err1) + if (err0 >= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndpts new_a, new_b; + IntEndpts new_endpt; + int do_b; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + opt_endpts.A[ch] = new_a.A[ch]; + opt_err = err0; + do_b = 1; // do B next + } + else + { + if (err1 >= opt_err) + continue; + opt_endpts.B[ch] = new_b.B[ch]; + opt_err = err1; + do_b = 0; // do A next + } + + // now alternate endpoints and keep trying until there is no improvement + for (;;) + { + float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b); + if (err >= opt_err) + break; + if (do_b == 0) + opt_endpts.A[ch] = new_endpt.A[ch]; + else + opt_endpts.B[ch] = new_endpt.B[ch]; + opt_err = err; + do_b = 1 - do_b; // now move the other endpoint + } + } +} + +static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], + const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO]) +{ + Vector3 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; + float err = 0; + + for (int region=0; region 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + Utils::clamp(endpts[region].A); + 
Utils::clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +float ZOH::compresstwo(const Tile &t, char *block) +{ + int shapeindex_best = 0; + FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO]; + float msebest = FLT_MAX; + + /* + collect the mse values that are within 5% of the best values + optimize each one and choose the best + */ + // hack for now -- just use the best value WORK + for (int i=0; i0.0; ++i) + { + float mse = roughtwo(t, i, tempendpts); + if (mse < msebest) + { + msebest = mse; + shapeindex_best = i; + memcpy(endptsbest, tempendpts, sizeof(endptsbest)); + } + + } + return refinetwo(t, shapeindex_best, endptsbest, block); +} + diff --git a/3rdparty/nvtt/bc7/avpcl.cpp b/3rdparty/nvtt/bc7/avpcl.cpp new file mode 100644 index 00000000..f3ec8d05 --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl.cpp @@ -0,0 +1,264 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// the avpcl compressor and decompressor + +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include +#include + +using namespace nv; +using namespace AVPCL; + +// global flags +bool AVPCL::flag_premult = false; +bool AVPCL::flag_nonuniform = false; +bool AVPCL::flag_nonuniform_ati = false; + +// global mode +bool AVPCL::mode_rgb = false; // true if image had constant alpha = 255 + +void AVPCL::compress(const Tile &t, char *block) +{ + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + float mse_mode0 = AVPCL::compress_mode0(t, tempblock); if(mse_mode0 < msebest) { msebest = mse_mode0; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode1 = AVPCL::compress_mode1(t, tempblock); if(mse_mode1 < msebest) { msebest = mse_mode1; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode2 = AVPCL::compress_mode2(t, tempblock); if(mse_mode2 < msebest) { msebest = mse_mode2; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode3 = AVPCL::compress_mode3(t, tempblock); if(mse_mode3 < msebest) { msebest = mse_mode3; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode4 = AVPCL::compress_mode4(t, tempblock); if(mse_mode4 < msebest) { msebest = mse_mode4; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode5 = AVPCL::compress_mode5(t, tempblock); if(mse_mode5 < msebest) { msebest = mse_mode5; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode6 = AVPCL::compress_mode6(t, tempblock); if(mse_mode6 < msebest) { msebest = mse_mode6; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode7 = AVPCL::compress_mode7(t, tempblock); if(mse_mode7 < msebest) { msebest = mse_mode7; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + + /*if (errfile) + { + float errs[21]; + int nerrs = 8; + errs[0] = mse_mode0; + errs[1] = mse_mode1; + errs[2] = mse_mode2; + errs[3] = mse_mode3; + errs[4] = mse_mode4; + errs[5] = mse_mode5; + errs[6] = mse_mode6; + errs[7] = mse_mode7; + if (fwrite(errs, sizeof(float), nerrs, errfile) != nerrs) + throw "Write error on error file"; + }*/ +} + +/* 
+static int getbit(char *b, int start) +{ + if (start < 0 || start >= 128) return 0; // out of range + + int ix = start >> 3; + return (b[ix] & (1 << (start & 7))) != 0; +} + +static int getbits(char *b, int start, int len) +{ + int out = 0; + for (int i=0; i= 128) return; // out of range + + int ix = start >> 3; + + if (bit & 1) + b[ix] |= (1 << (start & 7)); + else + b[ix] &= ~(1 << (start & 7)); +} + +static void setbits(char *b, int start, int len, int bits) +{ + for (int i=0; i> i); +} +*/ + +void AVPCL::decompress(const char *cblock, Tile &t) +{ + char block[AVPCL::BLOCKSIZE]; + memcpy(block, cblock, AVPCL::BLOCKSIZE); + + switch(getmode(block)) + { + case 0: AVPCL::decompress_mode0(block, t); break; + case 1: AVPCL::decompress_mode1(block, t); break; + case 2: AVPCL::decompress_mode2(block, t); break; + case 3: AVPCL::decompress_mode3(block, t); break; + case 4: AVPCL::decompress_mode4(block, t); break; + case 5: AVPCL::decompress_mode5(block, t); break; + case 6: AVPCL::decompress_mode6(block, t); break; + case 7: AVPCL::decompress_mode7(block, t); break; + case 8: // return a black tile if you get a reserved mode + for (int y=0; y pixels; + int w, h; + char block[AVPCL::BLOCKSIZE]; + + Targa::read(inf, pixels, w, h); + FILE *avpclfile = fopen(avpclf.c_str(), "wb"); + if (avpclfile == NULL) throw "Unable to open .avpcl file for write"; + FILE *errfile = NULL; + if (errf != "") + { + errfile = fopen(errf.c_str(), "wb"); + if (errfile == NULL) throw "Unable to open error file for write"; + } + + // Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set) + if (AVPCL::flag_premult) + { + if (AVPCL::mode_rgb) + { + AVPCL::flag_premult = false; + cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl; + } + } + + // stuff for progress bar O.o + int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W); + int tilecnt = 0; + clock_t start, prev, cur; + + start = prev = clock(); + + // convert to tiles and compress each tile + for (int y=0; y> thing; + return thing; +} + +// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height +static void extract(string avpclf, int &w, int &h, bool &mode_rgb) +{ + size_t n = avpclf.rfind('.', avpclf.length()-1); + size_t n1 = avpclf.rfind('-', n-1); + size_t n2 = avpclf.rfind('-', n1-1); + size_t n3 = avpclf.rfind('-', n2-1); + // ...-wwww-hhhh-RGB[A].avpcl + // ^ ^ ^ ^ + // n3 n2 n1 n n3 pixels; + int w, h; + char block[AVPCL::BLOCKSIZE]; + + extract(avpclf, w, h, AVPCL::mode_rgb); + FILE *avpclfile = fopen(avpclf.c_str(), "rb"); + if (avpclfile == NULL) throw "Unable to open .avpcl file for read"; + pixels.resizeErase(h, w); + + // convert to tiles and decompress each tile + for (int y=0; y +#include + +#include "shapes_three.h" + +// use only the first 16 available shapes +#undef NSHAPES +#undef SHAPEBITS +#define NSHAPES 16 +#define SHAPEBITS 4 + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 8 +#define INDEXBITS 3 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? 
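The NINDICES/DENOM/BIAS constants above feed the palette construction used throughout this mode via Utils::lerp(A, B, i, bias, denom) (the unquantized variant later in this file passes bias = 0). A hedged sketch of the rounded per-channel blend that call presumably computes for the quantized palette; the helper name below is illustrative, not the shipped Utils::lerp.

// Hedged sketch: index i in [0, DENOM] blends endpoint A toward endpoint B,
// with +BIAS providing round-to-nearest in the integer (quantized) case.
static int lerp_channel_sketch(int a, int b, int i, int bias, int denom)
{
    return (a * (denom - i) + b * i + bias) / denom;
}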
+ +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 4,4,4,4,4,4, 4,4,4,4,4,4, 4,4,4,4,4,4, 0, 0x1, 1, "", // really 444.1 x 6 +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, +}; + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +static void transform_forward(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +// endpoints are 555,555; reduce to 444,444 and put the lsb bit majority in compr_bits +static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 16); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 16); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts) +{ + for (int j=0; j= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; 
y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB_2 temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? 
rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB_2 new_a, new_b; + IntEndptsRGB_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) + for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed block with original data // to try to preserve maximum endpoint precision +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + if (patterns[sp].transformed) + transform_forward(orig_endpts); + // apply a heuristic here -- we check if the endpoints fit before we try to optimize them. + // the assumption made is that if they don't fit now, they won't fit after optimizing. 
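The endpts_fit() test this heuristic refers to is not visible in this hunk. A hedged sketch of what such a fit test looks like for this mode's single untransformed pattern: every quantized channel value must need no more bits than the pattern allots. endpts_fit_sketch is an illustrative name; the shipped function may also handle the transformed/delta case.

// Hedged sketch of an endpoint-fit test against the Pattern's bit budget.
static bool endpts_fit_sketch(const IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)
{
    for (int j = 0; j < NCHANNELS_RGB; ++j)
        for (int region = 0; region < NREGIONS; ++region)
        {
            // nbits() is the helper defined near the top of this file
            if (nbits(endpts[region].A[j], false) > p.chan[j].nbitsizes[ABITINDEX(region)]) return false;
            if (nbits(endpts[region].B[j], false) > p.chan[j].nbitsizes[BBITINDEX(region)]) return false;
        }
    return true;
}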
+ if (endpts_fit(orig_endpts, patterns[sp])) + { + if (patterns[sp].transformed) + transform_inverse(orig_endpts); + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +// for this mode, we assume alpha = 255 constant and compress only the RGB portion. +// however, we do the error check against the actual alpha values supplied for the tile. +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode0(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
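The loop headers in compress_mode0() just below are lost in this hunk. A hedged reconstruction of their apparent shape, reusing the declarations that follow; the exact bounds are assumptions inferred from the surviving fragments.

// score every shape cheaply, then partially selection-sort so the NITEMS
// smallest rough errors come first, and refine only those candidates
for (int i = 0; i < NSHAPES; ++i)
{
    roughmse[i] = rough(t, i, &all[i].endpts[0]);
    index[i] = i;
}
for (int i = 0; i < NITEMS; ++i)
    for (int j = i + 1; j < NSHAPES; ++j)
        if (roughmse[i] > roughmse[j])
            swap(roughmse, index, i, j);

for (int i = 0; i < NITEMS && msebest > 0; ++i)
{
    float mse = refine(t, index[i], &all[index[i]].endpts[0], tempblock);
    if (mse < msebest) { msebest = mse; memcpy(block, tempblock, sizeof(tempblock)); }
}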
+ struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + diff --git a/3rdparty/nvtt/bc7/avpcl_mode1.cpp b/3rdparty/nvtt/bc7/avpcl_mode1.cpp new file mode 100644 index 00000000..c01a7150 --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode1.cpp @@ -0,0 +1,1047 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x10 (666x2).1 (666x2).1 64p 3bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_two.h" + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 2 // number of different lsb modes per region. since we have one .1 per region, that can have 2 values + +#define NINDICES 8 +#define INDEXBITS 3 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 6,6,6,6, 6,6,6,6, 6,6,6,6, 0, 0x2, 2, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 6,6,6, 6,6,6, 6,6,6, 6,6,6, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + + +static void transform_forward(IntEndptsRGB_1 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGB_1 ep[NREGIONS]) +{ + nvUnreachable(); +} + +// endpoints are 777,777; reduce to 666,666 and put the lsb bit majority in compr_bits +static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_1& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + onescnt += endpts.B[j] & 1; + compr_endpts.B[j] = endpts.B[j] >> 1; + nvAssert (compr_endpts.A[j] < 64); + nvAssert (compr_endpts.B[j] < 64); + } + compr_endpts.lsb = onescnt >= 3; +} + +static void uncompress_one(const IntEndptsRGB_1& compr_endpts, IntEndptsRGB& endpts) +{ + for (int j=0; j= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB_1 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + 
if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB_1 temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB_1 new_a, new_b; + IntEndptsRGB_1 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
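The initialization and step loop of perturb_one() above are mangled in this hunk (the "for (int i=0; i>= 1)" run). A hedged reconstruction of that span; the struct copy and the source of prec are assumptions consistent with how prec is used in the surviving body.

// copy real endpoints so we can perturb them
new_endpts = old_endpts;

// precision of the channel being perturbed (assumed to come from region_prec)
int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];

// logarithmic search: halve the perturbation step each pass; the surviving
// body tries +step and -step on the chosen endpoint and keeps improvements
for (int step = 1 << (prec - 1); step; step >>= 1)
{
    // (sign loop as in the surviving code above)
}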
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + float err = Utils::metric4(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode1(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
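Several statements of compress_one()/uncompress_one() earlier in this file are lost in this hunk. A hedged sketch of the round trip they implement for this mode: drop the low bit of each 7-bit channel, let the majority of the six dropped bits elect the single stored lsb, and restore it into both endpoints on decompression. The _sketch names are illustrative.

static void compress_one_sketch(const IntEndptsRGB &e, IntEndptsRGB_1 &c)
{
    int onescnt = 0;
    for (int j = 0; j < NCHANNELS_RGB; ++j)
    {
        onescnt += e.A[j] & 1;  c.A[j] = e.A[j] >> 1;   // 777 -> 666
        onescnt += e.B[j] & 1;  c.B[j] = e.B[j] >> 1;
    }
    c.lsb = onescnt >= 3;                               // majority vote of the dropped bits
}

static void uncompress_one_sketch(const IntEndptsRGB_1 &c, IntEndptsRGB &e)
{
    for (int j = 0; j < NCHANNELS_RGB; ++j)
    {
        e.A[j] = (c.A[j] << 1) | c.lsb;                 // shared low bit restored to A and B
        e.B[j] = (c.B[j] << 1) | c.lsb;
    }
}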
+ struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + diff --git a/3rdparty/nvtt/bc7/avpcl_mode2.cpp b/3rdparty/nvtt/bc7/avpcl_mode2.cpp new file mode 100644 index 00000000..67498dce --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode2.cpp @@ -0,0 +1,1004 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x100 555x6 64p 2bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_three.h" + +using namespace nv; +using namespace AVPCL; + +#define NINDICES 4 +#define INDEXBITS 2 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES 6 + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 5,5,5,5,5,5, 5,5,5,5,5,5, 5,5,5,5,5,5, 0, 0x4, 3, "", +}; + + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS_THREE]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! + +static PatternPrec pattern_precs[NPATTERNS] = +{ + 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +#define R_0 ep[0].A[i] +#define R_1 ep[0].B[i] +#define R_2 ep[1].A[i] +#define R_3 ep[1].B[i] + +static void transform_forward(IntEndptsRGB ep[NREGIONS]) +{ + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_THREE]) +{ + // build list of possibles + Vector4 palette[NREGIONS_THREE][NINDICES]; + + for (int region = 0; region < NREGIONS_THREE; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; 
i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB new_a, new_b; + IntEndptsRGB new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
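The window-scan loops of exhaustive() above lose their headers after adelta/bdelta are chosen. A hedged sketch of the general shape: clamp a window of +/-adelta and +/-bdelta around the current endpoints, score every pair with map_colors(), and remember the best pair and its indices. The shipped code also preserves endpoint ordering, per its leading comment, and commits indices and orig_err on success; those details are omitted here.

int alow  = max(0, opt_endpts.A[ch] - adelta);
int ahigh = min((1 << aprec) - 1, opt_endpts.A[ch] + adelta);
int blow  = max(0, opt_endpts.B[ch] - bdelta);
int bhigh = min((1 << bprec) - 1, opt_endpts.B[ch] + bdelta);

int besta = opt_endpts.A[ch], bestb = opt_endpts.B[ch];

for (int a = alow; a <= ahigh; ++a)
for (int b = blow; b <= bhigh; ++b)
{
    temp_endpts.A[ch] = a;
    temp_endpts.B[ch] = b;

    float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
    if (err < best_err)
    {
        best_err = err;
        besta = a; bestb = b;
        for (int i = 0; i < np; ++i)
            good_indices[i] = temp_indices[i];
    }
}

opt_endpts.A[ch] = besta;
opt_endpts.B[ch] = bestb;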
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_THREE], Vector4 palette[NREGIONS_THREE][NINDICES]) +{ + for (int region = 0; region < NREGIONS_THREE; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_THREE]) +{ + // build list of possibles + Vector4 palette[NREGIONS_THREE][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_THREE]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode2(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
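The Pattern tables across these mode files store the BC7 mode as a single set bit (0x1, 0x2, 0x4, 0x8 for modes 0-3) written into the low modebits of the block, which is what the getmode() dispatch in avpcl.cpp relies on. A hedged sketch of such a decoder, not the shipped implementation:

// BC7 encodes mode m as m zero bits followed by a one, so the mode is the
// index of the lowest set bit of the block's first byte; no set bit in the
// low eight positions maps to the reserved case (8) handled in decompress().
static int getmode_sketch(const char *block)
{
    unsigned char b = (unsigned char)block[0];
    for (int i = 0; i < 8; ++i)
        if (b & (1 << i))
            return i;
    return 8;   // reserved
}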
+ struct { + FltEndpts endpts[NREGIONS_THREE]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + diff --git a/3rdparty/nvtt/bc7/avpcl_mode3.cpp b/3rdparty/nvtt/bc7/avpcl_mode3.cpp new file mode 100644 index 00000000..2a070a8e --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode3.cpp @@ -0,0 +1,1059 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x1000 777.1x4 64p 2bi (30b) + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_two.h" + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 4 +#define INDEXBITS 2 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 +#define NREGIONS 2 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 7,7,7,7, 7,7,7,7, 7,7,7,7, 0, 0x8, 4, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 7,7,7, 7,7,7, 7,7,7, 7,7,7, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +static void transform_forward(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +// endpoints are 888,888; reduce to 777,777 and put the lsb bit majority in compr_bits +static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 128); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 128); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts) +{ + for (int j=0; j= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if 
(temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB_2 temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB_2 new_a, new_b; + IntEndptsRGB_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed block with original data // to try to preserve maximum endpoint precision +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + if (patterns[sp].transformed) + transform_forward(orig_endpts); + // apply a heuristic here -- we check if the endpoints fit before we try to optimize them. + // the assumption made is that if they don't fit now, they won't fit after optimizing. 
+ if (endpts_fit(orig_endpts, patterns[sp])) + { + if (patterns[sp].transformed) + transform_inverse(orig_endpts); + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode3(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
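+ // In outline: rough() produces a cheap PCA-based error estimate for every shape, the shapes are
+ // partially sorted by that estimate (the swap() passes below), and only the best NITEMS of them are
+ // run through the full refine() step; the candidate block with the lowest refined MSE is kept.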
+ struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + diff --git a/3rdparty/nvtt/bc7/avpcl_mode4.cpp b/3rdparty/nvtt/bc7/avpcl_mode4.cpp new file mode 100644 index 00000000..e2cb5a9d --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode4.cpp @@ -0,0 +1,1214 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x10000 2r 1i 555x2 6x2 2bi 3bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +using namespace nv; +using namespace AVPCL; + +// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits +// array 0 is always the RGB array and array 1 is always the A array +#define NINDEXARRAYS 2 +#define INDEXARRAY_RGB 0 +#define INDEXARRAY_A 1 +#define INDEXARRAY_2BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) +#define INDEXARRAY_3BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) + +#define NINDICES3 8 +#define INDEXBITS3 3 +#define HIGH_INDEXBIT3 (1<<(INDEXBITS3-1)) +#define DENOM3 (NINDICES3-1) +#define BIAS3 (DENOM3/2) + +#define NINDICES2 4 +#define INDEXBITS2 2 +#define HIGH_INDEXBIT2 (1<<(INDEXBITS2-1)) +#define DENOM2 (NINDICES2-1) +#define BIAS2 (DENOM2/2) + +#define NINDICES_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2) +#define INDEXBITS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2) +#define HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2) +#define DENOM_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2) +#define BIAS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2) + +#define NINDICES_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3) +#define INDEXBITS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3) +#define HIGH_INDEXBIT_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3) +#define DENOM_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3) +#define BIAS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? 
BIAS2 : BIAS3) + +#define NSHAPES 1 + +static int shapes[NSHAPES] = +{ + 0x0000, +}; + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define NREGIONS 1 // keep the region stuff in just in case... + +// encoded index compression location: region 0 is always at 0,0. + +#define NBITSIZES 2 // one endpoint pair + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int transform_mode; // x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise. + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define TRANSFORM_MODE_ALPHA 1 +#define TRANSFORM_MODE_RGB 2 + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha xfm mode mb encoding + 5,5, 5,5, 5,5, 6,6, 0x0, 0x10, 5, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 5,5,5,6, 5,5,5,6, +}; + + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +#define R_0 ep[0].A[i] +#define R_1 ep[0].B[i] + +static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS]) +{ + int i; + + if (transform_mode & TRANSFORM_MODE_RGB) + for (i=CHANNEL_R; i> 2) & 3 and x = index & 3 +static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + int index_positions[NREGIONS]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + + // swap RGB + if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x]; + } + + // swap A + if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x]; + } + } 
+} + +static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p) +{ + return true; +} + +static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out) +{ + // ignore shapeindex + out.write(p.mode, p.modebits); + out.write(rotatemode, ROTATEMODE_BITS); + out.write(indexmode, INDEXMODE_BITS); + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + shapeindex = 0; // we don't have any + + rotatemode = in.read(ROTATEMODE_BITS); + indexmode = in.read(INDEXMODE_BITS); + for (int i=0; i>2][i&3], INDEXBITS2 - (i==0?1:0)); // write i..[1:0] or i..[0] + + // then the 3 bit indices + nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0); + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0)); // write i..[2:0] or i..[1:0] +} + +static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + // the indices we shorten is always index 0 + + // do the 2 bit indices first + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0)); // read i..[1:0] or i..[0] + + // then the 3 bit indices + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0)); // read i..[1:0] or i..[0] +} + +static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block) +{ + Bits out(block, AVPCL::BITSIZE); + + write_header(endpts, shapeindex, p, rotatemode, indexmode, out); + + write_indices(indices, shapeindex, indexmode, out); + + nvAssert(out.getptr() == AVPCL::BITSIZE); +} + +static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3]) +{ + // scale endpoints for RGB + int a, b; + + a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); + b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]); + + // interpolate R + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); + b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]); + + // interpolate G + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); + b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]); + + // interpolate B + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); + b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]); + + // interpolate A + for (int i = 0; i < NINDICES_A(indexmode); ++i) + palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode))); + +} + +static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS]) +{ + for (int i=0; i 0; ++j) + { + err = Utils::metric1(a, palette_a[j], rotatemode); + 
+ if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + palette_alpha = palette_a[j]; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][i] = j; + } + } + toterr += besterr; + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + else + { + // do RGB index + besterr = FLT_MAX; + int bestindex; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + bestindex = j; + indices[INDEXARRAY_RGB][i] = j; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0); + toterr += besterr; + + // do A index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric1(a, palette_a[j], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + Vector3 palette_rgb[NREGIONS][NINDICES3]; // could be nindices2 + float palette_a[NREGIONS][NINDICES3]; // could be nindices2 + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]); + toterr[region] = 0; + } + + Vector3 rgb; + float a; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + float palette_alpha = 0, tile_alpha = 0; + + rgb.x = (tile.data[y][x]).x; + rgb.y = (tile.data[y][x]).y; + rgb.z = (tile.data[y][x]).z; + a = (tile.data[y][x]).w; + + if(AVPCL::flag_premult) + tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w; + + // compute the two indices separately + // if we're doing premultiplied alpha, we need to choose first the index that + // determines the alpha value, and then do the other index + + if (rotatemode == ROTATEMODE_RGBA_RGBA) + { + // do A index first as it has the alpha + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = Utils::metric1(a, palette_a[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + palette_alpha = palette_a[region][i]; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + } + } + toterr[region] += besterr; + } + else + { + // do RGB index first as it has the alpha + besterr = FLT_MAX; + int bestindex; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? 
Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + bestindex = i; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0); + toterr[region] += besterr; + + // do A index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + } + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, + float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int j=0; j 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + IntEndptsRGBA temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 
200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA new_a, new_b; + IntEndptsRGBA new_endpt; + int do_b; + int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL]; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int j=0; j= opt_err) + continue; + + for (int j=0; j= opt_err) + break; + + for (int j=0; j 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +// compute initial endpoints for the "RGB" portion and the "A" portion. +// Note these channels may have been rotated. +static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + + dp = alphas[i] - mean.w; + if (dp < mina) mina = dp; + if (dp > maxa) maxa = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + Vector4(minp*direction, mina); + endpts[region].B = mean + Vector4(maxp*direction, maxa); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } +} + +float AVPCL::compress_mode4(const Tile &t, char *block) +{ + FltEndpts endpts[NREGIONS]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + int shape = 0; + Tile t1; + + // try all rotations. refine tries the 2 different indexings. 
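+ // A note on the rotation step: rotate_tile() swaps the alpha channel with one of R, G or B
+ // (ROTATEMODE_RGBA_AGBR / _RABG / _RGAB) before encoding, so whichever channel profits most from the
+ // separate scalar index and its endpoint precision can take alpha's place; the rotate mode is stored
+ // in the block header, and the decoder can undo it by applying the same swap to the reconstructed
+ // tile (a channel swap is its own inverse).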
+ for (int r = 0; r < NROTATEMODES && msebest > 0; ++r) + { + rotate_tile(t, r, t1); + rough(t1, shape, endpts); + for (int i = 0; i < NINDEXMODES && msebest > 0; ++i) + { + float mse = refine(t1, shape, r, i, endpts, tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + } + return msebest; +} diff --git a/3rdparty/nvtt/bc7/avpcl_mode5.cpp b/3rdparty/nvtt/bc7/avpcl_mode5.cpp new file mode 100644 index 00000000..56b24d12 --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode5.cpp @@ -0,0 +1,1216 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x100000 2r 777x2 8x2 2bi 2bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +using namespace nv; +using namespace AVPCL; + +// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits +// array 0 is always the RGB array and array 1 is always the A array +#define NINDEXARRAYS 2 +#define INDEXARRAY_RGB 0 +#define INDEXARRAY_A 1 +#define INDEXARRAY_2BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) +#define INDEXARRAY_3BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) + +#define NINDICES3 4 +#define INDEXBITS3 2 +#define HIGH_INDEXBIT3 (1<<(INDEXBITS3-1)) +#define DENOM3 (NINDICES3-1) +#define BIAS3 (DENOM3/2) + +#define NINDICES2 4 +#define INDEXBITS2 2 +#define HIGH_INDEXBIT2 (1<<(INDEXBITS2-1)) +#define DENOM2 (NINDICES2-1) +#define BIAS2 (DENOM2/2) + +#define NINDICES_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2) +#define INDEXBITS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2) +#define HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2) +#define DENOM_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2) +#define BIAS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2) + +#define NINDICES_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3) +#define INDEXBITS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3) +#define HIGH_INDEXBIT_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3) +#define DENOM_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3) +#define BIAS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3) + +#define NSHAPES 1 + +static int shapes[NSHAPES] = +{ + 0x0000, +}; + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define NREGIONS 1 // keep the region stuff in just in case... 
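+// How REGION() assigns a texel to a region, as a worked example (the nonzero mask below is purely
+// illustrative; this mode only ever uses shape 0x0000): bit (15 - x - 4*y) of the 16-bit shape word
+// covers texel (x,y) of the 4x4 tile, row-major from the most significant bit, so
+//     shape 0x0000 -> every texel reads a 0 bit and lands in region 0,
+//     shape 0x00FF -> rows y = 0,1 read 0 bits (region 0) and rows y = 2,3 read 1 bits (region 1).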
+ +// encoded index compression location: region 0 is always at 0,0. + +#define NBITSIZES 2 // one endpoint pair + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int transform_mode; // x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise. + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define TRANSFORM_MODE_ALPHA 1 +#define TRANSFORM_MODE_RGB 2 + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha xfm mode mb encoding + 7,7, 7,7, 7,7, 8,8, 0x0, 0x20, 6, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 7,7,7,8, 7,7,7,8, +}; + + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +#define R_0 ep[0].A[i] +#define R_1 ep[0].B[i] + +static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS]) +{ + int i; + + if (transform_mode & TRANSFORM_MODE_RGB) + for (i=CHANNEL_R; i> 2) & 3 and x = index & 3 +static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + int index_positions[NREGIONS]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + + // swap RGB + if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x]; + } + + // swap A + if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x]; + } + } +} + +static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p) +{ + return true; +} + +static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, 
Bits &out) +{ + // ignore shapeindex + out.write(p.mode, p.modebits); + out.write(rotatemode, ROTATEMODE_BITS); +// out.write(indexmode, INDEXMODE_BITS); + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + shapeindex = 0; // we don't have any + + rotatemode = in.read(ROTATEMODE_BITS); + + indexmode = 0; // we don't have any + + for (int i=0; i>2][i&3], INDEXBITS2 - (i==0?1:0)); // write i..[1:0] or i..[0] + + // then the 3 bit indices + nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0); + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0)); // write i..[2:0] or i..[1:0] +} + +static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + // the indices we shorten is always index 0 + + // do the 2 bit indices first + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0)); // read i..[1:0] or i..[0] + + // then the 3 bit indices + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0)); // read i..[1:0] or i..[0] +} + +static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block) +{ + Bits out(block, AVPCL::BITSIZE); + + write_header(endpts, shapeindex, p, rotatemode, indexmode, out); + + write_indices(indices, shapeindex, indexmode, out); + + nvAssert(out.getptr() == AVPCL::BITSIZE); +} + +static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3]) +{ + // scale endpoints for RGB + int a, b; + + a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); + b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]); + + // interpolate R + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); + b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]); + + // interpolate G + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); + b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]); + + // interpolate B + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); + b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]); + + // interpolate A + for (int i = 0; i < NINDICES_A(indexmode); ++i) + palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode))); +} + +static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS]) +{ + for (int i=0; i 0; ++j) + { + err = Utils::metric1(a, palette_a[j], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + palette_alpha = palette_a[j]; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error 
norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][i] = j; + } + } + toterr += besterr; + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + else + { + // do RGB index + besterr = FLT_MAX; + int bestindex; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + bestindex = j; + indices[INDEXARRAY_RGB][i] = j; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0); + toterr += besterr; + + // do A index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + Vector3 palette_rgb[NREGIONS][NINDICES3]; // could be nindices2 + float palette_a[NREGIONS][NINDICES3]; // could be nindices2 + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]); + toterr[region] = 0; + } + + Vector3 rgb; + float a; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + float palette_alpha = 0, tile_alpha = 0; + + rgb.x = (tile.data[y][x]).x; + rgb.y = (tile.data[y][x]).y; + rgb.z = (tile.data[y][x]).z; + a = (tile.data[y][x]).w; + + if(AVPCL::flag_premult) + tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? 
(tile.data[y][x]).z : (tile.data[y][x]).w; + + // compute the two indices separately + // if we're doing premultiplied alpha, we need to choose first the index that + // determines the alpha value, and then do the other index + + if (rotatemode == ROTATEMODE_RGBA_RGBA) + { + // do A index first as it has the alpha + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = Utils::metric1(a, palette_a[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + palette_alpha = palette_a[region][i]; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + } + } + toterr[region] += besterr; + } + else + { + // do RGB index first as it has the alpha + besterr = FLT_MAX; + int bestindex; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + bestindex = i; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0); + toterr[region] += besterr; + + // do A index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? 
Utils::metric1(a, palette_a[region][i], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + } + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, + float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int j=0; j 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + IntEndptsRGBA temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? 
rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA new_a, new_b; + IntEndptsRGBA new_endpt; + int do_b; + int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL]; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int j=0; j= opt_err) + continue; + + for (int j=0; j= opt_err) + break; + + for (int j=0; j 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +// compute initial endpoints for the "RGB" portion and the "A" portion. +// Note these channels may have been rotated. +static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + + dp = alphas[i] - mean.w; + if (dp < mina) mina = dp; + if (dp > maxa) maxa = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + Vector4(minp*direction, mina); + endpts[region].B = mean + Vector4(maxp*direction, maxa); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } +} + +float AVPCL::compress_mode5(const Tile &t, char *block) +{ + FltEndpts endpts[NREGIONS]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + int shape = 0; + Tile t1; + + // try all rotations. refine tries the 2 different indexings. + for (int r = 0; r < NROTATEMODES && msebest > 0; ++r) + { + rotate_tile(t, r, t1); + rough(t1, shape, endpts); +// for (int i = 0; i < NINDEXMODES && msebest > 0; ++i) + for (int i = 0; i < 1 && msebest > 0; ++i) + { + float mse = refine(t1, shape, r, i, endpts, tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + } + return msebest; +} diff --git a/3rdparty/nvtt/bc7/avpcl_mode6.cpp b/3rdparty/nvtt/bc7/avpcl_mode6.cpp new file mode 100644 index 00000000..10d7bf08 --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode6.cpp @@ -0,0 +1,1055 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x1000000 7777.1x2 4bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 16 +#define INDEXBITS 4 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +#define NSHAPES 1 + +static int shapes[NSHAPES] = +{ + 0x0000, +}; + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define NREGIONS 1 + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha mode mb verilog + 7,7, 7,7, 7,7, 7,7, 0x40, 7, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 7,7,7,7, 7,7,7,7, +}; + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +/* +we're using this table to assign lsbs +abgr >=2 correct +0000 0 0 +0001 0 0 +0010 0 0 +0011 1 x1 +0100 0 0 +0101 1 x1 +0110 1 x1 +0111 1 1 +1000 0 0 +1001 1 x0 +1010 1 x0 +1011 1 1 +1100 1 x0 +1101 1 1 +1110 1 1 +1111 1 1 + +we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8. +I choose to assign the lsbs so that the rgb channels are as good as possible. 
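+
+worked example (values are illustrative): if the four endpoint lsbs are abgr = 0101, two of them are 1,
+so the shared lsb is set to 1; each channel is then stored as value >> 1 and rebuilt on decode as
+(stored << 1) | lsb, which reproduces the channels whose lsb matched the chosen bit exactly and is off
+by one in the others.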
+*/ + +// 8888 ->7777.1, use the "correct" column above to assign the lsb +static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 128); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 128); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts) +{ + for (int j=0; j> 2) & 3 and x = index & 3 +static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex) +{ + int index_positions[NREGIONS]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + if (indices[y][x] & HIGH_INDEXBIT) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + shapeindex = 0; // we don't have any + + for (int j=0; j>2][i&3], INDEXBITS-1); // write i..[2:0] + else + out.write(indices[i>>2][i&3], INDEXBITS); // write i..[3:0] + } + +} + +static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W]) +{ + // the index we shorten is always index 0 + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + { + if (i==0) + indices[i>>2][i&3] = in.read(INDEXBITS-1); // read i..[1:0] + else + indices[i>>2][i&3] = in.read(INDEXBITS); // read i..[2:0] + } +} + +static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block) +{ + Bits out(block, AVPCL::BITSIZE); + + write_header(endpts, shapeindex, p, out); + + write_indices(indices, shapeindex, out); + + nvAssert(out.getptr() == AVPCL::BITSIZE); +} + +static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec ®ion_prec, Vector4 palette[NINDICES]) +{ + IntEndptsRGBA endpts; + + uncompress_one(endpts_2, endpts); + + // scale endpoints + int a, b; // really need a IntVec4... 
+ + a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1); // +1 since we are in uncompressed space + b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM)); + + a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); + b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM)); + + a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); + b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM)); + + a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); + b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM)); +} + +void AVPCL::decompress_mode6(const char *block, Tile &t) +{ + Bits in(block, AVPCL::BITSIZE); + + Pattern p; + IntEndptsRGBA_2 endpts[NREGIONS]; + int shapeindex, pat_index; + + read_header(in, endpts, shapeindex, p, pat_index); + + Vector4 palette[NREGIONS][NINDICES]; + for (int r = 0; r < NREGIONS; ++r) + generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]); + + int indices[Tile::TILE_H][Tile::TILE_W]; + + read_indices(in, shapeindex, indices); + + nvAssert(in.getptr() == AVPCL::BITSIZE); + + // lookup + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]]; +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +{ + Vector4 palette[NINDICES]; + float toterr = 0; + Vector4 err; + + generate_palette_quantized(endpts, region_prec, palette); + + for (int i = 0; i < np; ++i) + { + float err, besterr = FLT_MAX; + + for (int j = 0; j < NINDICES && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric4(colors[i], palette[j]) : + Utils::metric4premult(colors[i], palette[j]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) : + Utils::metric4premult(tile.data[y][x], palette[region][i]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGBA_2 temp_endpts; + float best_err = 
orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA_2 new_a, new_b; + IntEndptsRGBA_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed 
block with original data // to try to preserve maximum endpoint precision + + simplify the above given that there is no transform now and that endpoints will always fit +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + + besterr = Utils::metric4(tile.data[y][x], palette[region][0]); + + for (int i = 1; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode6(const Tile &t, char *block) +{ + // number of rough cases to look at. 
reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=1; + + // pick the best NITEMS shapes and refine these. + struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + diff --git a/3rdparty/nvtt/bc7/avpcl_mode7.cpp b/3rdparty/nvtt/bc7/avpcl_mode7.cpp new file mode 100644 index 00000000..a2af403e --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_mode7.cpp @@ -0,0 +1,1094 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x10000000 5555.1x4 64p 2bi (30b) + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_two.h" + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 4 +#define INDEXBITS 2 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? 
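+// --- Editorial sketch; not part of the original NVTT sources. ---
+// With INDEXBITS==2 each pixel selects one of NINDICES==4 palette entries
+// generated by Utils::lerp(A, B, i, BIAS, DENOM).  Under the USE_ZOH_INTERP
+// path in avpcl_utils.cpp the denom==3 case is folded into the 4-bit weight
+// table (i is scaled by 5), which yields the 2-bit weights {0, 21, 43, 64}/64.
+// The hypothetical helper below (illustrative name only) shows the resulting
+// palette for a single integer channel.
+static inline void example_palette_2bit(int a, int b, int palette[NINDICES])
+{
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i] = Utils::lerp(a, b, i, BIAS, DENOM); // BIAS=1, DENOM=3
+}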
+ +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 +#define NREGIONS 2 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha xfm mode mb + 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5, 0, 0x80, 8, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5, +}; + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +static void transform_forward(IntEndptsRGBA_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGBA_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +/* +we're using this table to assign lsbs +abgr >=2 correct +0000 0 0 +0001 0 0 +0010 0 0 +0011 1 x1 +0100 0 0 +0101 1 x1 +0110 1 x1 +0111 1 1 +1000 0 0 +1001 1 x0 +1010 1 x0 +1011 1 1 +1100 1 x0 +1101 1 1 +1110 1 1 +1111 1 1 + +we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8. +I choose to assign the lsbs so that the rgb channels are as good as possible. +*/ + +// 6666 ->5555.1, use the "correct" column above to assign the lsb +static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 32); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 32); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts) +{ + for (int j=0; j> 2) & 3 and x = index & 3 +static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex) +{ + for (int region = 0; region < NREGIONS; ++region) + { + int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region); + + int x = POS_TO_X(position); + int y = POS_TO_Y(position); + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + if (indices[y][x] & HIGH_INDEXBIT) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric4(colors[i], palette[j]) : + Utils::metric4premult(colors[i], palette[j]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) : + Utils::metric4premult(tile.data[y][x], palette[region][i]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGBA_2 temp_endpts; + float best_err = 
orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA_2 new_a, new_b; + IntEndptsRGBA_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed 
block with original data // to try to preserve maximum endpoint precision +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + if (patterns[sp].transformed) + transform_forward(orig_endpts); + // apply a heuristic here -- we check if the endpoints fit before we try to optimize them. + // the assumption made is that if they don't fit now, they won't fit after optimizing. + if (endpts_fit(orig_endpts, patterns[sp])) + { + if (patterns[sp].transformed) + transform_inverse(orig_endpts); + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. 
+ break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode7(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. + struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + diff --git a/3rdparty/nvtt/bc7/avpcl_utils.cpp b/3rdparty/nvtt/bc7/avpcl_utils.cpp new file mode 100644 index 00000000..efe426b2 --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_utils.cpp @@ -0,0 +1,389 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// Utility and common routines + +#include "avpcl_utils.h" +#include "avpcl.h" +#include "nvmath/Vector.inl" +#include + +using namespace nv; +using namespace AVPCL; + +static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64}; // divided by 64 +static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; // divided by 64 + +int Utils::lerp(int a, int b, int i, int bias, int denom) +{ +#ifdef USE_ZOH_INTERP + nvAssert (denom == 3 || denom == 7 || denom == 15); + nvAssert (i >= 0 && i <= denom); + nvAssert (bias >= 0 && bias <= denom/2); + nvAssert (a >= 0 && b >= 0); + + int round = 0; +#ifdef USE_ZOH_INTERP_ROUNDED + round = 32; +#endif + + switch (denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6; + case 7: return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6; + default: nvUnreachable(); return 0; + } +#else + return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom)); // simple exact interpolation +#endif +} + +Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom) +{ +#ifdef USE_ZOH_INTERP + nvAssert (denom == 3 || denom == 7 || denom == 15); + nvAssert (i >= 0 && i <= denom); + nvAssert (bias >= 0 && bias <= denom/2); +// nvAssert (a >= 0 && b >= 0); + + // no need to bias these as this is an exact division + + switch (denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15:return (a*float(denom15_weights[denom-i]) + b*float(denom15_weights[i])) / 64.0f; + case 7: return (a*float(denom7_weights[denom-i]) + b*float(denom7_weights[i])) / 64.0f; + default: nvUnreachable(); return Vector4(0); + } +#else + return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom)); // simple exact interpolation +#endif +} + + +int Utils::unquantize(int q, int prec) +{ + int unq; + + nvAssert (prec > 3); // we only want to do one replicate + +#ifdef USE_ZOH_QUANT + if (prec >= 8) + unq = q; + else if (q == 0) + unq = 0; + else if (q == ((1<> prec; +#else + // avpcl unquantizer -- bit replicate + unq = (q << (8-prec)) | (q >> (2*prec-8)); +#endif + + return unq; +} + +// quantize to the best value -- i.e., minimize unquantize error +int Utils::quantize(float value, int prec) +{ + int q, unq; + + nvAssert (prec > 3); // we only want to do one replicate + + unq = (int)floor(value + 0.5f); + nvAssert (unq <= 255); + +#ifdef USE_ZOH_QUANT + q = (prec >= 8) ? unq : (unq << prec) / 256; +#else + // avpcl quantizer -- scale properly for best possible bit-replicated result + q = (unq * ((1<= 0 && q < (1 << prec)); + + return q; +} + +float Utils::metric4(Vector4::Arg a, Vector4::Arg b) +{ + Vector4 err = a - b; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt's go. 
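+// --- Editorial sketch; not part of the original NVTT sources. ---
+// Utils::unquantize above expands a prec-bit endpoint to 8 bits (bit
+// replication in the avpcl path, scale-and-round in the ZOH path) and
+// Utils::quantize picks the prec-bit code whose expansion lands closest to
+// the input.  A quick consistency check one could run, assuming the two
+// routines are intended to be inverses at a given precision (illustrative
+// code only, never called by the library):
+static inline bool example_quantize_roundtrip(int prec)
+{
+    for (int q = 0; q < (1 << prec); ++q)
+        if (Utils::quantize(float(Utils::unquantize(q, prec)), prec) != q)
+            return false; // quantize failed to invert unquantize
+    return true;
+}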
+float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode) +{ + Vector3 err = a - b; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else if (AVPCL::flag_nonuniform_ati) + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // adjust weights based on rotatemode + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: break; + case ROTATEMODE_RGBA_AGBR: rwt = 1.0f; break; + case ROTATEMODE_RGBA_RABG: gwt = 1.0f; break; + case ROTATEMODE_RGBA_RGAB: bwt = 1.0f; break; + default: nvUnreachable(); + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric1(const float a, const float b, int rotatemode) +{ + float err = a - b; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt, awt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else if (AVPCL::flag_nonuniform_ati) + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // adjust weights based on rotatemode + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break; + case ROTATEMODE_RGBA_AGBR: awt = rwt; break; + case ROTATEMODE_RGBA_RABG: awt = gwt; break; + case ROTATEMODE_RGBA_RGAB: awt = bwt; break; + default: nvUnreachable(); + } + + // weigh the components + err *= awt; + } + + return err * err; +} + +float Utils::premult(float r, float a) +{ + // note that the args are really integers stored in floats + int R = int(r), A = int(a); + + nvAssert ((R==r) && (A==a)); + + return float((R*A + 127)/255); +} + +static void premult4(Vector4& rgba) +{ + rgba.x = Utils::premult(rgba.x, rgba.w); + rgba.y = Utils::premult(rgba.y, rgba.w); + rgba.z = Utils::premult(rgba.z, rgba.w); +} + +static void premult3(Vector3& rgb, float a) +{ + rgb.x = Utils::premult(rgb.x, a); + rgb.y = Utils::premult(rgb.y, a); + rgb.z = Utils::premult(rgb.z, a); +} + +float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b) +{ + Vector4 pma = a, pmb = b; + + premult4(pma); + premult4(pmb); + + Vector4 err = pma - pmb; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1) +{ + Vector3 pma = rgb0, pmb = rgb1; + + premult3(pma, a0); + premult3(pmb, a1); + + Vector3 err = pma - pmb; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode) +{ + Vector3 pma = rgb0, pmb = rgb1; + + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: + // this 
function isn't supposed to be called for this rotatemode + nvUnreachable(); + break; + case ROTATEMODE_RGBA_AGBR: + pma.y = premult(pma.y, pma.x); + pma.z = premult(pma.z, pma.x); + pmb.y = premult(pmb.y, pmb.x); + pmb.z = premult(pmb.z, pmb.x); + break; + case ROTATEMODE_RGBA_RABG: + pma.x = premult(pma.x, pma.y); + pma.z = premult(pma.z, pma.y); + pmb.x = premult(pmb.x, pmb.y); + pmb.z = premult(pmb.z, pmb.y); + break; + case ROTATEMODE_RGBA_RGAB: + pma.x = premult(pma.x, pma.z); + pma.y = premult(pma.y, pma.z); + pmb.x = premult(pmb.x, pmb.z); + pmb.y = premult(pmb.y, pmb.z); + break; + default: nvUnreachable(); + } + + Vector3 err = pma - pmb; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode) +{ + float err = premult(rgb0, a0) - premult(rgb1, a1); + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt, awt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else if (AVPCL::flag_nonuniform_ati) + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // adjust weights based on rotatemode + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break; + case ROTATEMODE_RGBA_AGBR: awt = rwt; break; + case ROTATEMODE_RGBA_RABG: awt = gwt; break; + case ROTATEMODE_RGBA_RGAB: awt = bwt; break; + default: nvUnreachable(); + } + + // weigh the components + err *= awt; + } + + return err * err; +} diff --git a/3rdparty/nvtt/bc7/avpcl_utils.h b/3rdparty/nvtt/bc7/avpcl_utils.h new file mode 100644 index 00000000..9ebf8d17 --- /dev/null +++ b/3rdparty/nvtt/bc7/avpcl_utils.h @@ -0,0 +1,61 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// utility class holding common routines +#ifndef _AVPCL_UTILS_H +#define _AVPCL_UTILS_H + +#include "nvmath/Vector.h" + +namespace AVPCL { + +inline int SIGN_EXTEND(int x, int nb) { return ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)); } + +static const int INDEXMODE_BITS = 1; // 2 different index modes +static const int NINDEXMODES = (1<<(INDEXMODE_BITS)); +static const int INDEXMODE_ALPHA_IS_3BITS = 0; +static const int INDEXMODE_ALPHA_IS_2BITS = 1; + +static const int ROTATEMODE_BITS = 2; // 4 different rotate modes +static const int NROTATEMODES = (1<<(ROTATEMODE_BITS)); +static const int ROTATEMODE_RGBA_RGBA = 0; +static const int ROTATEMODE_RGBA_AGBR = 1; +static const int ROTATEMODE_RGBA_RABG = 2; +static const int ROTATEMODE_RGBA_RGAB = 3; + +class Utils +{ +public: + // error metrics + static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b); + static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode); + static float metric1(float a, float b, int rotatemode); + + static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1); + static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1); + static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode); + static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode); + + static float premult(float r, float a); + + // quantization and unquantization + static int unquantize(int q, int prec); + static int quantize(float value, int prec); + + // lerping + static int lerp(int a, int b, int i, int bias, int denom); + static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom); +}; + +} + +#endif \ No newline at end of file diff --git a/3rdparty/nvtt/bc7/bits.h b/3rdparty/nvtt/bc7/bits.h new file mode 100644 index 00000000..3c579d8f --- /dev/null +++ b/3rdparty/nvtt/bc7/bits.h @@ -0,0 +1,76 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_BITS_H +#define _AVPCL_BITS_H + +// read/write a bitstream + +#include "nvcore/Debug.h" + +namespace AVPCL { + +class Bits +{ +public: + + Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;} + Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;} + + void write(int value, int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + for (int i=0; i>i); + } + int read(int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + int out = 0; + for (int i=0; i= 0 && ptr < maxbits); bptr = ptr; } + int getsize() { return bend; } + +private: + int bptr; // next bit to read + int bend; // last written bit + 1 + char *bits; // ptr to user bit stream + const char *cbits; // ptr to const user bit stream + int maxbits; // max size of user bit stream + char readonly; // 1 if this is a read-only stream + + int readone() { + nvAssert (bptr < bend); + if (bptr >= bend) return 0; + int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7)); + ++bptr; + return bit != 0; + } + void writeone(int bit) { + nvAssert (!readonly); // "Writing a read-only bit stream" + nvAssert (bptr < maxbits); + if (bptr >= maxbits) return; + if (bit&1) + bits[bptr>>3] |= 1 << (bptr & 7); + else + bits[bptr>>3] &= ~(1 << (bptr & 7)); + if (bptr++ >= bend) bend = bptr; + } +}; + +} + +#endif \ No newline at end of file diff --git a/3rdparty/nvtt/bc7/endpts.h b/3rdparty/nvtt/bc7/endpts.h new file mode 100644 index 00000000..e635ff13 --- /dev/null +++ b/3rdparty/nvtt/bc7/endpts.h @@ -0,0 +1,81 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_ENDPTS_H +#define _AVPCL_ENDPTS_H + +// endpoint definitions and routines to search through endpoint space + +#include "nvmath/Vector.h" + +namespace AVPCL { + +static const int NCHANNELS_RGB = 3; +static const int NCHANNELS_RGBA = 4; +static const int CHANNEL_R = 0; +static const int CHANNEL_G = 1; +static const int CHANNEL_B = 2; +static const int CHANNEL_A = 3; + +struct FltEndpts +{ + nv::Vector4 A; + nv::Vector4 B; +}; + +struct IntEndptsRGB +{ + int A[NCHANNELS_RGB]; + int B[NCHANNELS_RGB]; +}; + +struct IntEndptsRGB_1 +{ + int A[NCHANNELS_RGB]; + int B[NCHANNELS_RGB]; + int lsb; // shared lsb for A and B +}; + +struct IntEndptsRGB_2 +{ + int A[NCHANNELS_RGB]; + int B[NCHANNELS_RGB]; + int a_lsb; // lsb for A + int b_lsb; // lsb for B +}; + + +struct IntEndptsRGBA +{ + int A[NCHANNELS_RGBA]; + int B[NCHANNELS_RGBA]; +}; + +struct IntEndptsRGBA_2 +{ + int A[NCHANNELS_RGBA]; + int B[NCHANNELS_RGBA]; + int a_lsb; // lsb for A + int b_lsb; // lsb for B +}; + +struct IntEndptsRGBA_2a +{ + int A[NCHANNELS_RGBA]; + int B[NCHANNELS_RGBA]; + int a_lsb; // lsb for RGB channels of A + int b_lsb; // lsb for RGB channels of A +}; + +} + +#endif diff --git a/3rdparty/nvtt/bc7/shapes_three.h b/3rdparty/nvtt/bc7/shapes_three.h new file mode 100644 index 00000000..dc95ba5f --- /dev/null +++ b/3rdparty/nvtt/bc7/shapes_three.h @@ -0,0 +1,132 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_SHAPES_THREE_H +#define _AVPCL_SHAPES_THREE_H + +// shapes for 3 regions + +#define NREGIONS 3 +#define NSHAPES 64 +#define SHAPEBITS 6 + +static int shapes[NSHAPES*16] = +{ +0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 2, 2, +0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 0, 0, 2, 2, +0, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, +2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1, +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1, +1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, +1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, + +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, +0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2, +1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 2, +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, + +0, 1, 1, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, +0, 1, 1, 2, 0, 1, 2, 2, 0, 1, 1, 2, 2, 0, 0, 1, +0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 2, 0, 0, +0, 1, 1, 2, 0, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, + +0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, +0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, +0, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2, +1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, + +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, +0, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0, +0, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0, + +0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, +0, 1, 2, 2, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 1, 0, +0, 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, +0, 0, 0, 0, 2, 2, 2, 2, 0, 1, 1, 0, 1, 2, 2, 1, + +0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, +1, 1, 0, 2, 0, 1, 1, 0, 0, 1, 2, 2, 2, 0, 0, 0, +1, 1, 0, 2, 2, 0, 0, 2, 0, 1, 2, 2, 2, 2, 1, 1, +0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 2, 1, + +0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 0, 1, 2, 0, +0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 1, 2, 0, +1, 1, 2, 2, 0, 0, 1, 2, 0, 0, 2, 2, 0, 1, 2, 0, +1, 2, 2, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0, 1, 2, 0, + +0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 1, 1, +1, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, +2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 0, 1, 1, 1, 2, 2, +0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 1, 1, + +0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 2, +1, 1, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 2, 2, +2, 2, 0, 0, 2, 2, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, +0, 0, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, + +0, 0, 2, 2, 0, 2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, +0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, +0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 1, +0, 0, 1, 1, 1, 2, 2, 1, 0, 1, 0, 1, 2, 1, 2, 1, + +0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, +0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, +0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 2, 1, 1, 2, +2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, + +0, 2, 2, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, +0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, +0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 2, 1, 1, 2, +0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, + +0, 1, 1, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, +0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, +2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, +2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 1, 1, 2, + +0, 0, 0, 2, 0, 2, 2, 2, 0, 1, 0, 1, 0, 1, 1, 1, +0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1, +0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, +0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, +}; + +#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16] + +static int shapeindex_to_compressed_indices[NSHAPES*3] = +{ + 0, 3,15, 0, 3, 8, 0,15, 8, 0,15, 3, + 0, 8,15, 0, 
3,15, 0,15, 3, 0,15, 8, + 0, 8,15, 0, 8,15, 0, 6,15, 0, 6,15, + 0, 6,15, 0, 5,15, 0, 3,15, 0, 3, 8, + + 0, 3,15, 0, 3, 8, 0, 8,15, 0,15, 3, + 0, 3,15, 0, 3, 8, 0, 6,15, 0,10, 8, + 0, 5, 3, 0, 8,15, 0, 8, 6, 0, 6,10, + 0, 8,15, 0, 5,15, 0,15,10, 0,15, 8, + + 0, 8,15, 0,15, 3, 0, 3,15, 0, 5,10, + 0, 6,10, 0,10, 8, 0, 8, 9, 0,15,10, + 0,15, 6, 0, 3,15, 0,15, 8, 0, 5,15, + 0,15, 3, 0,15, 6, 0,15, 6, 0,15, 8, + + 0, 3,15, 0,15, 3, 0, 5,15, 0, 5,15, + 0, 5,15, 0, 8,15, 0, 5,15, 0,10,15, + 0, 5,15, 0,10,15, 0, 8,15, 0,13,15, + 0,15, 3, 0,12,15, 0, 3,15, 0, 3, 8 + +}; +#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*3+(region)] + +#endif diff --git a/3rdparty/nvtt/bc7/shapes_two.h b/3rdparty/nvtt/bc7/shapes_two.h new file mode 100644 index 00000000..853d557a --- /dev/null +++ b/3rdparty/nvtt/bc7/shapes_two.h @@ -0,0 +1,133 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +#ifndef _AVPCL_SHAPES_TWO_H +#define _AVPCL_SHAPES_TWO_H + +// shapes for two regions + +#define NREGIONS 2 +#define NSHAPES 64 +#define SHAPEBITS 6 + +static int shapes[NSHAPES*16] = +{ +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, +0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, +1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, +1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, +1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, +0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, +1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, +1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, + +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, + +0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, +1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, +0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 
0, +1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, + +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, +1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, +1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, +1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, +0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, + +0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, +1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, +1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, +0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, + +0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, +1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, +1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, +1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, +0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, + +}; + +#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16] + +static int shapeindex_to_compressed_indices[NSHAPES*2] = +{ + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + + 0,15, 0, 2, 0, 8, 0, 2, + 0, 2, 0, 8, 0, 8, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 8, 0, 8, 0, 2, 0, 2, + + 0,15, 0,15, 0, 6, 0, 8, + 0, 2, 0, 8, 0,15, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 2, 0,15, 0,15, 0, 6, + + 0, 6, 0, 2, 0, 6, 0, 8, + 0,15, 0,15, 0, 2, 0, 2, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0, 2, 0, 2, 0,15 + +}; +#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*2+(region)] + +#endif diff --git a/3rdparty/nvtt/bc7/tile.h b/3rdparty/nvtt/bc7/tile.h new file mode 100644 index 00000000..47ea91b7 --- /dev/null +++ b/3rdparty/nvtt/bc7/tile.h @@ -0,0 +1,41 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_TILE_H +#define _AVPCL_TILE_H + +#include "nvmath/Vector.h" +#include +#include "avpcl_utils.h" + +namespace AVPCL { + +// extract a tile of pixels from an array + +class Tile +{ +public: + static const int TILE_H = 4; + static const int TILE_W = 4; + static const int TILE_TOTAL = TILE_H * TILE_W; + nv::Vector4 data[TILE_H][TILE_W]; + float importance_map[TILE_H][TILE_W]; + int size_x, size_y; // actual size of tile + + Tile() {}; + ~Tile(){}; + Tile(int xs, int ys) {size_x = xs; size_y = ys;} +}; + +} + +#endif \ No newline at end of file diff --git a/3rdparty/nvtt/nvcore/Array.inl b/3rdparty/nvtt/nvcore/Array.inl new file mode 100644 index 00000000..2138b3ab --- /dev/null +++ b/3rdparty/nvtt/nvcore/Array.inl @@ -0,0 +1,437 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_CORE_ARRAY_INL +#define NV_CORE_ARRAY_INL + +#include "array.h" + +#include "stream.h" +#include "utils.h" // swap + +#include // memmove +#include // for placement new + + + +namespace nv +{ + template + NV_FORCEINLINE T & Array::append() + { + uint old_size = m_size; + uint new_size = m_size + 1; + + setArraySize(new_size); + + construct_range(m_buffer, new_size, old_size); + + return m_buffer[old_size]; // Return reference to last element. + } + + // Push an element at the end of the vector. + template + NV_FORCEINLINE void Array::push_back( const T & val ) + { +#if 1 + nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size); + + uint old_size = m_size; + uint new_size = m_size + 1; + + setArraySize(new_size); + + construct_range(m_buffer, new_size, old_size, val); +#else + uint new_size = m_size + 1; + + if (new_size > m_capacity) + { + // @@ Is there any way to avoid this copy? + // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy? + // @@ Assert instead of copy? + const T copy(val); // create a copy in case value is inside of this array. + + setArraySize(new_size); + + new (m_buffer+new_size-1) T(copy); + } + else + { + m_size = new_size; + new(m_buffer+new_size-1) T(val); + } +#endif // 0/1 + } + template + NV_FORCEINLINE void Array::pushBack( const T & val ) + { + push_back(val); + } + template + NV_FORCEINLINE Array & Array::append( const T & val ) + { + push_back(val); + return *this; + } + + // Qt like push operator. + template + NV_FORCEINLINE Array & Array::operator<< ( T & t ) + { + push_back(t); + return *this; + } + + // Pop the element at the end of the vector. + template + NV_FORCEINLINE void Array::pop_back() + { + nvDebugCheck( m_size > 0 ); + resize( m_size - 1 ); + } + template + NV_FORCEINLINE void Array::popBack(uint count) + { + nvDebugCheck(m_size >= count); + resize(m_size - count); + } + + template + NV_FORCEINLINE void Array::popFront(uint count) + { + nvDebugCheck(m_size >= count); + //resize(m_size - count); + + if (m_size == count) { + clear(); + } + else { + destroy_range(m_buffer, 0, count); + + memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count)); + + m_size -= count; + } + + } + + + // Get back element. + template + NV_FORCEINLINE const T & Array::back() const + { + nvDebugCheck( m_size > 0 ); + return m_buffer[m_size-1]; + } + + // Get back element. + template + NV_FORCEINLINE T & Array::back() + { + nvDebugCheck( m_size > 0 ); + return m_buffer[m_size-1]; + } + + // Get front element. + template + NV_FORCEINLINE const T & Array::front() const + { + nvDebugCheck( m_size > 0 ); + return m_buffer[0]; + } + + // Get front element. 
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::front()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Check if the given element is contained in the array.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::contains(const T & e) const
+    {
+        return find(e, NULL);
+    }
+
+    // Return true if element found.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
+    {
+        return find(element, 0, m_size, indexPtr);
+    }
+
+    // Return true if element found within the given range.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
+    {
+        return ::nv::find(element, m_buffer, begin, end, indexPtr);
+    }
+
+
+    // Remove the element at the given index. This is an expensive operation!
+    template <typename T>
+    void Array<T>::removeAt(uint index)
+    {
+        nvDebugCheck(index >= 0 && index < m_size);
+
+        if (m_size == 1) {
+            clear();
+        }
+        else {
+            m_buffer[index].~T();
+
+            memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index));
+            m_size--;
+        }
+    }
+
+    // Remove the first instance of the given element.
+    template <typename T>
+    bool Array<T>::remove(const T & element)
+    {
+        uint index;
+        if (find(element, &index)) {
+            removeAt(index);
+            return true;
+        }
+        return false;
+    }
+
+    // Insert the given element at the given index shifting all the elements up.
+    template <typename T>
+    void Array<T>::insertAt(uint index, const T & val/*=T()*/)
+    {
+        nvDebugCheck( index >= 0 && index <= m_size );
+
+        setArraySize(m_size + 1);
+
+        if (index < m_size - 1) {
+            memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index));
+        }
+
+        // Copy-construct into the newly opened slot.
+        new(m_buffer+index) T(val);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
+    {
+        append(other.m_buffer, other.m_size);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    void Array<T>::append(const T other[], uint count)
+    {
+        if (count > 0) {
+            const uint old_size = m_size;
+
+            setArraySize(m_size + count);
+
+            for (uint i = 0; i < count; i++ ) {
+                new(m_buffer + old_size + i) T(other[i]);
+            }
+        }
+    }
+
+
+    // Remove the given element by replacing it with the last one.
+    template <typename T>
+    void Array<T>::replaceWithLast(uint index)
+    {
+        nvDebugCheck( index < m_size );
+        nv::swap(m_buffer[index], back());      // @@ Is this OK when index == size-1?
+        (m_buffer+m_size-1)->~T();
+        m_size--;
+    }
+
+    // Resize the vector preserving existing elements.
+    template <typename T>
+    void Array<T>::resize(uint new_size)
+    {
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call default constructors
+        construct_range(m_buffer, new_size, old_size);
+    }
+
+
+    // Resize the vector preserving existing elements and initializing the
+    // new ones with the given value.
+    template <typename T>
+    void Array<T>::resize(uint new_size, const T & elem)
+    {
+        nvDebugCheck(&elem < m_buffer || &elem > m_buffer+m_size);
+
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call copy constructors
+        construct_range(m_buffer, new_size, old_size, elem);
+    }
+
+    // Fill array with the given value.
+    template <typename T>
+    void Array<T>::fill(const T & elem)
+    {
+        fill(m_buffer, m_size, elem);
+    }
+
+    // Clear the buffer.
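+    // Note: this destroys the elements but keeps the allocation (capacity is unchanged);
+    // call shrink() afterwards to actually release the buffer.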
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::clear()
+    {
+        nvDebugCheck(isValidPtr(m_buffer));
+
+        // Destruct old elements
+        destroy_range(m_buffer, 0, m_size);
+
+        m_size = 0;
+    }
+
+    // Shrink the allocated vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::shrink()
+    {
+        if (m_size < m_capacity) {
+            setArrayCapacity(m_size);
+        }
+    }
+
+    // Preallocate space.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
+    {
+        if (desired_size > m_capacity) {
+            setArrayCapacity(desired_size);
+        }
+    }
+
+    // Copy elements to this array. Resizes it if needed.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
+    {
+#if 1   // Simpler, but maybe not as efficient?
+        destroy_range(m_buffer, 0, m_size);
+
+        setArraySize(count);
+
+        construct_range(m_buffer, count, 0, data);
+#else
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
+    }
+
+    // Assignment operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
+    {
+        copy(a.m_buffer, a.m_size);
+        return *this;
+    }
+
+    // Release ownership of allocated memory and returns pointer to it.
+    template <typename T>
+    T * Array<T>::release() {
+        T * tmp = m_buffer;
+        m_buffer = NULL;
+        m_capacity = 0;
+        m_size = 0;
+        return tmp;
+    }
+
+
+
+    // Change array size.
+    template <typename T>
+    inline void Array<T>::setArraySize(uint new_size) {
+        m_size = new_size;
+
+        if (new_size > m_capacity) {
+            uint new_buffer_size;
+            if (m_capacity == 0) {
+                // first allocation is exact
+                new_buffer_size = new_size;
+            }
+            else {
+                // following allocations grow array by 25%
+                new_buffer_size = new_size + (new_size >> 2);
+            }
+
+            setArrayCapacity( new_buffer_size );
+        }
+    }
+
+    // Change array capacity.
+    template <typename T>
+    inline void Array<T>::setArrayCapacity(uint new_capacity) {
+        nvDebugCheck(new_capacity >= m_size);
+
+        if (new_capacity == 0) {
+            // free the buffer.
+            if (m_buffer != NULL) {
+                free(m_buffer);
+                m_buffer = NULL;
+            }
+        }
+        else {
+            // realloc the buffer
+            m_buffer = realloc(m_buffer, new_capacity);
+        }
+
+        m_capacity = new_capacity;
+    }
+
+    // Array serialization.
+    template <typename T>
+    inline Stream & operator<< ( Stream & s, Array<T> & p )
+    {
+        if (s.isLoading()) {
+            uint size;
+            s << size;
+            p.resize( size );
+        }
+        else {
+            s << p.m_size;
+        }
+
+        for (uint i = 0; i < p.m_size; i++) {
+            s << p.m_buffer[i];
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given vectors.
+    template <typename T>
+    inline void swap(Array<T> & a, Array<T> & b)
+    {
+        nv::swap(a.m_buffer, b.m_buffer);
+        nv::swap(a.m_capacity, b.m_capacity);
+        nv::swap(a.m_size, b.m_size);
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_INL
diff --git a/3rdparty/nvtt/nvcore/Debug.h b/3rdparty/nvtt/nvcore/Debug.h
new file mode 100644
index 00000000..61fbd2fc
--- /dev/null
+++ b/3rdparty/nvtt/nvcore/Debug.h
@@ -0,0 +1,216 @@
+// This code is in the public domain -- Ignacio Castaño
+
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include "nvcore.h"
+
+#include <stdarg.h> // va_list
+
+
+// Make sure we are using our assert.
+#undef assert + +#define NV_ABORT_DEBUG 1 +#define NV_ABORT_IGNORE 2 +#define NV_ABORT_EXIT 3 + +#define nvNoAssert(exp) \ + NV_MULTI_LINE_MACRO_BEGIN \ + (void)sizeof(exp); \ + NV_MULTI_LINE_MACRO_END + +#if NV_NO_ASSERT + +# define nvAssert(exp) nvNoAssert(exp) +# define nvCheck(exp) nvNoAssert(exp) +# define nvDebugAssert(exp) nvNoAssert(exp) +# define nvDebugCheck(exp) nvNoAssert(exp) +# define nvDebugBreak() nvNoAssert(0) + +#else // NV_NO_ASSERT + +# if NV_CC_MSVC + // @@ Does this work in msvc-6 and earlier? +# define nvDebugBreak() __debugbreak() +//# define nvDebugBreak() __asm { int 3 } +# elif NV_OS_ORBIS +# define nvDebugBreak() __debugbreak() +# elif NV_CC_GNUC +# define nvDebugBreak() __builtin_trap() +# else +# error "No nvDebugBreak()!" +# endif + +/* +# elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN + // @@ Use __builtin_trap() on GCC +# define nvDebugBreak() __asm__ volatile ("trap") +# elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN +# define nvDebugBreak() __asm__ volatile ("int3") +# elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64 +# define nvDebugBreak() __asm__ ( "int %0" : :"I"(3) ) +# else +# include +# define nvDebugBreak() raise(SIGTRAP) +# endif +*/ + +#define nvDebugBreakOnce() \ + NV_MULTI_LINE_MACRO_BEGIN \ + static bool firstTime = true; \ + if (firstTime) { firstTime = false; nvDebugBreak(); } \ + NV_MULTI_LINE_MACRO_END + +#define nvAssertMacro(exp) \ + NV_MULTI_LINE_MACRO_BEGIN \ + if (!(exp)) { \ + if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \ + nvDebugBreak(); \ + } \ + } \ + NV_MULTI_LINE_MACRO_END + +// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care +#define nvAssertMacroWithIgnoreAll(exp,...) \ + NV_MULTI_LINE_MACRO_BEGIN \ + static bool ignoreAll = false; \ + if (!ignoreAll && !(exp)) { \ + int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \ + if (result == NV_ABORT_DEBUG) { \ + nvDebugBreak(); \ + } else if (result == NV_ABORT_IGNORE) { \ + ignoreAll = true; \ + } \ + } \ + NV_MULTI_LINE_MACRO_END + +// Interesting assert macro from Insomniac: +// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to +// Used as follows: +// if (nvCheck(i < count)) { +// normal path +// } else { +// fixup code. +// } +// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely. +#define nvCheckMacro(exp) \ + (\ + (exp) ? true : ( \ + (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \ + ) \ + ) + + +#define nvAssert(exp) nvAssertMacro(exp) +#define nvCheck(exp) nvAssertMacro(exp) + +#if defined(_DEBUG) +# define nvDebugAssert(exp) nvAssertMacro(exp) +# define nvDebugCheck(exp) nvAssertMacro(exp) +#else // _DEBUG +# define nvDebugAssert(exp) nvNoAssert(exp) +# define nvDebugCheck(exp) nvNoAssert(exp) +#endif // _DEBUG + +#endif // NV_NO_ASSERT + +// Use nvAssume for very simple expresions only: nvAssume(0), nvAssume(value == true), etc. 
+/*#if !defined(_DEBUG) +# if NV_CC_MSVC +# define nvAssume(exp) __assume(exp) +# else +# define nvAssume(exp) nvCheck(exp) +# endif +#else +# define nvAssume(exp) nvCheck(exp) +#endif*/ + +#if defined(_DEBUG) +# if NV_CC_MSVC +# define nvUnreachable() nvAssert(0 && "unreachable"); __assume(0) +# else +# define nvUnreachable() nvAssert(0 && "unreachable"); __builtin_unreachable() +# endif +#else +# if NV_CC_MSVC +# define nvUnreachable() __assume(0) +# else +# define nvUnreachable() __builtin_unreachable() +# endif +#endif + + +#define nvError(x) nvAbort(x, __FILE__, __LINE__, __FUNC__) +#define nvWarning(x) nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x)) + +#ifndef NV_DEBUG_PRINT +#define NV_DEBUG_PRINT 1 //defined(_DEBUG) +#endif + +#if NV_DEBUG_PRINT +#define nvDebug(...) nvDebugPrint(__VA_ARGS__) +#else +#if NV_CC_MSVC +#define nvDebug(...) __noop(__VA_ARGS__) +#else +#define nvDebug(...) ((void)0) // Non-msvc platforms do not evaluate arguments? +#endif +#endif + + +NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6))); +NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2))); + +namespace nv +{ + inline bool isValidPtr(const void * ptr) { + #if NV_CPU_X86_64 + if (ptr == NULL) return true; + if (reinterpret_cast(ptr) < 0x10000ULL) return false; + if (reinterpret_cast(ptr) >= 0x000007FFFFFEFFFFULL) return false; + #else + if (reinterpret_cast(ptr) == 0xcccccccc) return false; + if (reinterpret_cast(ptr) == 0xcdcdcdcd) return false; + if (reinterpret_cast(ptr) == 0xdddddddd) return false; + if (reinterpret_cast(ptr) == 0xffffffff) return false; + #endif + return true; + } + + // Message handler interface. + struct MessageHandler { + virtual void log(const char * str, va_list arg) = 0; + virtual ~MessageHandler() {} + }; + + // Assert handler interface. + struct AssertHandler { + virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0; + virtual ~AssertHandler() {} + }; + + + namespace debug + { + NVCORE_API void dumpInfo(); + NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 ); + + NVCORE_API void setMessageHandler( MessageHandler * messageHandler ); + NVCORE_API void resetMessageHandler(); + + NVCORE_API void setAssertHandler( AssertHandler * assertHanlder ); + NVCORE_API void resetAssertHandler(); + + NVCORE_API void enableSigHandler(bool interactive); + NVCORE_API void disableSigHandler(); + + NVCORE_API bool isDebuggerPresent(); + NVCORE_API bool attachToDebugger(); + + NVCORE_API void terminate(int code); + } + +} // nv namespace + +#endif // NV_CORE_DEBUG_H diff --git a/3rdparty/nvtt/nvcore/array.h b/3rdparty/nvtt/nvcore/array.h new file mode 100644 index 00000000..f4460f3b --- /dev/null +++ b/3rdparty/nvtt/nvcore/array.h @@ -0,0 +1,181 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_CORE_ARRAY_H +#define NV_CORE_ARRAY_H + +/* +This array class requires the elements to be relocable; it uses memmove and realloc. Ideally I should be +using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers +are not supported. + +Note also that push_back and resize does not support inserting arguments elements that are in the same +container. This is forbidden to prevent an extra copy. 
+*/ + + +#include "memory.h" +#include "debug.h" +#include "foreach.h" // pseudoindex + + +namespace nv +{ + class Stream; + + /** + * Replacement for std::vector that is easier to debug and provides + * some nice foreach enumerators. + */ + template + class NVCORE_CLASS Array { + public: + typedef uint size_type; + + // Default constructor. + NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {} + + // Copy constructor. + NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) { + copy(a.m_buffer, a.m_size); + } + + // Constructor that initializes the vector with the given elements. + NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) { + copy(ptr, num); + } + + // Allocate array. + NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) { + setArrayCapacity(capacity); + } + + // Destructor. + NV_FORCEINLINE ~Array() { + clear(); + free(m_buffer); + } + + + /// Const element access. + NV_FORCEINLINE const T & operator[]( uint index ) const + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + NV_FORCEINLINE const T & at( uint index ) const + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + + /// Element access. + NV_FORCEINLINE T & operator[] ( uint index ) + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + NV_FORCEINLINE T & at( uint index ) + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + + /// Get vector size. + NV_FORCEINLINE uint size() const { return m_size; } + + /// Get vector size. + NV_FORCEINLINE uint count() const { return m_size; } + + /// Get vector capacity. + NV_FORCEINLINE uint capacity() const { return m_capacity; } + + /// Get const vector pointer. + NV_FORCEINLINE const T * buffer() const { return m_buffer; } + + /// Get vector pointer. + NV_FORCEINLINE T * buffer() { return m_buffer; } + + /// Provide begin/end pointers for C++11 range-based for loops. + NV_FORCEINLINE T * begin() { return m_buffer; } + NV_FORCEINLINE T * end() { return m_buffer + m_size; } + NV_FORCEINLINE const T * begin() const { return m_buffer; } + NV_FORCEINLINE const T * end() const { return m_buffer + m_size; } + + /// Is vector empty. + NV_FORCEINLINE bool isEmpty() const { return m_size == 0; } + + /// Is a null vector. + NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; } + + + T & append(); + void push_back( const T & val ); + void pushBack( const T & val ); + Array & append( const T & val ); + Array & operator<< ( T & t ); + void pop_back(); + void popBack(uint count = 1); + void popFront(uint count = 1); + const T & back() const; + T & back(); + const T & front() const; + T & front(); + bool contains(const T & e) const; + bool find(const T & element, uint * indexPtr) const; + bool find(const T & element, uint begin, uint end, uint * indexPtr) const; + void removeAt(uint index); + bool remove(const T & element); + void insertAt(uint index, const T & val = T()); + void append(const Array & other); + void append(const T other[], uint count); + void replaceWithLast(uint index); + void resize(uint new_size); + void resize(uint new_size, const T & elem); + void fill(const T & elem); + void clear(); + void shrink(); + void reserve(uint desired_size); + void copy(const T * data, uint count); + Array & operator=( const Array & a ); + T * release(); + + + // Array enumerator. 
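+        // Illustrative usage with the foreach macro from foreach.h (example only, not part of the original API docs):
+        //   Array<int> values;
+        //   values.pushBack(42);
+        //   foreach (i, values) { nvDebug("%d\n", values[i]); }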
+ typedef uint PseudoIndex; + + NV_FORCEINLINE PseudoIndex start() const { return 0; } + NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; } + NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; } + +#if NV_CC_MSVC + NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) { + return m_buffer[i(this)]; + } + NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const { + return m_buffer[i(this)]; + } +#endif + + // Friends. + template + friend Stream & operator<< ( Stream & s, Array & p ); + + template + friend void swap(Array & a, Array & b); + + + protected: + + void setArraySize(uint new_size); + void setArrayCapacity(uint new_capacity); + + T * m_buffer; + uint m_capacity; + uint m_size; + + }; + + +} // nv namespace + +#endif // NV_CORE_ARRAY_H diff --git a/3rdparty/nvtt/nvcore/defsgnucdarwin.h b/3rdparty/nvtt/nvcore/defsgnucdarwin.h new file mode 100644 index 00000000..e082ddc8 --- /dev/null +++ b/3rdparty/nvtt/nvcore/defsgnucdarwin.h @@ -0,0 +1,53 @@ +#ifndef NV_CORE_H +#error "Do not include this file directly." +#endif + +#include // uint8_t, int8_t, ... uintptr_t +#include // operator new, size_t, NULL + +// Function linkage +#define DLL_IMPORT +#if __GNUC__ >= 4 +# define DLL_EXPORT __attribute__((visibility("default"))) +# define DLL_EXPORT_CLASS DLL_EXPORT +#else +# define DLL_EXPORT +# define DLL_EXPORT_CLASS +#endif + +// Function calling modes +#if NV_CPU_X86 +# define NV_CDECL __attribute__((cdecl)) +# define NV_STDCALL __attribute__((stdcall)) +#else +# define NV_CDECL +# define NV_STDCALL +#endif + +#define NV_FASTCALL __attribute__((fastcall)) +#define NV_FORCEINLINE __attribute__((always_inline)) inline +#define NV_DEPRECATED __attribute__((deprecated)) +#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX + +#if __GNUC__ > 2 +#define NV_PURE __attribute__((pure)) +#define NV_CONST __attribute__((const)) +#else +#define NV_PURE +#define NV_CONST +#endif + +#define NV_NOINLINE __attribute__((noinline)) + +// Define __FUNC__ properly. +#if __STDC_VERSION__ < 199901L +# if __GNUC__ >= 2 +# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ +# else +# define __FUNC__ "" +# endif +#else +# define __FUNC__ __PRETTY_FUNCTION__ +#endif + +#define restrict __restrict__ diff --git a/3rdparty/nvtt/nvcore/defsgnuclinux.h b/3rdparty/nvtt/nvcore/defsgnuclinux.h new file mode 100644 index 00000000..2126d866 --- /dev/null +++ b/3rdparty/nvtt/nvcore/defsgnuclinux.h @@ -0,0 +1,59 @@ +#ifndef NV_CORE_H +#error "Do not include this file directly." +#endif + +#include // uint8_t, int8_t, ... uintptr_t +#include // operator new, size_t, NULL + +// Function linkage +#define DLL_IMPORT +#if __GNUC__ >= 4 +# define DLL_EXPORT __attribute__((visibility("default"))) +# define DLL_EXPORT_CLASS DLL_EXPORT +#else +# define DLL_EXPORT +# define DLL_EXPORT_CLASS +#endif + +// Function calling modes +#if NV_CPU_X86 +# define NV_CDECL __attribute__((cdecl)) +# define NV_STDCALL __attribute__((stdcall)) +#else +# define NV_CDECL +# define NV_STDCALL +#endif + +#define NV_FASTCALL __attribute__((fastcall)) +//#if __GNUC__ > 3 +// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :( +#define NV_FORCEINLINE inline __attribute__((always_inline)) +//#else +// Some compilers complain that inline and always_inline are redundant. 
+//#define NV_FORCEINLINE __attribute__((always_inline)) +//#endif +#define NV_DEPRECATED __attribute__((deprecated)) +#define NV_THREAD_LOCAL __thread + +#if __GNUC__ > 2 +#define NV_PURE __attribute__((pure)) +#define NV_CONST __attribute__((const)) +#else +#define NV_PURE +#define NV_CONST +#endif + +#define NV_NOINLINE __attribute__((noinline)) + +// Define __FUNC__ properly. +#if __STDC_VERSION__ < 199901L +# if __GNUC__ >= 2 +# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ +# else +# define __FUNC__ "" +# endif +#else +# define __FUNC__ __PRETTY_FUNCTION__ +#endif + +#define restrict __restrict__ diff --git a/3rdparty/nvtt/nvcore/defsgnucwin32.h b/3rdparty/nvtt/nvcore/defsgnucwin32.h new file mode 100644 index 00000000..f35ed885 --- /dev/null +++ b/3rdparty/nvtt/nvcore/defsgnucwin32.h @@ -0,0 +1,65 @@ +#ifndef NV_CORE_H +#error "Do not include this file directly." +#endif + +//#include // size_t, NULL + +// Function linkage +#define DLL_IMPORT __declspec(dllimport) +#define DLL_EXPORT __declspec(dllexport) +#define DLL_EXPORT_CLASS DLL_EXPORT + +// Function calling modes +#if NV_CPU_X86 +# define NV_CDECL __attribute__((cdecl)) +# define NV_STDCALL __attribute__((stdcall)) +#else +# define NV_CDECL +# define NV_STDCALL +#endif + +#define NV_FASTCALL __attribute__((fastcall)) +#define NV_FORCEINLINE __attribute__((always_inline)) +#define NV_DEPRECATED __attribute__((deprecated)) + +#if __GNUC__ > 2 +#define NV_PURE __attribute__((pure)) +#define NV_CONST __attribute__((const)) +#else +#define NV_PURE +#define NV_CONST +#endif + +#define NV_NOINLINE __attribute__((noinline)) + +// Define __FUNC__ properly. +#if __STDC_VERSION__ < 199901L +# if __GNUC__ >= 2 +# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ +# else +# define __FUNC__ "" +# endif +#else +# define __FUNC__ __PRETTY_FUNCTION__ +#endif + +#define restrict __restrict__ + +/* +// Type definitions +typedef unsigned char uint8; +typedef signed char int8; + +typedef unsigned short uint16; +typedef signed short int16; + +typedef unsigned int uint32; +typedef signed int int32; + +typedef unsigned long long uint64; +typedef signed long long int64; + +// Aliases +typedef uint32 uint; +*/ + diff --git a/3rdparty/nvtt/nvcore/defsvcwin32.h b/3rdparty/nvtt/nvcore/defsvcwin32.h new file mode 100644 index 00000000..7b3876ab --- /dev/null +++ b/3rdparty/nvtt/nvcore/defsvcwin32.h @@ -0,0 +1,94 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_CORE_H +#error "Do not include this file directly." +#endif + +// Function linkage +#define DLL_IMPORT __declspec(dllimport) +#define DLL_EXPORT __declspec(dllexport) +#define DLL_EXPORT_CLASS DLL_EXPORT + +// Function calling modes +#define NV_CDECL __cdecl +#define NV_STDCALL __stdcall +#define NV_FASTCALL __fastcall +#define NV_DEPRECATED + +#define NV_PURE +#define NV_CONST + +// Set standard function names. +#if _MSC_VER < 1900 +# define snprintf _snprintf +#endif +#if _MSC_VER < 1500 +# define vsnprintf _vsnprintf +#endif +#if _MSC_VER < 1700 +# define strtoll _strtoi64 +# define strtoull _strtoui64 +#endif +#define chdir _chdir +#define getcwd _getcwd + +#if _MSC_VER < 1800 // Not sure what version introduced this. +#define va_copy(a, b) (a) = (b) +#endif + +#if !defined restrict +#define restrict +#endif + +// Ignore gcc attributes. 
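+// Expanding __attribute__(x) to nothing below lets GCC-style annotations
+// (e.g. the printf-format checks on nvAbort/nvDebugPrint in Debug.h) compile away under MSVC.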
+#define __attribute__(X) + +#if !defined __FUNC__ +#define __FUNC__ __FUNCTION__ +#endif + +#define NV_NOINLINE __declspec(noinline) +#define NV_FORCEINLINE __forceinline + +#define NV_THREAD_LOCAL __declspec(thread) + +/* +// Type definitions +typedef unsigned char uint8; +typedef signed char int8; + +typedef unsigned short uint16; +typedef signed short int16; + +typedef unsigned int uint32; +typedef signed int int32; + +typedef unsigned __int64 uint64; +typedef signed __int64 int64; + +// Aliases +typedef uint32 uint; +*/ + +// Unwanted VC++ warnings to disable. +/* +#pragma warning(disable : 4244) // conversion to float, possible loss of data +#pragma warning(disable : 4245) // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch +#pragma warning(disable : 4100) // unreferenced formal parameter +#pragma warning(disable : 4514) // unreferenced inline function has been removed +#pragma warning(disable : 4710) // inline function not expanded +#pragma warning(disable : 4127) // Conditional expression is constant +#pragma warning(disable : 4305) // truncation from 'const double' to 'float' +#pragma warning(disable : 4505) // unreferenced local function has been removed + +#pragma warning(disable : 4702) // unreachable code in inline expanded function +#pragma warning(disable : 4711) // function selected for automatic inlining +#pragma warning(disable : 4725) // Pentium fdiv bug + +#pragma warning(disable : 4786) // Identifier was truncated and cannot be debugged. + +#pragma warning(disable : 4675) // resolved overload was found by argument-dependent lookup +*/ + +#pragma warning(1 : 4705) // Report unused local variables. +#pragma warning(1 : 4555) // Expression has no effect. diff --git a/3rdparty/nvtt/nvcore/foreach.h b/3rdparty/nvtt/nvcore/foreach.h new file mode 100644 index 00000000..71b19f77 --- /dev/null +++ b/3rdparty/nvtt/nvcore/foreach.h @@ -0,0 +1,68 @@ +// This code is in the public domain -- Ignacio Castaņo + +#pragma once +#ifndef NV_CORE_FOREACH_H +#define NV_CORE_FOREACH_H + +/* +These foreach macros are very non-standard and somewhat confusing, but I like them. +*/ + +#include "nvcore.h" + +#if NV_CC_GNUC // If typeof or decltype is available: +#if !NV_CC_CPP11 +# define NV_DECLTYPE typeof // Using a non-standard extension over typeof that behaves as C++11 decltype +#else +# define NV_DECLTYPE decltype +#endif + +/* +Ideally we would like to write this: + +#define NV_FOREACH(i, container) \ + for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i)) + +But gcc versions prior to 4.7 required an intermediate type. See: +https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709 +*/ + +#define NV_FOREACH(i, container) \ + typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \ + for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i)) + +#else // If typeof not available: + +#include // placement new + +struct PseudoIndexWrapper { + template + PseudoIndexWrapper(const T & container) { + nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory)); + new (memory) typename T::PseudoIndex(container.start()); + } + // PseudoIndex cannot have a dtor! 
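+    // (It is constructed with placement new into the raw 'memory' buffer below and is never destroyed.)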
+ + template typename T::PseudoIndex & operator()(const T * /*container*/) { + return *reinterpret_cast(memory); + } + template const typename T::PseudoIndex & operator()(const T * /*container*/) const { + return *reinterpret_cast(memory); + } + + uint8 memory[4]; // Increase the size if we have bigger enumerators. +}; + +#define NV_FOREACH(i, container) \ + for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container)))) + +#endif + +// Declare foreach keyword. +#if !defined NV_NO_USE_KEYWORDS +# define foreach NV_FOREACH +# define foreach_index NV_FOREACH +#endif + + +#endif // NV_CORE_FOREACH_H diff --git a/3rdparty/nvtt/nvcore/hash.h b/3rdparty/nvtt/nvcore/hash.h new file mode 100644 index 00000000..a8b0b2c6 --- /dev/null +++ b/3rdparty/nvtt/nvcore/hash.h @@ -0,0 +1,83 @@ +// This code is in the public domain -- Ignacio Castaņo + +#pragma once +#ifndef NV_CORE_HASH_H +#define NV_CORE_HASH_H + +#include "nvcore.h" + +namespace nv +{ + inline uint sdbmHash(const void * data_in, uint size, uint h = 5381) + { + const uint8 * data = (const uint8 *) data_in; + uint i = 0; + while (i < size) { + h = (h << 16) + (h << 6) - h + (uint) data[i++]; + } + return h; + } + + // Note that this hash does not handle NaN properly. + inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381) + { + for (uint i = 0; i < count; i++) { + //nvDebugCheck(nv::isFinite(*f)); + union { float f; uint32 i; } x = { f[i] }; + if (x.i == 0x80000000) x.i = 0; + h = sdbmHash(&x, 4, h); + } + return h; + } + + + template + inline uint hash(const T & t, uint h = 5381) + { + return sdbmHash(&t, sizeof(T), h); + } + + template <> + inline uint hash(const float & f, uint h) + { + return sdbmFloatHash(&f, 1, h); + } + + + // Functors for hash table: + template struct Hash + { + uint operator()(const Key & k) const { + return hash(k); + } + }; + + template struct Equal + { + bool operator()(const Key & k0, const Key & k1) const { + return k0 == k1; + } + }; + + + // @@ Move to Utils.h? + template + struct Pair { + T1 first; + T2 second; + }; + + template + bool operator==(const Pair & p0, const Pair & p1) { + return p0.first == p1.first && p0.second == p1.second; + } + + template + uint hash(const Pair & p, uint h = 5381) { + return hash(p.second, hash(p.first)); + } + + +} // nv namespace + +#endif // NV_CORE_HASH_H diff --git a/3rdparty/nvtt/nvcore/memory.h b/3rdparty/nvtt/nvcore/memory.h new file mode 100644 index 00000000..22229aab --- /dev/null +++ b/3rdparty/nvtt/nvcore/memory.h @@ -0,0 +1,29 @@ +// This code is in the public domain -- Ignacio CastaÃąo + +#ifndef NV_CORE_MEMORY_H +#define NV_CORE_MEMORY_H + +#include "nvcore.h" + +namespace nv { + + // C++ helpers. 
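+    // Typed wrappers over the C allocation functions. Illustrative usage (example only):
+    //   float * buf = nv::malloc<float>(64);
+    //   buf = nv::realloc<float>(buf, 128);
+    //   nv::free(buf);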
+ template NV_FORCEINLINE T * malloc(size_t count) { + return (T *)::malloc(sizeof(T) * count); + } + + template NV_FORCEINLINE T * realloc(T * ptr, size_t count) { + return (T *)::realloc(ptr, sizeof(T) * count); + } + + template NV_FORCEINLINE void free(const T * ptr) { + ::free((void *)ptr); + } + + template NV_FORCEINLINE void zero(T & data) { + memset(&data, 0, sizeof(T)); + } + +} // nv namespace + +#endif // NV_CORE_MEMORY_H diff --git a/3rdparty/nvtt/nvcore/nvcore.h b/3rdparty/nvtt/nvcore/nvcore.h new file mode 100644 index 00000000..7f662725 --- /dev/null +++ b/3rdparty/nvtt/nvcore/nvcore.h @@ -0,0 +1,299 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_CORE_H +#define NV_CORE_H + +// Function linkage +#if NVCORE_SHARED +#ifdef NVCORE_EXPORTS +#define NVCORE_API DLL_EXPORT +#define NVCORE_CLASS DLL_EXPORT_CLASS +#else +#define NVCORE_API DLL_IMPORT +#define NVCORE_CLASS DLL_IMPORT +#endif +#else // NVCORE_SHARED +#define NVCORE_API +#define NVCORE_CLASS +#endif // NVCORE_SHARED + + +// Platform definitions +#include "posh.h" + +// OS: +// NV_OS_WIN32 +// NV_OS_WIN64 +// NV_OS_MINGW +// NV_OS_CYGWIN +// NV_OS_LINUX +// NV_OS_UNIX +// NV_OS_DARWIN +// NV_OS_XBOX +// NV_OS_ORBIS +// NV_OS_IOS + +#define NV_OS_STRING POSH_OS_STRING + +#if defined POSH_OS_LINUX +# define NV_OS_LINUX 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_ORBIS +# define NV_OS_ORBIS 1 +#elif defined POSH_OS_FREEBSD +# define NV_OS_FREEBSD 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_OPENBSD +# define NV_OS_OPENBSD 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_CYGWIN32 +# define NV_OS_CYGWIN 1 +#elif defined POSH_OS_MINGW +# define NV_OS_MINGW 1 +# define NV_OS_WIN32 1 +#elif defined POSH_OS_OSX +# define NV_OS_DARWIN 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_IOS +# define NV_OS_DARWIN 1 //ACS should we keep this on IOS? +# define NV_OS_UNIX 1 +# define NV_OS_IOS 1 +#elif defined POSH_OS_UNIX +# define NV_OS_UNIX 1 +#elif defined POSH_OS_WIN64 +# define NV_OS_WIN32 1 +# define NV_OS_WIN64 1 +#elif defined POSH_OS_WIN32 +# define NV_OS_WIN32 1 +#elif defined POSH_OS_XBOX +# define NV_OS_XBOX 1 +#else +# error "Unsupported OS" +#endif + + +// Threading: +// some platforms don't implement __thread or similar for thread-local-storage +#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios? +# define NV_OS_USE_PTHREAD 1 +# if NV_OS_DARWIN || NV_OS_IOS +# define NV_OS_HAS_TLS_QUALIFIER 0 +# else +# define NV_OS_HAS_TLS_QUALIFIER 1 +# endif +#else +# define NV_OS_USE_PTHREAD 0 +# define NV_OS_HAS_TLS_QUALIFIER 1 +#endif + + +// CPUs: +// NV_CPU_X86 +// NV_CPU_X86_64 +// NV_CPU_PPC +// NV_CPU_ARM +// NV_CPU_AARCH64 + +#define NV_CPU_STRING POSH_CPU_STRING + +#if defined POSH_CPU_X86_64 +//# define NV_CPU_X86 1 +# define NV_CPU_X86_64 1 +#elif defined POSH_CPU_X86 +# define NV_CPU_X86 1 +#elif defined POSH_CPU_PPC +# define NV_CPU_PPC 1 +#elif defined POSH_CPU_STRONGARM +# define NV_CPU_ARM 1 +#elif defined POSH_CPU_AARCH64 +# define NV_CPU_AARCH64 1 +#else +# error "Unsupported CPU" +#endif + + +// Compiler: +// NV_CC_GNUC +// NV_CC_MSVC +// NV_CC_CLANG + +#if defined POSH_COMPILER_CLANG +# define NV_CC_CLANG 1 +# define NV_CC_GNUC 1 // Clang is compatible with GCC. 
+# define NV_CC_STRING "clang" +#elif defined POSH_COMPILER_GCC +# define NV_CC_GNUC 1 +# define NV_CC_STRING "gcc" +#elif defined POSH_COMPILER_MSVC +# define NV_CC_MSVC 1 +# define NV_CC_STRING "msvc" +#else +# error "Unsupported compiler" +#endif + +#if NV_CC_MSVC +#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet. +#else +// @@ IC: This works in CLANG, about GCC? +// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4. +#ifdef __clang__ +#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert)) +#elif defined __GNUC__ +#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) +#endif +#endif + +// Endiannes: +#define NV_LITTLE_ENDIAN POSH_LITTLE_ENDIAN +#define NV_BIG_ENDIAN POSH_BIG_ENDIAN +#define NV_ENDIAN_STRING POSH_ENDIAN_STRING + + +// Define the right printf prefix for size_t arguments: +#if POSH_64BIT_POINTER +# define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX +#else +# define NV_SIZET_PRINTF_PREFIX +#endif + + +// Type definitions: +typedef posh_u8_t uint8; +typedef posh_i8_t int8; + +typedef posh_u16_t uint16; +typedef posh_i16_t int16; + +typedef posh_u32_t uint32; +typedef posh_i32_t int32; + +typedef posh_u64_t uint64; +typedef posh_i64_t int64; + +// Aliases +typedef uint32 uint; + + +// Version string: +#define NV_VERSION_STRING \ + NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \ + NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__ + + +// Disable copy constructor and assignment operator. +#if NV_CC_CPP11 +#define NV_FORBID_COPY(C) \ + C( const C & ) = delete; \ + C &operator=( const C & ) = delete +#else +#define NV_FORBID_COPY(C) \ + private: \ + C( const C & ); \ + C &operator=( const C & ) +#endif + +// Disable dynamic allocation on the heap. +// See Prohibiting Heap-Based Objects in More Effective C++. +#define NV_FORBID_HEAPALLOC() \ + private: \ + void *operator new(size_t size); \ + void *operator new[](size_t size) + //static void *operator new(size_t size); \ + //static void *operator new[](size_t size); + +// String concatenation macros. +#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2) +#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 +#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3) +#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3 +#define NV_STRING2(x) #x +#define NV_STRING(x) NV_STRING2(x) + +#if NV_CC_MSVC +#define NV_MULTI_LINE_MACRO_BEGIN do { +#define NV_MULTI_LINE_MACRO_END \ + __pragma(warning(push)) \ + __pragma(warning(disable:4127)) \ + } while(false) \ + __pragma(warning(pop)) +#else +#define NV_MULTI_LINE_MACRO_BEGIN do { +#define NV_MULTI_LINE_MACRO_END } while(false) +#endif + +#if NV_CC_CPP11 +#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed") +#else +#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)] +#endif +#define NV_COMPILER_CHECK(x) nvStaticCheck(x) // I like this name best. + +// Make sure type definitions are fine. 
+NV_COMPILER_CHECK(sizeof(int8) == 1); +NV_COMPILER_CHECK(sizeof(uint8) == 1); +NV_COMPILER_CHECK(sizeof(int16) == 2); +NV_COMPILER_CHECK(sizeof(uint16) == 2); +NV_COMPILER_CHECK(sizeof(int32) == 4); +NV_COMPILER_CHECK(sizeof(uint32) == 4); +NV_COMPILER_CHECK(sizeof(int32) == 4); +NV_COMPILER_CHECK(sizeof(uint32) == 4); + + +#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) + +#if 0 // Disabled in The Witness. +#if NV_CC_MSVC +#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x) +#else +#define NV_MESSAGE(x) message(x) +#endif +#else +#define NV_MESSAGE(x) +#endif + + +// Startup initialization macro. +#define NV_AT_STARTUP(some_code) \ + namespace { \ + static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \ + NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \ + } \ + NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \ + } + +// Indicate the compiler that the parameter is not used to suppress compier warnings. +#define NV_UNUSED(a) ((a)=(a)) + +// Null index. @@ Move this somewhere else... it's only used by nvmesh. +//const unsigned int NIL = unsigned int(~0); +//#define NIL uint(~0) + +// Null pointer. +#ifndef NULL +#define NULL 0 +#endif + +// Platform includes +#if NV_CC_MSVC +# if NV_OS_WIN32 +# include "DefsVcWin32.h" +# elif NV_OS_XBOX +# include "DefsVcXBox.h" +# else +# error "MSVC: Platform not supported" +# endif +#elif NV_CC_GNUC +# if NV_OS_LINUX +# include "DefsGnucLinux.h" +# elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD +# include "DefsGnucDarwin.h" +# elif NV_OS_MINGW +# include "DefsGnucWin32.h" +# elif NV_OS_CYGWIN +# error "GCC: Cygwin not supported" +# else +# error "GCC: Platform not supported" +# endif +#endif + +#endif // NV_CORE_H diff --git a/3rdparty/nvtt/nvcore/posh.h b/3rdparty/nvtt/nvcore/posh.h new file mode 100644 index 00000000..e401fb8f --- /dev/null +++ b/3rdparty/nvtt/nvcore/posh.h @@ -0,0 +1,1030 @@ +/** +@file posh.h +@author Brian Hook +@version 1.3.001 + +Header file for POSH, the Portable Open Source Harness project. + +NOTE: Unlike most header files, this one is designed to be included +multiple times, which is why it does not have the @#ifndef/@#define +preamble. + +POSH relies on environment specified preprocessor symbols in order +to infer as much as possible about the target OS/architecture and +the host compiler capabilities. + +NOTE: POSH is simple and focused. It attempts to provide basic +functionality and information, but it does NOT attempt to emulate +missing functionality. I am also not willing to make POSH dirty +and hackish to support truly ancient and/or outmoded and/or bizarre +technologies such as non-ANSI compilers, systems with non-IEEE +floating point formats, segmented 16-bit operating systems, etc. + +Please refer to the accompanying HTML documentation or visit +http://www.poshlib.org for more information on how to use POSH. + +LICENSE: + +Copyright (c) 2004, Brian Hook +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. 
+ + * The names of this package'ss contributors contributors may not + be used to endorse or promote products derived from this + software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REVISION: + +I've been lax about revision histories, so this starts at, um, 1.3.001. +Sorry for any inconveniences. + +1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary, + where I was not detecting Visual Studio + compilation on x86-64 systems. Added check for + _M_X64 which should fix that. + +*/ +/* +I have yet to find an authoritative reference on preprocessor +symbols, but so far this is what I've gleaned: + +GNU GCC/G++: + - __GNUC__: GNU C version + - __GNUG__: GNU C++ compiler + - __sun__ : on Sun platforms + - __svr4__: on Solaris and other SysV R4 platforms + - __mips__: on MIPS processor platforms + - __sparc_v9__: on Sparc 64-bit CPUs + - __sparcv9: 64-bit Solaris + - __MIPSEL__: mips processor, compiled for little endian + - __MIPSEB__: mips processor, compiled for big endian + - _R5900: MIPS/Sony/Toshiba R5900 (PS2) + - mc68000: 68K + - m68000: 68K + - m68k: 68K + - __palmos__: PalmOS + +Intel C/C++ Compiler: + - __ECC : compiler version, IA64 only + - __EDG__ + - __ELF__ + - __GXX_ABI_VERSION + - __i386 : IA-32 only + - __i386__ : IA-32 only + - i386 : IA-32 only + - __ia64 : IA-64 only + - __ia64__ : IA-64 only + - ia64 : IA-64 only + - __ICC : IA-32 only + - __INTEL_COMPILER : IA-32 or IA-64, newer versions only + +Apple's C/C++ Compiler for OS X: + - __APPLE_CC__ + - __APPLE__ + - __BIG_ENDIAN__ + - __APPLE__ + - __ppc__ + - __MACH__ + +DJGPP: + - __MSDOS__ + - __unix__ + - __unix + - __GNUC__ + - __GO32 + - DJGPP + - __i386, __i386, i386 + +Cray's C compiler: + - _ADDR64: if 64-bit pointers + - _UNICOS: + - __unix: + +SGI's CC compiler predefines the following (and more) with -ansi: + - __sgi + - __unix + - __host_mips + - _SYSTYPE_SVR4 + - __mips + - _MIPSEB + - anyone know if there is a predefined symbol for the compiler?! 
+ +MinGW: + - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others + - __MINGW32__ + +Cygwin: + - as Gnu C, but also + - __unix__ + - __CYGWIN32__ + +Microsoft Visual Studio predefines the following: + - _MSC_VER + - _WIN32: on Win32 + - _M_IX6 (on x86 systems) + - _M_X64: on x86-64 systems + - _M_ALPHA (on DEC AXP systems) + - _SH3: WinCE, Hitachi SH-3 + - _MIPS: WinCE, MIPS + - _ARM: WinCE, ARM + +Sun's C Compiler: + - sun and _sun + - unix and _unix + - sparc and _sparc (SPARC systems only) + - i386 and _i386 (x86 systems only) + - __SVR4 (Solaris only) + - __sparcv9: 64-bit solaris + - __SUNPRO_C + - _LP64: defined in 64-bit LP64 mode, but only if is included + +Borland C/C++ predefines the following: + - __BORLANDC__: + +DEC/Compaq C/C++ on Alpha: + - __alpha + - __arch64__ + - __unix__ (on Tru64 Unix) + - __osf__ + - __DECC + - __DECCXX (C++ compilation) + - __DECC_VER + - __DECCXX_VER + +IBM's AIX compiler: + - __64BIT__ if 64-bit mode + - _AIX + - __IBMC__: C compiler version + - __IBMCPP__: C++ compiler version + - _LONG_LONG: compiler allows long long + +Watcom: + - __WATCOMC__ + - __DOS__ : if targeting DOS + - __386__ : if 32-bit support + - __WIN32__ : if targetin 32-bit Windows + +HP-UX C/C++ Compiler: + - __hpux + - __unix + - __hppa (on PA-RISC) + - __LP64__: if compiled in 64-bit mode + +Metrowerks: + - __MWERKS__ + - __powerpc__ + - _powerc + - __MC68K__ + - macintosh when compiling for MacOS + - __INTEL__ for x86 targets + - __POWERPC__ + +LLVM: + - __llvm__ + - __clang__ +*/ + +/* +** ---------------------------------------------------------------------------- +** Include optionally +** ---------------------------------------------------------------------------- +*/ +#ifdef POSH_USE_LIMITS_H +# include +#endif + +/* +** ---------------------------------------------------------------------------- +** Determine compilation environment +** ---------------------------------------------------------------------------- +*/ +#if defined __ECC || defined __ICC || defined __INTEL_COMPILER +# define POSH_COMPILER_STRING "Intel C/C++" +# define POSH_COMPILER_INTEL 1 +#endif + +#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__ +# define POSH_COMPILER_STRING "MIPSpro C/C++" +# define POSH_COMPILER_MIPSPRO 1 +#endif + +#if defined __hpux && !defined __GNUC__ +# define POSH_COMPILER_STRING "HP-UX CC" +# define POSH_COMPILER_HPCC 1 +#endif + +#if defined __clang__ +# define POSH_COMPILER_STRING "Clang" +# define POSH_COMPILER_CLANG 1 +#endif + +#if defined __GNUC__ && !defined __clang__ +# define POSH_COMPILER_STRING "Gnu GCC" +# define POSH_COMPILER_GCC 1 +#endif + +#if defined __APPLE_CC__ + /* we don't define the compiler string here, let it be GNU */ +# define POSH_COMPILER_APPLECC 1 +#endif + +#if defined __IBMC__ || defined __IBMCPP__ +# define POSH_COMPILER_STRING "IBM C/C++" +# define POSH_COMPILER_IBM 1 +#endif + +#if defined _MSC_VER +# define POSH_COMPILER_STRING "Microsoft Visual C++" +# define POSH_COMPILER_MSVC 1 +#endif + +#if defined __SUNPRO_C +# define POSH_COMPILER_STRING "Sun Pro" +# define POSH_COMPILER_SUN 1 +#endif + +#if defined __BORLANDC__ +# define POSH_COMPILER_STRING "Borland C/C++" +# define POSH_COMPILER_BORLAND 1 +#endif + +#if defined __MWERKS__ +# define POSH_COMPILER_STRING "MetroWerks CodeWarrior" +# define POSH_COMPILER_METROWERKS 1 +#endif + +#if defined __DECC || defined __DECCXX +# define POSH_COMPILER_STRING "Compaq/DEC C/C++" +# define POSH_COMPILER_DEC 1 +#endif + +#if defined __WATCOMC__ 
+# define POSH_COMPILER_STRING "Watcom C/C++" +# define POSH_COMPILER_WATCOM 1 +#endif + +#if !defined POSH_COMPILER_STRING +# define POSH_COMPILER_STRING "Unknown compiler" +#endif + +/* +** ---------------------------------------------------------------------------- +** Determine target operating system +** ---------------------------------------------------------------------------- +*/ +#if defined linux || defined __linux__ +# define POSH_OS_LINUX 1 +# define POSH_OS_STRING "Linux" +#endif + +#if defined __FreeBSD__ +# define POSH_OS_FREEBSD 1 +# define POSH_OS_STRING "FreeBSD" +#endif + +#if defined __OpenBSD__ +# define POSH_OS_OPENBSD 1 +# define POSH_OS_STRING "OpenBSD" +#endif + +#if defined __CYGWIN32__ +# define POSH_OS_CYGWIN32 1 +# define POSH_OS_STRING "Cygwin" +#endif + +#if defined GEKKO +# define POSH_OS_GAMECUBE +# define __powerpc__ +# define POSH_OS_STRING "GameCube" +#endif + +#if defined __MINGW32__ +# define POSH_OS_MINGW 1 +# define POSH_OS_STRING "MinGW" +#endif + +#if defined GO32 && defined DJGPP && defined __MSDOS__ +# define POSH_OS_GO32 1 +# define POSH_OS_STRING "GO32/MS-DOS" +#endif + +/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS, + otherwise Watcom assumes host=target */ +#if defined __WATCOMC__ && defined __386__ && defined __DOS__ +# define POSH_OS_DOS32 1 +# define POSH_OS_STRING "DOS/32-bit" +#endif + +#if defined _UNICOS +# define POSH_OS_UNICOS 1 +# define POSH_OS_STRING "UNICOS" +#endif + +#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx +# define POSH_OS_OSX 1 +# define POSH_OS_STRING "MacOS X" +#endif + +#if defined __sun__ || defined sun || defined __sun || defined __solaris__ +# if defined __SVR4 || defined __svr4__ || defined __solaris__ +# define POSH_OS_STRING "Solaris" +# define POSH_OS_SOLARIS 1 +# endif +# if !defined POSH_OS_STRING +# define POSH_OS_STRING "SunOS" +# define POSH_OS_SUNOS 1 +# endif +#endif + +#if defined __sgi__ || defined sgi || defined __sgi +# define POSH_OS_IRIX 1 +# define POSH_OS_STRING "Irix" +#endif + +#if defined __hpux__ || defined __hpux +# define POSH_OS_HPUX 1 +# define POSH_OS_STRING "HP-UX" +#endif + +#if defined _AIX +# define POSH_OS_AIX 1 +# define POSH_OS_STRING "AIX" +#endif + +#if ( defined __alpha && defined __osf__ ) +# define POSH_OS_TRU64 1 +# define POSH_OS_STRING "Tru64" +#endif + +#if defined __BEOS__ || defined __beos__ +# define POSH_OS_BEOS 1 +# define POSH_OS_STRING "BeOS" +#endif + +#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA +# define POSH_OS_AMIGA 1 +# define POSH_OS_STRING "Amiga" +#endif + +#if defined __unix__ +# define POSH_OS_UNIX 1 +# if !defined POSH_OS_STRING +# define POSH_OS_STRING "Unix-like(generic)" +# endif +#endif + +#if defined _WIN32_WCE +# define POSH_OS_WINCE 1 +# define POSH_OS_STRING "Windows CE" +#endif + +#if defined _XBOX || defined _XBOX_VER +# define POSH_OS_XBOX 1 +# define POSH_OS_STRING "XBOX" +#endif + +#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ +# define POSH_OS_WIN32 1 +# if !defined POSH_OS_XBOX +# if defined _WIN64 +# define POSH_OS_WIN64 1 +# define POSH_OS_STRING "Win64" +# else +# if !defined POSH_OS_STRING +# define POSH_OS_STRING "Win32" +# endif +# endif +# endif +#endif + +#if defined __palmos__ +# define POSH_OS_PALM 1 +# define POSH_OS_STRING "PalmOS" +#endif + +#if defined THINK_C || defined macintosh +# define POSH_OS_MACOS 1 +# define POSH_OS_STRING "MacOS" +#endif + +/* +** 
----------------------------------------------------------------------------- +** Determine target CPU +** ----------------------------------------------------------------------------- +*/ + +#if defined GEKKO +# define POSH_CPU_PPC750 1 +# define POSH_CPU_STRING "IBM PowerPC 750 (NGC)" +#endif + +#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000 +# define POSH_CPU_68K 1 +# define POSH_CPU_STRING "MC68000" +#endif + +#if defined __PPC__ || defined __POWERPC__ || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ || defined _M_PPC +# define POSH_CPU_PPC 1 +# if !defined POSH_CPU_STRING +# if defined __powerpc64__ +# define POSH_CPU_STRING "PowerPC64" +# else +# define POSH_CPU_STRING "PowerPC" +# endif +# endif +#endif + +#if defined _CRAYT3E || defined _CRAYMPP +# define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/ +# define POSH_CPU_STRING "Cray T3E (Alpha 21164)" +#endif + +#if defined CRAY || defined _CRAY && !defined _CRAYT3E +# error Non-AXP Cray systems not supported +#endif + +#if defined _SH3 +# define POSH_CPU_SH3 1 +# define POSH_CPU_STRING "Hitachi SH-3" +#endif + +#if defined __sh4__ || defined __SH4__ +# define POSH_CPU_SH3 1 +# define POSH_CPU_SH4 1 +# define POSH_CPU_STRING "Hitachi SH-4" +#endif + +#if defined __sparc__ || defined __sparc +# if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__ +# define POSH_CPU_SPARC64 1 +# define POSH_CPU_STRING "Sparc/64" +# else +# define POSH_CPU_STRING "Sparc/32" +# endif +# define POSH_CPU_SPARC 1 +#endif + +#if defined ARM || defined __arm__ || defined _ARM +# define POSH_CPU_STRONGARM 1 +# define POSH_CPU_STRING "ARM" +#endif + +#if defined __aarch64__ +# define POSH_CPU_AARCH64 1 +# define POSH_CPU_STRING "ARM64" +#endif + +#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS +# define POSH_CPU_MIPS 1 +# if defined _R5900 +# define POSH_CPU_STRING "MIPS R5900 (PS2)" +# else +# define POSH_CPU_STRING "MIPS" +# endif +#endif + +#if defined __ia64 || defined _M_IA64 || defined __ia64__ +# define POSH_CPU_IA64 1 +# define POSH_CPU_STRING "IA64" +#endif + +#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64 +# define POSH_CPU_X86 1 +# if defined __x86_64__ || defined _M_X64 +# define POSH_CPU_X86_64 1 +# endif +# if defined POSH_CPU_X86_64 +# define POSH_CPU_STRING "AMD x86-64" +# else +# define POSH_CPU_STRING "Intel 386+" +# endif +#endif + +#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__ +# define POSH_CPU_AXP 1 +# define POSH_CPU_STRING "AXP" +#endif + +#if defined __hppa || defined hppa +# define POSH_CPU_HPPA 1 +# define POSH_CPU_STRING "PA-RISC" +#endif + +#if !defined POSH_CPU_STRING +# error POSH cannot determine target CPU +# define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */ +#endif + +/* +** ----------------------------------------------------------------------------- +** Attempt to autodetect building for embedded on Sony PS2 +** ----------------------------------------------------------------------------- +*/ +#if !defined POSH_OS_STRING +# if !defined FORCE_DOXYGEN +# define POSH_OS_EMBEDDED 1 +# endif +# if defined _R5900 +# define POSH_OS_STRING "Sony PS2(embedded)" +# else +# define POSH_OS_STRING "Embedded/Unknown" +# endif +#endif + +/* +** --------------------------------------------------------------------------- +** Handle cdecl, stdcall, fastcall, etc. 
+** --------------------------------------------------------------------------- +*/ +#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64 +# if defined __GNUC__ +# define POSH_CDECL __attribute__((cdecl)) +# define POSH_STDCALL __attribute__((stdcall)) +# define POSH_FASTCALL __attribute__((fastcall)) +# elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ ) +# define POSH_CDECL __cdecl +# define POSH_STDCALL __stdcall +# define POSH_FASTCALL __fastcall +# endif +#else +# define POSH_CDECL +# define POSH_STDCALL +# define POSH_FASTCALL +#endif + +/* +** --------------------------------------------------------------------------- +** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB +** --------------------------------------------------------------------------- +*/ + +/* +** We undefine this so that multiple inclusions will work +*/ +#if defined POSH_IMPORTEXPORT +# undef POSH_IMPORTEXPORT +#endif + +#if defined POSH_DLL +# if defined POSH_OS_WIN32 +# if defined _MSC_VER +# if ( _MSC_VER >= 800 ) +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __declspec( dllexport ) +# else +# define POSH_IMPORTEXPORT __declspec( dllimport ) +# endif +# else +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __export +# else +# define POSH_IMPORTEXPORT +# endif +# endif +# endif /* defined _MSC_VER */ +# if defined __BORLANDC__ +# if ( __BORLANDC__ >= 0x500 ) +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __declspec( dllexport ) +# else +# define POSH_IMPORTEXPORT __declspec( dllimport ) +# endif +# else +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __export +# else +# define POSH_IMPORTEXPORT +# endif +# endif +# endif /* defined __BORLANDC__ */ + /* for all other compilers, we're just making a blanket assumption */ +# if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__ +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __declspec( dllexport ) +# else +# define POSH_IMPORTEXPORT __declspec( dllimport ) +# endif +# endif /* all other compilers */ +# if !defined POSH_IMPORTEXPORT +# error Building DLLs not supported on this compiler (poshlib@poshlib.org if you know how) +# endif +# endif /* defined POSH_OS_WIN32 */ +#endif + +/* On pretty much everything else, we can thankfully just ignore this */ +#if !defined POSH_IMPORTEXPORT +# define POSH_IMPORTEXPORT +#endif + +#if defined FORCE_DOXYGEN +# define POSH_DLL +# define POSH_BUILDING_LIB +# undef POSH_DLL +# undef POSH_BUILDING_LIB +#endif + +/* +** ---------------------------------------------------------------------------- +** (Re)define POSH_PUBLIC_API export signature +** ---------------------------------------------------------------------------- +*/ +#ifdef POSH_PUBLIC_API +# undef POSH_PUBLIC_API +#endif + +#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) ) +# define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT +#else +# define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype +#endif + +/* +** ---------------------------------------------------------------------------- +** Try to infer endianess. Basically we just go through the CPUs we know are +** little endian, and assume anything that isn't one of those is big endian. +** As a sanity check, we also do this with operating systems we know are +** little endian, such as Windows. Some processors are bi-endian, such as +** the MIPS series, so we have to be careful about those. 
+** ---------------------------------------------------------------------------- +*/ +#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__ +# define POSH_ENDIAN_STRING "little" +# define POSH_LITTLE_ENDIAN 1 +#else +# define POSH_ENDIAN_STRING "big" +# define POSH_BIG_ENDIAN 1 +#endif + +#if defined FORCE_DOXYGEN +# define POSH_LITTLE_ENDIAN +#endif + +/* +** ---------------------------------------------------------------------------- +** Cross-platform compile time assertion macro +** ---------------------------------------------------------------------------- +*/ +#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ] + +/* +** ---------------------------------------------------------------------------- +** 64-bit Integer +** +** We don't require 64-bit support, nor do we emulate its functionality, we +** simply export it if it's available. Since we can't count on +** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive. +** ---------------------------------------------------------------------------- +*/ +#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64 +# define POSH_64BIT_INTEGER 1 +typedef long posh_i64_t; +typedef unsigned long posh_u64_t; +# define POSH_I64( x ) ((posh_i64_t)x) +# define POSH_U64( x ) ((posh_u64_t)x) +# define POSH_I64_PRINTF_PREFIX "l" +#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC ) +# define POSH_64BIT_INTEGER 1 +typedef __int64 posh_i64_t; +typedef unsigned __int64 posh_u64_t; +# define POSH_I64( x ) ((posh_i64_t)(x##i64)) +# define POSH_U64( x ) ((posh_u64_t)(x##ui64)) +# define POSH_I64_PRINTF_PREFIX "I64" +#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC +# define POSH_64BIT_INTEGER 1 +typedef long long posh_i64_t; +typedef unsigned long long posh_u64_t; +# define POSH_U64( x ) ((posh_u64_t)(x##LL)) +# define POSH_I64( x ) ((posh_i64_t)(x##LL)) +# define POSH_I64_PRINTF_PREFIX "ll" +#endif + +/* hack */ +/*#ifdef __MINGW32__ +#undef POSH_I64 +#undef POSH_U64 +#undef POSH_I64_PRINTF_PREFIX +#define POSH_I64( x ) ((posh_i64_t)x) +#define POSH_U64( x ) ((posh_u64_t)x) +#define POSH_I64_PRINTF_PREFIX "I64" +#endif*/ + +#ifdef FORCE_DOXYGEN +typedef long long posh_i64_t; +typedef unsigned long posh_u64_t; +# define POSH_64BIT_INTEGER +# define POSH_I64_PRINTF_PREFIX +# define POSH_I64(x) +# define POSH_U64(x) +#endif + +/** Minimum value for a 64-bit signed integer */ +#define POSH_I64_MIN POSH_I64(0x8000000000000000) +/** Maximum value for a 64-bit signed integer */ +#define POSH_I64_MAX POSH_I64(0x7FFFFFFFFFFFFFFF) +/** Minimum value for a 64-bit unsigned integer */ +#define POSH_U64_MIN POSH_U64(0) +/** Maximum value for a 64-bit unsigned integer */ +#define POSH_U64_MAX POSH_U64(0xFFFFFFFFFFFFFFFF) + +/* ---------------------------------------------------------------------------- +** Basic Sized Types +** +** These types are expected to be EXACTLY sized so you can use them for +** serialization. +** ---------------------------------------------------------------------------- +*/ +#define POSH_FALSE 0 +#define POSH_TRUE 1 + +typedef int posh_bool_t; +typedef unsigned char posh_byte_t; + +/* NOTE: These assume that CHAR_BIT is 8!! 
*/ +typedef unsigned char posh_u8_t; +typedef signed char posh_i8_t; + +#if defined POSH_USE_LIMITS_H +# if CHAR_BITS > 8 +# error This machine uses 9-bit characters. This is a warning, you can comment this out now. +# endif /* CHAR_BITS > 8 */ + +/* 16-bit */ +# if ( USHRT_MAX == 65535 ) + typedef unsigned short posh_u16_t; + typedef short posh_i16_t; +# else + /* Yes, in theory there could still be a 16-bit character type and shorts are + 32-bits in size...if you find such an architecture, let me know =P */ +# error No 16-bit type found +# endif + +/* 32-bit */ +# if ( INT_MAX == 2147483647 ) + typedef unsigned posh_u32_t; + typedef int posh_i32_t; +# elif ( LONG_MAX == 2147483647 ) + typedef unsigned long posh_u32_t; + typedef long posh_i32_t; +# else + error No 32-bit type found +# endif + +#else /* POSH_USE_LIMITS_H */ + + typedef unsigned short posh_u16_t; + typedef short posh_i16_t; + +# if !defined POSH_OS_PALM + typedef unsigned posh_u32_t; + typedef int posh_i32_t; +# else + typedef unsigned long posh_u32_t; + typedef long posh_i32_t; +# endif +#endif + +/** Minimum value for a byte */ +#define POSH_BYTE_MIN 0 +/** Maximum value for an 8-bit unsigned value */ +#define POSH_BYTE_MAX 255 +/** Minimum value for a byte */ +#define POSH_I16_MIN ( ( posh_i16_t ) 0x8000 ) +/** Maximum value for a 16-bit signed value */ +#define POSH_I16_MAX ( ( posh_i16_t ) 0x7FFF ) +/** Minimum value for a 16-bit unsigned value */ +#define POSH_U16_MIN 0 +/** Maximum value for a 16-bit unsigned value */ +#define POSH_U16_MAX ( ( posh_u16_t ) 0xFFFF ) +/** Minimum value for a 32-bit signed value */ +#define POSH_I32_MIN ( ( posh_i32_t ) 0x80000000 ) +/** Maximum value for a 32-bit signed value */ +#define POSH_I32_MAX ( ( posh_i32_t ) 0x7FFFFFFF ) +/** Minimum value for a 32-bit unsigned value */ +#define POSH_U32_MIN 0 +/** Maximum value for a 32-bit unsigned value */ +#define POSH_U32_MAX ( ( posh_u32_t ) 0xFFFFFFFF ) + +/* +** ---------------------------------------------------------------------------- +** Sanity checks on expected sizes +** ---------------------------------------------------------------------------- +*/ +#if !defined FORCE_DOXYGEN + +POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1); +POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1); +POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1); +POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2); +POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2); +POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4); +POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4); + +#if !defined POSH_NO_FLOAT + POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 ); + POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8); +#endif + +#if defined POSH_64BIT_INTEGER + POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8); + POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8); +#endif + +#endif + +/* +** ---------------------------------------------------------------------------- +** 64-bit pointer support +** ---------------------------------------------------------------------------- +*/ +#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX ) +# define POSH_64BIT_POINTER 1 +#endif + +#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX +# define POSH_64BIT_POINTER 1 +#endif + +#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined 
_CRAYC +# define POSH_64BIT_POINTER 1 +#endif + +#if defined POSH_64BIT_POINTER + POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 ); +#elif !defined FORCE_DOXYGEN +/* if this assertion is hit then you're on a system that either has 64-bit + addressing and we didn't catch it, or you're on a system with 16-bit + pointers. In the latter case, POSH doesn't actually care, we're just + triggering this assertion to make sure you're aware of the situation, + so feel free to delete it. + + If this assertion is triggered on a known 32 or 64-bit platform, + please let us know (poshlib@poshlib.org) */ + POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 ); +#endif + +#if defined FORCE_DOXYGEN +# define POSH_64BIT_POINTER +#endif + +/* +** ---------------------------------------------------------------------------- +** POSH Utility Functions +** +** These are optional POSH utility functions that are not required if you don't +** need anything except static checking of your host and target environment. +** +** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want +** to enforce their export if your own library is only using them internally. +** ---------------------------------------------------------------------------- +*/ +#ifdef __cplusplus +extern "C" { +#endif + +const char *POSH_GetArchString( void ); + +#if !defined POSH_NO_FLOAT + +posh_u32_t POSH_LittleFloatBits( float f ); +posh_u32_t POSH_BigFloatBits( float f ); +float POSH_FloatFromLittleBits( posh_u32_t bits ); +float POSH_FloatFromBigBits( posh_u32_t bits ); + +void POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] ); +double POSH_DoubleFromBits( const posh_byte_t src[ 8 ] ); + +/* unimplemented +float *POSH_WriteFloatToLittle( void *dst, float f ); +float *POSH_WriteFloatToBig( void *dst, float f ); +float POSH_ReadFloatFromLittle( const void *src ); +float POSH_ReadFloatFromBig( const void *src ); + +double *POSH_WriteDoubleToLittle( void *dst, double d ); +double *POSH_WriteDoubleToBig( void *dst, double d ); +double POSH_ReadDoubleFromLittle( const void *src ); +double POSH_ReadDoubleFromBig( const void *src ); +*/ +#endif /* !defined POSH_NO_FLOAT */ + +#if defined FORCE_DOXYGEN +# define POSH_NO_FLOAT +# undef POSH_NO_FLOAT +#endif + +extern posh_u16_t POSH_SwapU16( posh_u16_t u ); +extern posh_i16_t POSH_SwapI16( posh_i16_t u ); +extern posh_u32_t POSH_SwapU32( posh_u32_t u ); +extern posh_i32_t POSH_SwapI32( posh_i32_t u ); + +#if defined POSH_64BIT_INTEGER + +extern posh_u64_t POSH_SwapU64( posh_u64_t u ); +extern posh_i64_t POSH_SwapI64( posh_i64_t u ); + +#endif /*POSH_64BIT_INTEGER */ + +extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value ); +extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value ); +extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value ); +extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value ); + +extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value ); +extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value ); +extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value ); +extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value ); + +extern posh_u16_t POSH_ReadU16FromLittle( const void *src ); +extern posh_i16_t POSH_ReadI16FromLittle( const void *src ); +extern posh_u32_t POSH_ReadU32FromLittle( const void *src ); +extern posh_i32_t POSH_ReadI32FromLittle( const void *src ); + +extern posh_u16_t POSH_ReadU16FromBig( const void *src ); +extern 
posh_i16_t POSH_ReadI16FromBig( const void *src ); +extern posh_u32_t POSH_ReadU32FromBig( const void *src ); +extern posh_i32_t POSH_ReadI32FromBig( const void *src ); + +#if defined POSH_64BIT_INTEGER +extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value ); +extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value ); +extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value ); +extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value ); + +extern posh_u64_t POSH_ReadU64FromLittle( const void *src ); +extern posh_i64_t POSH_ReadI64FromLittle( const void *src ); +extern posh_u64_t POSH_ReadU64FromBig( const void *src ); +extern posh_i64_t POSH_ReadI64FromBig( const void *src ); +#endif /* POSH_64BIT_INTEGER */ + +#if defined POSH_LITTLE_ENDIAN + +# define POSH_LittleU16(x) (x) +# define POSH_LittleU32(x) (x) +# define POSH_LittleI16(x) (x) +# define POSH_LittleI32(x) (x) +# if defined POSH_64BIT_INTEGER +# define POSH_LittleU64(x) (x) +# define POSH_LittleI64(x) (x) +# endif /* defined POSH_64BIT_INTEGER */ + +# define POSH_BigU16(x) POSH_SwapU16(x) +# define POSH_BigU32(x) POSH_SwapU32(x) +# define POSH_BigI16(x) POSH_SwapI16(x) +# define POSH_BigI32(x) POSH_SwapI32(x) +# if defined POSH_64BIT_INTEGER +# define POSH_BigU64(x) POSH_SwapU64(x) +# define POSH_BigI64(x) POSH_SwapI64(x) +# endif /* defined POSH_64BIT_INTEGER */ + +#else + +# define POSH_BigU16(x) (x) +# define POSH_BigU32(x) (x) +# define POSH_BigI16(x) (x) +# define POSH_BigI32(x) (x) + +# if defined POSH_64BIT_INTEGER +# define POSH_BigU64(x) (x) +# define POSH_BigI64(x) (x) +# endif /* POSH_64BIT_INTEGER */ + +# define POSH_LittleU16(x) POSH_SwapU16(x) +# define POSH_LittleU32(x) POSH_SwapU32(x) +# define POSH_LittleI16(x) POSH_SwapI16(x) +# define POSH_LittleI32(x) POSH_SwapI32(x) + +# if defined POSH_64BIT_INTEGER +# define POSH_LittleU64(x) POSH_SwapU64(x) +# define POSH_LittleI64(x) POSH_SwapI64(x) +# endif /* POSH_64BIT_INTEGER */ + +#endif + +#ifdef __cplusplus +} +#endif + + diff --git a/3rdparty/nvtt/nvcore/stdstream.h b/3rdparty/nvtt/nvcore/stdstream.h new file mode 100644 index 00000000..4f0a10a4 --- /dev/null +++ b/3rdparty/nvtt/nvcore/stdstream.h @@ -0,0 +1,459 @@ +// This code is in the public domain -- Ignacio Castaņo + +#include "nvcore.h" +#include "stream.h" +#include "array.h" + +#include // fopen +#include // memcpy + +namespace nv +{ + + // Portable version of fopen. + inline FILE * fileOpen(const char * fileName, const char * mode) + { + nvCheck(fileName != NULL); +#if NV_CC_MSVC && _MSC_VER >= 1400 + FILE * fp; + if (fopen_s(&fp, fileName, mode) == 0) { + return fp; + } + return NULL; +#else + return fopen(fileName, mode); +#endif + } + + + /// Base stdio stream. + class NVCORE_CLASS StdStream : public Stream + { + NV_FORBID_COPY(StdStream); + public: + + /// Ctor. + StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { } + + /// Dtor. + virtual ~StdStream() + { + if( m_fp != NULL && m_autoclose ) { +#if NV_OS_WIN32 + _fclose_nolock( m_fp ); +#else + fclose( m_fp ); +#endif + } + } + + + /** @name Stream implementation. 
*/ + //@{ + virtual void seek( uint pos ) + { + nvDebugCheck(m_fp != NULL); + nvDebugCheck(pos <= size()); +#if NV_OS_WIN32 + _fseek_nolock(m_fp, pos, SEEK_SET); +#else + fseek(m_fp, pos, SEEK_SET); +#endif + } + + virtual uint tell() const + { + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + return _ftell_nolock(m_fp); +#else + return (uint)ftell(m_fp); +#endif + } + + virtual uint size() const + { + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + uint pos = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, 0, SEEK_END); + uint end = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, pos, SEEK_SET); +#else + uint pos = (uint)ftell(m_fp); + fseek(m_fp, 0, SEEK_END); + uint end = (uint)ftell(m_fp); + fseek(m_fp, pos, SEEK_SET); +#endif + return end; + } + + virtual bool isError() const + { + return m_fp == NULL || ferror( m_fp ) != 0; + } + + virtual void clearError() + { + nvDebugCheck(m_fp != NULL); + clearerr(m_fp); + } + + // @@ The original implementation uses feof, which only returns true when we attempt to read *past* the end of the stream. + // That is, if we read the last byte of a file, then isAtEnd would still return false, even though the stream pointer is at the file end. This is not the intent and was inconsistent with the implementation of the MemoryStream, a better + // implementation uses use ftell and fseek to determine our location within the file. + virtual bool isAtEnd() const + { + if (m_fp == NULL) return true; + //nvDebugCheck(m_fp != NULL); + //return feof( m_fp ) != 0; +#if NV_OS_WIN32 + uint pos = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, 0, SEEK_END); + uint end = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, pos, SEEK_SET); +#else + uint pos = (uint)ftell(m_fp); + fseek(m_fp, 0, SEEK_END); + uint end = (uint)ftell(m_fp); + fseek(m_fp, pos, SEEK_SET); +#endif + return pos == end; + } + + /// Always true. + virtual bool isSeekable() const { return true; } + //@} + + protected: + + FILE * m_fp; + bool m_autoclose; + + }; + + + /// Standard output stream. + class NVCORE_CLASS StdOutputStream : public StdStream + { + NV_FORBID_COPY(StdOutputStream); + public: + + /// Construct stream by file name. + StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { } + + /// Construct stream by file handle. + StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose) + { + } + + /** @name Stream implementation. */ + //@{ + /// Write data. + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + return (uint)_fwrite_nolock(data, 1, len, m_fp); +#elif NV_OS_LINUX + return (uint)fwrite_unlocked(data, 1, len, m_fp); +#elif NV_OS_DARWIN + // @@ No error checking, always returns len. + for (uint i = 0; i < len; i++) { + putc_unlocked(((char *)data)[i], m_fp); + } + return len; +#else + return (uint)fwrite(data, 1, len, m_fp); +#endif + } + + virtual bool isLoading() const + { + return false; + } + + virtual bool isSaving() const + { + return true; + } + //@} + + }; + + + /// Standard input stream. + class NVCORE_CLASS StdInputStream : public StdStream + { + NV_FORBID_COPY(StdInputStream); + public: + + /// Construct stream by file name. + StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { } + + /// Construct stream by file handle. + StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) + { + } + + /** @name Stream implementation. */ + //@{ + /// Read data. 
+ virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + return (uint)_fread_nolock(data, 1, len, m_fp); +#elif NV_OS_LINUX + return (uint)fread_unlocked(data, 1, len, m_fp); +#elif NV_OS_DARWIN + // @@ No error checking, always returns len. + for (uint i = 0; i < len; i++) { + ((char *)data)[i] = getc_unlocked(m_fp); + } + return len; +#else + return (uint)fread(data, 1, len, m_fp); +#endif + + } + + virtual bool isLoading() const + { + return true; + } + + virtual bool isSaving() const + { + return false; + } + //@} + }; + + + + /// Memory input stream. + class NVCORE_CLASS MemoryInputStream : public Stream + { + NV_FORBID_COPY(MemoryInputStream); + public: + + /// Ctor. + MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { } + + /** @name Stream implementation. */ + //@{ + /// Read data. + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + nvDebugCheck(!isError()); + + uint left = m_size - tell(); + if (len > left) len = left; + + memcpy( data, m_ptr, len ); + m_ptr += len; + + return len; + } + + virtual void seek( uint pos ) + { + nvDebugCheck(!isError()); + m_ptr = m_mem + pos; + nvDebugCheck(!isError()); + } + + virtual uint tell() const + { + nvDebugCheck(m_ptr >= m_mem); + return uint(m_ptr - m_mem); + } + + virtual uint size() const + { + return m_size; + } + + virtual bool isError() const + { + return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem; + } + + virtual void clearError() + { + // Nothing to do. + } + + virtual bool isAtEnd() const + { + return m_ptr == m_mem + m_size; + } + + /// Always true. + virtual bool isSeekable() const + { + return true; + } + + virtual bool isLoading() const + { + return true; + } + + virtual bool isSaving() const + { + return false; + } + //@} + + const uint8 * ptr() const { return m_ptr; } + + + private: + + const uint8 * m_mem; + const uint8 * m_ptr; + uint m_size; + + }; + + + /// Buffer output stream. + class NVCORE_CLASS BufferOutputStream : public Stream + { + NV_FORBID_COPY(BufferOutputStream); + public: + + BufferOutputStream(Array & buffer) : m_buffer(buffer) { } + + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + m_buffer.append((uint8 *)data, len); + return len; + } + + virtual void seek( uint /*pos*/ ) { /*Not implemented*/ } + virtual uint tell() const { return m_buffer.size(); } + virtual uint size() const { return m_buffer.size(); } + + virtual bool isError() const { return false; } + virtual void clearError() {} + + virtual bool isAtEnd() const { return true; } + virtual bool isSeekable() const { return false; } + virtual bool isLoading() const { return false; } + virtual bool isSaving() const { return true; } + + private: + Array & m_buffer; + }; + + + /// Protected input stream. + class NVCORE_CLASS ProtectedStream : public Stream + { + NV_FORBID_COPY(ProtectedStream); + public: + + /// Ctor. + ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false) + { + } + + /// Ctor. + ProtectedStream( Stream * s, bool autodelete = true ) : + m_s(s), m_autodelete(autodelete) + { + nvDebugCheck(m_s != NULL); + } + + /// Dtor. + virtual ~ProtectedStream() + { + if( m_autodelete ) { + delete m_s; + } + } + + /** @name Stream implementation. */ + //@{ + /// Read data. 
+ virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + len = m_s->serialize( data, len ); + + if( m_s->isError() ) { + throw; + } + + return len; + } + + virtual void seek( uint pos ) + { + m_s->seek( pos ); + + if( m_s->isError() ) { + throw; + } + } + + virtual uint tell() const + { + return m_s->tell(); + } + + virtual uint size() const + { + return m_s->size(); + } + + virtual bool isError() const + { + return m_s->isError(); + } + + virtual void clearError() + { + m_s->clearError(); + } + + virtual bool isAtEnd() const + { + return m_s->isAtEnd(); + } + + virtual bool isSeekable() const + { + return m_s->isSeekable(); + } + + virtual bool isLoading() const + { + return m_s->isLoading(); + } + + virtual bool isSaving() const + { + return m_s->isSaving(); + } + //@} + + + private: + + Stream * const m_s; + bool const m_autodelete; + + }; + +} // nv namespace + + +//#endif // NV_CORE_STDSTREAM_H diff --git a/3rdparty/nvtt/nvcore/stream.h b/3rdparty/nvtt/nvcore/stream.h new file mode 100644 index 00000000..9252d9ef --- /dev/null +++ b/3rdparty/nvtt/nvcore/stream.h @@ -0,0 +1,163 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_CORE_STREAM_H +#define NV_CORE_STREAM_H + +#include "nvcore.h" +#include "debug.h" + +namespace nv +{ + + /// Base stream class. + class NVCORE_CLASS Stream { + public: + + enum ByteOrder { + LittleEndian = false, + BigEndian = true, + }; + + /// Get the byte order of the system. + static ByteOrder getSystemByteOrder() { +#if NV_LITTLE_ENDIAN + return LittleEndian; +#else + return BigEndian; +#endif + } + + + /// Ctor. + Stream() : m_byteOrder(LittleEndian) { } + + /// Virtual destructor. + virtual ~Stream() {} + + /// Set byte order. + void setByteOrder(ByteOrder bo) { m_byteOrder = bo; } + + /// Get byte order. + ByteOrder byteOrder() const { return m_byteOrder; } + + + /// Serialize the given data. + virtual uint serialize( void * data, uint len ) = 0; + + /// Move to the given position in the archive. + virtual void seek( uint pos ) = 0; + + /// Return the current position in the archive. + virtual uint tell() const = 0; + + /// Return the current size of the archive. + virtual uint size() const = 0; + + /// Determine if there has been any error. + virtual bool isError() const = 0; + + /// Clear errors. + virtual void clearError() = 0; + + /// Return true if the stream is at the end. + virtual bool isAtEnd() const = 0; + + /// Return true if the stream is seekable. + virtual bool isSeekable() const = 0; + + /// Return true if this is an input stream. + virtual bool isLoading() const = 0; + + /// Return true if this is an output stream. + virtual bool isSaving() const = 0; + + + void advance(uint offset) { seek(tell() + offset); } + + + // friends + friend Stream & operator<<( Stream & s, bool & c ) { +#if NV_OS_DARWIN && !NV_CC_CPP11 + nvStaticCheck(sizeof(bool) == 4); + uint8 b = c ? 
1 : 0; + s.serialize( &b, 1 ); + c = (b == 1); +#else + nvStaticCheck(sizeof(bool) == 1); + s.serialize( &c, 1 ); +#endif + return s; + } + friend Stream & operator<<( Stream & s, char & c ) { + nvStaticCheck(sizeof(char) == 1); + s.serialize( &c, 1 ); + return s; + } + friend Stream & operator<<( Stream & s, uint8 & c ) { + nvStaticCheck(sizeof(uint8) == 1); + s.serialize( &c, 1 ); + return s; + } + friend Stream & operator<<( Stream & s, int8 & c ) { + nvStaticCheck(sizeof(int8) == 1); + s.serialize( &c, 1 ); + return s; + } + friend Stream & operator<<( Stream & s, uint16 & c ) { + nvStaticCheck(sizeof(uint16) == 2); + return s.byteOrderSerialize( &c, 2 ); + } + friend Stream & operator<<( Stream & s, int16 & c ) { + nvStaticCheck(sizeof(int16) == 2); + return s.byteOrderSerialize( &c, 2 ); + } + friend Stream & operator<<( Stream & s, uint32 & c ) { + nvStaticCheck(sizeof(uint32) == 4); + return s.byteOrderSerialize( &c, 4 ); + } + friend Stream & operator<<( Stream & s, int32 & c ) { + nvStaticCheck(sizeof(int32) == 4); + return s.byteOrderSerialize( &c, 4 ); + } + friend Stream & operator<<( Stream & s, uint64 & c ) { + nvStaticCheck(sizeof(uint64) == 8); + return s.byteOrderSerialize( &c, 8 ); + } + friend Stream & operator<<( Stream & s, int64 & c ) { + nvStaticCheck(sizeof(int64) == 8); + return s.byteOrderSerialize( &c, 8 ); + } + friend Stream & operator<<( Stream & s, float & c ) { + nvStaticCheck(sizeof(float) == 4); + return s.byteOrderSerialize( &c, 4 ); + } + friend Stream & operator<<( Stream & s, double & c ) { + nvStaticCheck(sizeof(double) == 8); + return s.byteOrderSerialize( &c, 8 ); + } + + protected: + + /// Serialize in the stream byte order. + Stream & byteOrderSerialize( void * v, uint len ) { + if( m_byteOrder == getSystemByteOrder() ) { + serialize( v, len ); + } + else { + for( uint i = len; i > 0; i-- ) { + serialize( (uint8 *)v + i - 1, 1 ); + } + } + return *this; + } + + + private: + + ByteOrder m_byteOrder; + + }; + +} // nv namespace + +#endif // NV_CORE_STREAM_H diff --git a/3rdparty/nvtt/nvcore/strlib.h b/3rdparty/nvtt/nvcore/strlib.h new file mode 100644 index 00000000..80a957cb --- /dev/null +++ b/3rdparty/nvtt/nvcore/strlib.h @@ -0,0 +1,429 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_CORE_STRING_H +#define NV_CORE_STRING_H + +#include "debug.h" +#include "hash.h" // hash + +//#include // strlen, etc. + +#if NV_OS_WIN32 +#define NV_PATH_SEPARATOR '\\' +#else +#define NV_PATH_SEPARATOR '/' +#endif + +namespace nv +{ + + NVCORE_API uint strHash(const char * str, uint h) NV_PURE; + + /// String hash based on Bernstein's hash. + inline uint strHash(const char * data, uint h = 5381) + { + uint i = 0; + while(data[i] != 0) { + h = (33 * h) ^ uint(data[i]); + i++; + } + return h; + } + + template <> struct Hash { + uint operator()(const char * str) const { return strHash(str); } + }; + + NVCORE_API uint strLen(const char * str) NV_PURE; // Asserts on NULL strings. + + NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE; // Asserts on NULL strings. + NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE; // Asserts on NULL strings. + NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. + NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. 
+
+    template <> struct Equal<const char *> {
+        bool operator()(const char * a, const char * b) const { return strEqual(a, b); }
+    };
+
+    NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE;
+    NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE;
+
+
+    NVCORE_API void strCpy(char * dst, uint size, const char * src);
+    NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len);
+    NVCORE_API void strCat(char * dst, uint size, const char * src);
+
+    NVCORE_API const char * strSkipWhiteSpace(const char * str);
+    NVCORE_API char * strSkipWhiteSpace(char * str);
+
+    NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+    NVCORE_API bool isNumber(const char * str) NV_PURE;
+
+    /* @@ Implement these two functions and modify StringBuilder to use them?
+    NVCORE_API void strFormat(const char * dst, const char * fmt, ...);
+    NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg);
+
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3)));
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        strFormatList(buffer, count, fmt, args);
+        va_end(args);
+    }
+    template <size_t count> void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) {
+        va_list tmp;
+        va_copy(tmp, args);
+        strFormatList(buffer, count, fmt, tmp);
+        va_end(tmp);
+    }*/
+
+    template <size_t count> void strCpySafe(char (&buffer)[count], const char *src) {
+        strCpy(buffer, count, src);
+    }
+
+    template <size_t count> void strCatSafe(char (&buffer)[count], const char * src) {
+        strCat(buffer, count, src);
+    }
+
+
+
+    /// String builder.
+    class NVCORE_CLASS StringBuilder
+    {
+    public:
+
+        StringBuilder();
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);
+
+        ~StringBuilder();
+
+        StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+        StringBuilder & formatList( const char * format, va_list arg );
+
+        StringBuilder & append(const char * str);
+        StringBuilder & append(const char * str, uint len);
+        StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
+        StringBuilder & appendFormatList(const char * format, va_list arg);
+
+        StringBuilder & appendSpace(uint n);
+
+        StringBuilder & number( int i, int base = 10 );
+        StringBuilder & number( uint i, int base = 10 );
+
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);
+
+        StringBuilder & toLower();
+        StringBuilder & toUpper();
+
+        bool endsWith(const char * str) const;
+        bool beginsWith(const char * str) const;
+
+        char * reverseFind(char c);
+
+        void reset();
+        bool isNull() const { return m_size == 0; }
+
+        // const char * accessors
+        //operator const char * () const { return m_str; }
+        //operator char * () { return m_str; }
+        const char * str() const { return m_str; }
+        char * str() { return m_str; }
+
+        char * release();
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const StringBuilder & s ) {
+            return copy(s);
+        }
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const char * s ) {
+            return copy(s);
+        }
+
+        /// Equal operator.
+        bool operator==( const StringBuilder & s ) const {
+            return strMatch(s.m_str, m_str);
+        }
+
+        /// Return the exact length.
+ uint length() const { return isNull() ? 0 : strLen(m_str); } + + /// Return the size of the string container. + uint capacity() const { return m_size; } + + /// Return the hash of the string. + uint hash() const { return isNull() ? 0 : strHash(m_str); } + + // Swap strings. + friend void swap(StringBuilder & a, StringBuilder & b); + + protected: + + /// Size of the string container. + uint m_size; + + /// String. + char * m_str; + + }; + + + /// Path string. @@ This should be called PathBuilder. + class NVCORE_CLASS Path : public StringBuilder + { + public: + Path() : StringBuilder() {} + explicit Path(int size_hint) : StringBuilder(size_hint) {} + Path(const char * str) : StringBuilder(str) {} + Path(const Path & path) : StringBuilder(path) {} + + const char * fileName() const; + const char * extension() const; + + void translatePath(char pathSeparator = NV_PATH_SEPARATOR); + + void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR); + + void stripFileName(); + void stripExtension(); + + // statics + NVCORE_API static char separator(); + NVCORE_API static const char * fileName(const char *); + NVCORE_API static const char * extension(const char *); + + NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR); + }; + + + /// String class. + class NVCORE_CLASS String + { + public: + + /// Constructs a null string. @sa isNull() + String() + { + data = NULL; + } + + /// Constructs a shared copy of str. + String(const String & str) + { + data = str.data; + if (data != NULL) addRef(); + } + + /// Constructs a shared string from a standard string. + String(const char * str) + { + setString(str); + } + + /// Constructs a shared string from a standard string. + String(const char * str, int length) + { + setString(str, length); + } + + /// Constructs a shared string from a StringBuilder. + String(const StringBuilder & str) + { + setString(str); + } + + /// Dtor. + ~String() + { + release(); + } + + String clone() const; + + /// Release the current string and allocate a new one. + const String & operator=( const char * str ) + { + release(); + setString( str ); + return *this; + } + + /// Release the current string and allocate a new one. + const String & operator=( const StringBuilder & str ) + { + release(); + setString( str ); + return *this; + } + + /// Implement value semantics. + String & operator=( const String & str ) + { + if (str.data != data) + { + release(); + data = str.data; + addRef(); + } + return *this; + } + + /// Equal operator. + bool operator==( const String & str ) const + { + return strMatch(str.data, data); + } + + /// Equal operator. + bool operator==( const char * str ) const + { + return strMatch(str, data); + } + + /// Not equal operator. + bool operator!=( const String & str ) const + { + return !strMatch(str.data, data); + } + + /// Not equal operator. + bool operator!=( const char * str ) const + { + return !strMatch(str, data); + } + + /// Returns true if this string is the null string. + bool isNull() const { return data == NULL; } + + /// Return the exact length. + uint length() const { nvDebugCheck(data != NULL); return strLen(data); } + + /// Return the hash of the string. + uint hash() const { nvDebugCheck(data != NULL); return strHash(data); } + + /// const char * cast operator. + operator const char * () const { return data; } + + /// Get string pointer. + const char * str() const { return data; } + + + private: + + // Add reference count. + void addRef(); + + // Decrease reference count. 
+        void release();
+
+        uint16 getRefCount() const
+        {
+            nvDebugCheck(data != NULL);
+            return *reinterpret_cast<const uint16 *>(data - 2);
+        }
+
+        void setRefCount(uint16 count) {
+            nvDebugCheck(data != NULL);
+            nvCheck(count < 0xFFFF);
+            *reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+        }
+
+        void setData(const char * str) {
+            data = str + 2;
+        }
+
+        void allocString(const char * str)
+        {
+            allocString(str, strLen(str));
+        }
+
+        void allocString(const char * str, uint length);
+
+        void setString(const char * str);
+        void setString(const char * str, uint length);
+        void setString(const StringBuilder & str);
+
+        // Swap strings.
+        friend void swap(String & a, String & b);
+
+    private:
+
+        const char * data;
+
+    };
+
+    template <> struct Hash<String> {
+        uint operator()(const String & str) const { return str.hash(); }
+    };
+
+
+    // Like AutoPtr, but for const char strings.
+    class AutoString
+    {
+        NV_FORBID_COPY(AutoString);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        // Ctor.
+        AutoString(const char * p = NULL) : m_ptr(p) { }
+
+#if NV_CC_CPP11
+        // Move ctor.
+        AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; }
+#endif
+
+        // Dtor. Deletes owned pointer.
+        ~AutoString() {
+            delete [] m_ptr;
+            m_ptr = NULL;
+        }
+
+        // Delete owned pointer and assign new one.
+        void operator=(const char * p) {
+            if (p != m_ptr)
+            {
+                delete [] m_ptr;
+                m_ptr = p;
+            }
+        }
+
+        // Get pointer.
+        const char * ptr() const { return m_ptr; }
+        operator const char *() const { return m_ptr; }
+
+        // Relinquish ownership of the underlying pointer and return that pointer.
+        const char * release() {
+            const char * tmp = m_ptr;
+            m_ptr = NULL;
+            return tmp;
+        }
+
+        // comparison operators.
+        friend bool operator == (const AutoString & ap, const char * const p) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const AutoString & ap, const char * const p) {
+            return (ap.ptr() != p);
+        }
+        friend bool operator == (const char * const p, const AutoString & ap) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const char * const p, const AutoString & ap) {
+            return (ap.ptr() != p);
+        }
+
+    private:
+        const char * m_ptr;
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H
diff --git a/3rdparty/nvtt/nvcore/utils.h b/3rdparty/nvtt/nvcore/utils.h
new file mode 100644
index 00000000..364b6292
--- /dev/null
+++ b/3rdparty/nvtt/nvcore/utils.h
@@ -0,0 +1,281 @@
+// This code is in the public domain -- Ignacio Castaño
+
+#ifndef NV_CORE_UTILS_H
+#define NV_CORE_UTILS_H
+
+#include "debug.h" // nvdebugcheck
+
+#include <new> // for placement new
+
+
+// Just in case. Grrr.
+#undef min
+#undef max
+
+#define NV_INT8_MIN (-128)
+#define NV_INT8_MAX 127
+#define NV_UINT8_MAX 255
+#define NV_INT16_MIN (-32767-1)
+#define NV_INT16_MAX 32767
+#define NV_UINT16_MAX 0xffff
+#define NV_INT32_MIN (-2147483647-1)
+#define NV_INT32_MAX 2147483647
+#define NV_UINT32_MAX 0xffffffff
+#define NV_INT64_MAX POSH_I64(9223372036854775807)
+#define NV_INT64_MIN (-POSH_I64(9223372036854775807)-1)
+#define NV_UINT64_MAX POSH_U64(0xffffffffffffffff)
+
+#define NV_HALF_MAX 65504.0F
+#define NV_FLOAT_MAX 3.402823466e+38F
+
+#define NV_INTEGER_TO_FLOAT_MAX 16777217 // Largest integer such that it and all smaller integers can be stored in a 32bit float.
+
+
+namespace nv
+{
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+
+    // These intentionally look like casts.
+
+    // uint32 casts:
+    template <typename T> inline uint32 U32(T x) { return x; }
+    template <> inline uint32 U32(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
+    template <> inline uint32 U32(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; }
+    //template <> inline uint32 U32(uint32 x) { return x; }
+    template <> inline uint32 U32(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32(uint16 x) { return x; }
+    template <> inline uint32 U32(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32(uint8 x) { return x; }
+    template <> inline uint32 U32(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+
+    // int32 casts:
+    template <typename T> inline int32 I32(T x) { return x; }
+    template <> inline int32 I32(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_UINT32_MAX); return (int32)x; }
+    template <> inline int32 I32(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    //template <> inline int32 I32(int32 x) { return x; }
+    //template <> inline int32 I32(uint16 x) { return x; }
+    //template <> inline int32 I32(int16 x) { return x; }
+    //template <> inline int32 I32(uint8 x) { return x; }
+    //template <> inline int32 I32(int8 x) { return x; }
+
+    // uint16 casts:
+    template <typename T> inline uint16 U16(T x) { return x; }
+    template <> inline uint16 U16(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    //template <> inline uint16 U16(uint16 x) { return x; }
+    template <> inline uint16 U16(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+    //template <> inline uint16 U16(uint8 x) { return x; }
+    template <> inline uint16 U16(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+
+    // int16 casts:
+    template <typename T> inline int16 I16(T x) { return x; }
+    template <> inline int16 I16(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; }
+    template <> inline int16 I16(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; }
+    template <> inline int16 I16(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    //template <> inline int16 I16(int16 x) { return x; }
+    //template <> inline int16 I16(uint8 x) { return x; }
+    //template <> inline int16 I16(int8 x) { return x; }
+
+    // uint8 casts:
+    template <typename T> inline uint8 U8(T x) { return x; }
+    template <> inline uint8 U8(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    //template <> inline uint8 U8(uint8 x) { return x; }
+    template <> inline uint8 U8(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; }
+    //template <> inline uint8 U8(int8 x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; }
+
+    // int8 casts:
+    template <typename T> inline int8 I8(T x) { return x; }
+    template <> inline int8 I8(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    //template <> inline int8 I8(int8 x) { return x; }
+
+    // float casts:
+    template <typename T> inline float F32(T x) { return x; }
+    template <> inline float F32(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    // The compiler should not complain about these conversions:
+    //template <> inline float F32(uint16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32(int16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32(uint8 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32(int8 x) { nvDebugCheck(return (float)x; }
+
+
+    /// Swap two values.
+    template <typename T>
+    inline void swap(T & a, T & b)
+    {
+        T temp(a);
+        a = b;
+        b = temp;
+    }
+
+    /// Return the maximum of the two arguments. For floating point values, it returns the second value if the first is NaN.
+    template <typename T>
+    //inline const T & max(const T & a, const T & b)
+    inline T max(const T & a, const T & b)
+    {
+        return (b < a) ? a : b;
+    }
+
+    /// Return the maximum of the four arguments.
+    template <typename T>
+    //inline const T & max4(const T & a, const T & b, const T & c)
+    inline T max4(const T & a, const T & b, const T & c, const T & d)
+    {
+        return max(max(a, b), max(c, d));
+    }
+
+    /// Return the maximum of the three arguments.
+    template <typename T>
+    //inline const T & max3(const T & a, const T & b, const T & c)
+    inline T max3(const T & a, const T & b, const T & c)
+    {
+        return max(a, max(b, c));
+    }
+
+    /// Return the minimum of two values.
+    template <typename T>
+    //inline const T & min(const T & a, const T & b)
+    inline T min(const T & a, const T & b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    /// Return the minimum of the three arguments.
+    template <typename T>
+    //inline const T & min3(const T & a, const T & b, const T & c)
+    inline T min3(const T & a, const T & b, const T & c)
+    {
+        return min(a, min(b, c));
+    }
+
+    /// Clamp between two values.
+    template <typename T>
+    //inline const T & clamp(const T & x, const T & a, const T & b)
+    inline T clamp(const T & x, const T & a, const T & b)
+    {
+        return min(max(x, a), b);
+    }
+
+    /** Return the next power of two.
+    * @see http://graphics.stanford.edu/~seander/bithacks.html
+    * @warning Behaviour for 0 is undefined.
+    * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
+    * @note nextPowerOfTwo(x) = 2 << log2(x-1)
+    */
+    inline uint nextPowerOfTwo( uint x )
+    {
+        nvDebugCheck( x != 0 );
+#if 1 // On modern CPUs this is supposed to be as fast as using the bsr instruction.
+        x--;
+        x |= x >> 1;
+        x |= x >> 2;
+        x |= x >> 4;
+        x |= x >> 8;
+        x |= x >> 16;
+        return x+1;
+#else
+        uint p = 1;
+        while( x > p ) {
+            p += p;
+        }
+        return p;
+#endif
+    }
+
+    /// Return true if @a n is a power of two.
+    inline bool isPowerOfTwo( uint n )
+    {
+        return (n & (n-1)) == 0;
+    }
+
+
+    // @@ Move this to utils?
+    /// Delete all the elements of a container.
+    template <typename T>
+    void deleteAll(T & container)
+    {
+        for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
+        {
+            delete container[i];
+        }
+    }
+
+
+
+    // @@ Specialize these methods for numeric, pointer, and pod types.
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T; // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(elem); // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(src[i]); // placement new
+        }
+    }
+
+    template <typename T>
+    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = new_size; i < old_size; i++) {
+            (ptr+i)->~T(); // Explicit call to the destructor
+        }
+    }
+
+    template <typename T>
+    void fill(T * restrict dst, uint count, const T & value) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = value;
+        }
+    }
+
+    template <typename T>
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = src[i];
+        }
+    }
+
+    template <typename T>
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        for (uint i = begin; i < end; i++) {
+            if (ptr[i] == element) {
+                if (index != NULL) *index = i;
+                return true;
+            }
+        }
+        return false;
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_UTILS_H
diff --git a/3rdparty/nvtt/nvmath/Vector.inl b/3rdparty/nvtt/nvmath/Vector.inl
new file mode 100644
index 00000000..8f1da1ec
--- /dev/null
+++ b/3rdparty/nvtt/nvmath/Vector.inl
@@ -0,0 +1,921 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "vector.h"
+#include "nvcore/utils.h" // min, max
+#include "nvcore/hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+ //template T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); } + + + // Vector2 + inline Vector2::Vector2() {} + inline Vector2::Vector2(float f) : x(f), y(f) {} + inline Vector2::Vector2(float x, float y) : x(x), y(y) {} + inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {} + + inline const Vector2 & Vector2::operator=(Vector2::Arg v) + { + x = v.x; + y = v.y; + return *this; + } + + inline const float * Vector2::ptr() const + { + return &x; + } + + inline void Vector2::set(float x, float y) + { + this->x = x; + this->y = y; + } + + inline Vector2 Vector2::operator-() const + { + return Vector2(-x, -y); + } + + inline void Vector2::operator+=(Vector2::Arg v) + { + x += v.x; + y += v.y; + } + + inline void Vector2::operator-=(Vector2::Arg v) + { + x -= v.x; + y -= v.y; + } + + inline void Vector2::operator*=(float s) + { + x *= s; + y *= s; + } + + inline void Vector2::operator*=(Vector2::Arg v) + { + x *= v.x; + y *= v.y; + } + + inline bool operator==(Vector2::Arg a, Vector2::Arg b) + { + return a.x == b.x && a.y == b.y; + } + inline bool operator!=(Vector2::Arg a, Vector2::Arg b) + { + return a.x != b.x || a.y != b.y; + } + + + // Vector3 + inline Vector3::Vector3() {} + inline Vector3::Vector3(float f) : x(f), y(f), z(f) {} + inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {} + inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {} + inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {} + + inline const Vector3 & Vector3::operator=(Vector3::Arg v) + { + x = v.x; + y = v.y; + z = v.z; + return *this; + } + + + inline Vector2 Vector3::xy() const + { + return Vector2(x, y); + } + + inline const float * Vector3::ptr() const + { + return &x; + } + + inline void Vector3::set(float x, float y, float z) + { + this->x = x; + this->y = y; + this->z = z; + } + + inline Vector3 Vector3::operator-() const + { + return Vector3(-x, -y, -z); + } + + inline void Vector3::operator+=(Vector3::Arg v) + { + x += v.x; + y += v.y; + z += v.z; + } + + inline void Vector3::operator-=(Vector3::Arg v) + { + x -= v.x; + y -= v.y; + z -= v.z; + } + + inline void Vector3::operator*=(float s) + { + x *= s; + y *= s; + z *= s; + } + + inline void Vector3::operator/=(float s) + { + float is = 1.0f / s; + x *= is; + y *= is; + z *= is; + } + + inline void Vector3::operator*=(Vector3::Arg v) + { + x *= v.x; + y *= v.y; + z *= v.z; + } + + inline void Vector3::operator/=(Vector3::Arg v) + { + x /= v.x; + y /= v.y; + z /= v.z; + } + + inline bool operator==(Vector3::Arg a, Vector3::Arg b) + { + return a.x == b.x && a.y == b.y && a.z == b.z; + } + inline bool operator!=(Vector3::Arg a, Vector3::Arg b) + { + return a.x != b.x || a.y != b.y || a.z != b.z; + } + + + // Vector4 + inline Vector4::Vector4() {} + inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {} + inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} + inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {} + inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {} + inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {} + inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {} + + inline const Vector4 & Vector4::operator=(const Vector4 & v) + { + x = v.x; + y = v.y; + z = v.z; + w = v.w; + return *this; + } + + inline Vector2 Vector4::xy() const + { + return Vector2(x, y); + } + + inline Vector2 Vector4::zw() const + { + return 
Vector2(z, w); + } + + inline Vector3 Vector4::xyz() const + { + return Vector3(x, y, z); + } + + inline const float * Vector4::ptr() const + { + return &x; + } + + inline void Vector4::set(float x, float y, float z, float w) + { + this->x = x; + this->y = y; + this->z = z; + this->w = w; + } + + inline Vector4 Vector4::operator-() const + { + return Vector4(-x, -y, -z, -w); + } + + inline void Vector4::operator+=(Vector4::Arg v) + { + x += v.x; + y += v.y; + z += v.z; + w += v.w; + } + + inline void Vector4::operator-=(Vector4::Arg v) + { + x -= v.x; + y -= v.y; + z -= v.z; + w -= v.w; + } + + inline void Vector4::operator*=(float s) + { + x *= s; + y *= s; + z *= s; + w *= s; + } + + inline void Vector4::operator/=(float s) + { + x /= s; + y /= s; + z /= s; + w /= s; + } + + inline void Vector4::operator*=(Vector4::Arg v) + { + x *= v.x; + y *= v.y; + z *= v.z; + w *= v.w; + } + + inline void Vector4::operator/=(Vector4::Arg v) + { + x /= v.x; + y /= v.y; + z /= v.z; + w /= v.w; + } + + inline bool operator==(Vector4::Arg a, Vector4::Arg b) + { + return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; + } + inline bool operator!=(Vector4::Arg a, Vector4::Arg b) + { + return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; + } + + + + // Functions + + + // Vector2 + + inline Vector2 add(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(a.x + b.x, a.y + b.y); + } + inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b) + { + return add(a, b); + } + + inline Vector2 sub(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(a.x - b.x, a.y - b.y); + } + inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b) + { + return sub(a, b); + } + + inline Vector2 scale(Vector2::Arg v, float s) + { + return Vector2(v.x * s, v.y * s); + } + + inline Vector2 scale(Vector2::Arg v, Vector2::Arg s) + { + return Vector2(v.x * s.x, v.y * s.y); + } + + inline Vector2 operator*(Vector2::Arg v, float s) + { + return scale(v, s); + } + + inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2) + { + return Vector2(v1.x*v2.x, v1.y*v2.y); + } + + inline Vector2 operator*(float s, Vector2::Arg v) + { + return scale(v, s); + } + + inline Vector2 operator/(Vector2::Arg v, float s) + { + return scale(v, 1.0f/s); + } + + inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y); + } + + inline float dot(Vector2::Arg a, Vector2::Arg b) + { + return a.x * b.x + a.y * b.y; + } + + inline float lengthSquared(Vector2::Arg v) + { + return v.x * v.x + v.y * v.y; + } + + inline float length(Vector2::Arg v) + { + return sqrtf(lengthSquared(v)); + } + + inline float distance(Vector2::Arg a, Vector2::Arg b) + { + return length(a - b); + } + + inline float inverseLength(Vector2::Arg v) + { + return 1.0f / sqrtf(lengthSquared(v)); + } + + inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON) + { + return equal(length(v), 1, epsilon); + } + + inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON) + { + float l = length(v); + NV_UNUSED(epsilon); + nvDebugCheck(!isZero(l, epsilon)); + Vector2 n = scale(v, 1.0f / l); + nvDebugCheck(isNormalized(n)); + return n; + } + + inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON) + { + float l = length(v); + if (isZero(l, epsilon)) { + return fallback; + } + return scale(v, 1.0f / l); + } + + // Safe, branchless normalization from Andy Firth. All error checking ommitted. 
+ // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector2 normalizeFast(Vector2::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + + inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON) + { + return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon); + } + + inline Vector2 min(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(min(a.x, b.x), min(a.y, b.y)); + } + + inline Vector2 max(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(max(a.x, b.x), max(a.y, b.y)); + } + + inline Vector2 clamp(Vector2::Arg v, float min, float max) + { + return Vector2(clamp(v.x, min, max), clamp(v.y, min, max)); + } + + inline Vector2 saturate(Vector2::Arg v) + { + return Vector2(saturate(v.x), saturate(v.y)); + } + + inline bool isFinite(Vector2::Arg v) + { + return isFinite(v.x) && isFinite(v.y); + } + + inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f)) + { + if (!isFinite(v)) return fallback; + Vector2 vf = v; + nv::floatCleanup(vf.component, 2); + return vf; + } + + // Note, this is the area scaled by 2! + inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1) + { + return (v0.x * v1.y - v0.y * v1.x); // * 0.5f; + } + inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) + { + // IC: While it may be appealing to use the following expression: + //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f; + + // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point + // numbers and the results becomes very unstable and dependent on the order of the factors. + + // Instead, it's preferable to substract the vertices first, and multiply the resulting small values together. The result + // in this case is always much more accurate (as long as the triangle is small) and less dependent of the location of + // the triangle. 
+ + //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f; + return triangleArea(a-c, b-c); + } + + + template <> + inline uint hash(const Vector2 & v, uint h) + { + return sdbmFloatHash(v.component, 2, h); + } + + + + // Vector3 + + inline Vector3 add(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(a.x + b.x, a.y + b.y, a.z + b.z); + } + inline Vector3 add(Vector3::Arg a, float b) + { + return Vector3(a.x + b, a.y + b, a.z + b); + } + inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b) + { + return add(a, b); + } + inline Vector3 operator+(Vector3::Arg a, float b) + { + return add(a, b); + } + + inline Vector3 sub(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(a.x - b.x, a.y - b.y, a.z - b.z); + } + inline Vector3 sub(Vector3::Arg a, float b) + { + return Vector3(a.x - b, a.y - b, a.z - b); + } + inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b) + { + return sub(a, b); + } + inline Vector3 operator-(Vector3::Arg a, float b) + { + return sub(a, b); + } + + inline Vector3 cross(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); + } + + inline Vector3 scale(Vector3::Arg v, float s) + { + return Vector3(v.x * s, v.y * s, v.z * s); + } + + inline Vector3 scale(Vector3::Arg v, Vector3::Arg s) + { + return Vector3(v.x * s.x, v.y * s.y, v.z * s.z); + } + + inline Vector3 operator*(Vector3::Arg v, float s) + { + return scale(v, s); + } + + inline Vector3 operator*(float s, Vector3::Arg v) + { + return scale(v, s); + } + + inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s) + { + return scale(v, s); + } + + inline Vector3 operator/(Vector3::Arg v, float s) + { + return scale(v, 1.0f/s); + } + + /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s) + { + return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s); + }*/ + + inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z); + } + + inline float dot(Vector3::Arg a, Vector3::Arg b) + { + return a.x * b.x + a.y * b.y + a.z * b.z; + } + + inline float lengthSquared(Vector3::Arg v) + { + return v.x * v.x + v.y * v.y + v.z * v.z; + } + + inline float length(Vector3::Arg v) + { + return sqrtf(lengthSquared(v)); + } + + inline float distance(Vector3::Arg a, Vector3::Arg b) + { + return length(a - b); + } + + inline float distanceSquared(Vector3::Arg a, Vector3::Arg b) + { + return lengthSquared(a - b); + } + + inline float inverseLength(Vector3::Arg v) + { + return 1.0f / sqrtf(lengthSquared(v)); + } + + inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON) + { + return equal(length(v), 1, epsilon); + } + + inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON) + { + float l = length(v); + NV_UNUSED(epsilon); + nvDebugCheck(!isZero(l, epsilon)); + Vector3 n = scale(v, 1.0f / l); + nvDebugCheck(isNormalized(n)); + return n; + } + + inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON) + { + float l = length(v); + if (isZero(l, epsilon)) { + return fallback; + } + return scale(v, 1.0f / l); + } + + // Safe, branchless normalization from Andy Firth. All error checking ommitted. 
+ // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector3 normalizeFast(Vector3::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + + inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON) + { + return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon); + } + + inline Vector3 min(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); + } + + inline Vector3 max(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); + } + + inline Vector3 clamp(Vector3::Arg v, float min, float max) + { + return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max)); + } + + inline Vector3 saturate(Vector3::Arg v) + { + return Vector3(saturate(v.x), saturate(v.y), saturate(v.z)); + } + + inline Vector3 floor(Vector3::Arg v) + { + return Vector3(floorf(v.x), floorf(v.y), floorf(v.z)); + } + + inline Vector3 ceil(Vector3::Arg v) + { + return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z)); + } + + inline bool isFinite(Vector3::Arg v) + { + return isFinite(v.x) && isFinite(v.y) && isFinite(v.z); + } + + inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f)) + { + if (!isFinite(v)) return fallback; + Vector3 vf = v; + nv::floatCleanup(vf.component, 3); + return vf; + } + + inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n) + { + return v - (2 * dot(v, n)) * n; + } + + template <> + inline uint hash(const Vector3 & v, uint h) + { + return sdbmFloatHash(v.component, 3, h); + } + + + // Vector4 + + inline Vector4 add(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b) + { + return add(a, b); + } + + inline Vector4 sub(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); + } + inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b) + { + return sub(a, b); + } + + inline Vector4 scale(Vector4::Arg v, float s) + { + return Vector4(v.x * s, v.y * s, v.z * s, v.w * s); + } + + inline Vector4 scale(Vector4::Arg v, Vector4::Arg s) + { + return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w); + } + + inline Vector4 operator*(Vector4::Arg v, float s) + { + return scale(v, s); + } + + inline Vector4 operator*(float s, Vector4::Arg v) + { + return scale(v, s); + } + + inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s) + { + return scale(v, s); + } + + inline Vector4 operator/(Vector4::Arg v, float s) + { + return scale(v, 1.0f/s); + } + + /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s) + { + return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s); + }*/ + + inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w); + } + + inline float dot(Vector4::Arg a, Vector4::Arg b) + { + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; + } + + inline float lengthSquared(Vector4::Arg v) + { + return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; + } + + inline float length(Vector4::Arg v) + { + return sqrtf(lengthSquared(v)); + } + + inline float inverseLength(Vector4::Arg v) + { + return 1.0f / sqrtf(lengthSquared(v)); + } + + inline bool isNormalized(Vector4::Arg v, float epsilon = 
NV_NORMAL_EPSILON) + { + return equal(length(v), 1, epsilon); + } + + inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON) + { + float l = length(v); + NV_UNUSED(epsilon); + nvDebugCheck(!isZero(l, epsilon)); + Vector4 n = scale(v, 1.0f / l); + nvDebugCheck(isNormalized(n)); + return n; + } + + inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON) + { + float l = length(v); + if (isZero(l, epsilon)) { + return fallback; + } + return scale(v, 1.0f / l); + } + + // Safe, branchless normalization from Andy Firth. All error checking ommitted. + // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector4 normalizeFast(Vector4::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + + inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON) + { + return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon); + } + + inline Vector4 min(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); + } + + inline Vector4 max(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); + } + + inline Vector4 clamp(Vector4::Arg v, float min, float max) + { + return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max)); + } + + inline Vector4 saturate(Vector4::Arg v) + { + return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w)); + } + + inline bool isFinite(Vector4::Arg v) + { + return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w); + } + + inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f)) + { + if (!isFinite(v)) return fallback; + Vector4 vf = v; + nv::floatCleanup(vf.component, 4); + return vf; + } + + template <> + inline uint hash(const Vector4 & v, uint h) + { + return sdbmFloatHash(v.component, 4, h); + } + + +#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float + + //int: + + inline Vector2 scale(Vector2::Arg v, int s) + { + return Vector2(v.x * s, v.y * s); + } + + inline Vector2 operator*(Vector2::Arg v, int s) + { + return scale(v, s); + } + + inline Vector2 operator*(int s, Vector2::Arg v) + { + return scale(v, s); + } + + inline Vector2 operator/(Vector2::Arg v, int s) + { + return scale(v, 1.0f/s); + } + + inline Vector3 scale(Vector3::Arg v, int s) + { + return Vector3(v.x * s, v.y * s, v.z * s); + } + + inline Vector3 operator*(Vector3::Arg v, int s) + { + return scale(v, s); + } + + inline Vector3 operator*(int s, Vector3::Arg v) + { + return scale(v, s); + } + + inline Vector3 operator/(Vector3::Arg v, int s) + { + return scale(v, 1.0f/s); + } + + inline Vector4 scale(Vector4::Arg v, int s) + { + return Vector4(v.x * s, v.y * s, v.z * s, v.w * s); + } + + inline Vector4 operator*(Vector4::Arg v, int s) + { + return scale(v, s); + } + + inline Vector4 operator*(int s, Vector4::Arg v) + { + return scale(v, s); + } + + inline Vector4 operator/(Vector4::Arg v, int s) + { + return scale(v, 1.0f/s); + } + + //double: + + inline Vector3 operator*(Vector3::Arg v, double s) + { + return scale(v, (float)s); + } + + inline Vector3 operator*(double s, Vector3::Arg v) + { + return scale(v, (float)s); + } + + inline Vector3 operator/(Vector3::Arg v, double s) + { + return scale(v, 1.f/((float)s)); + } + +#endif 
//NV_OS_IOS + +} // nv namespace + +#endif // NV_MATH_VECTOR_INL diff --git a/3rdparty/nvtt/nvmath/fitting.cpp b/3rdparty/nvtt/nvmath/fitting.cpp new file mode 100644 index 00000000..ba01b1fc --- /dev/null +++ b/3rdparty/nvtt/nvmath/fitting.cpp @@ -0,0 +1,1200 @@ +// This code is in the public domain -- Ignacio CastaÃąo + +#include "fitting.h" +#include "vector.inl" +#include "plane.inl" +#include "matrix.inl" + +#include "nvcore/array.inl" +#include "nvcore/utils.h" // max, swap + +using namespace nv; + +// @@ Move to EigenSolver.h + +// @@ We should be able to do something cheaper... +static Vector3 estimatePrincipalComponent(const float * __restrict matrix) +{ + const Vector3 row0(matrix[0], matrix[1], matrix[2]); + const Vector3 row1(matrix[1], matrix[3], matrix[4]); + const Vector3 row2(matrix[2], matrix[4], matrix[5]); + + float r0 = lengthSquared(row0); + float r1 = lengthSquared(row1); + float r2 = lengthSquared(row2); + + if (r0 > r1 && r0 > r2) return row0; + if (r1 > r2) return row1; + return row2; +} + + +static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + return Vector3(0.0f); + } + + Vector3 v = estimatePrincipalComponent(matrix); + + const int NUM = 8; + for (int i = 0; i < NUM; i++) + { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + + float norm = max(max(x, y), z); + + v = Vector3(x, y, z) / norm; + } + + return v; +} + + +Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points) +{ + Vector3 centroid(0.0f); + + for (int i = 0; i < n; i++) + { + centroid += points[i]; + } + centroid /= float(n); + + return centroid; +} + +Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + Vector3 centroid(0.0f); + float total = 0.0f; + + for (int i = 0; i < n; i++) + { + total += weights[i]; + centroid += weights[i]*points[i]; + } + centroid /= total; + + return centroid; +} + +Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points) +{ + Vector4 centroid(0.0f); + + for (int i = 0; i < n; i++) + { + centroid += points[i]; + } + centroid /= float(n); + + return centroid; +} + +Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric) +{ + Vector4 centroid(0.0f); + float total = 0.0f; + + for (int i = 0; i < n; i++) + { + total += weights[i]; + centroid += weights[i]*points[i]; + } + centroid /= total; + + return centroid; +} + + + +Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance) +{ + // compute the centroid + Vector3 centroid = computeCentroid(n, points); + + // compute covariance matrix + for (int i = 0; i < 6; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector3 v = points[i] - centroid; + + covariance[0] += v.x * v.x; + covariance[1] += v.x * v.y; + covariance[2] += v.x * v.z; + covariance[3] += v.y * v.y; + covariance[4] += v.y * v.z; + covariance[5] += v.z * v.z; + } + + return centroid; +} + +Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance) +{ + // compute the centroid + Vector3 centroid = computeCentroid(n, points, weights, metric); + + // compute covariance 
matrix + for (int i = 0; i < 6; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector3 a = (points[i] - centroid) * metric; + Vector3 b = weights[i]*a; + + covariance[0] += a.x * b.x; + covariance[1] += a.x * b.y; + covariance[2] += a.x * b.z; + covariance[3] += a.y * b.y; + covariance[4] += a.y * b.z; + covariance[5] += a.z * b.z; + } + + return centroid; +} + +Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, float *__restrict covariance) +{ + // compute the centroid + Vector4 centroid = computeCentroid(n, points); + + // compute covariance matrix + for (int i = 0; i < 10; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector4 v = points[i] - centroid; + + covariance[0] += v.x * v.x; + covariance[1] += v.x * v.y; + covariance[2] += v.x * v.z; + covariance[3] += v.x * v.w; + + covariance[4] += v.y * v.y; + covariance[5] += v.y * v.z; + covariance[6] += v.y * v.w; + + covariance[7] += v.z * v.z; + covariance[8] += v.z * v.w; + + covariance[9] += v.w * v.w; + } + + return centroid; +} + +Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric, float *__restrict covariance) +{ + // compute the centroid + Vector4 centroid = computeCentroid(n, points, weights, metric); + + // compute covariance matrix + for (int i = 0; i < 10; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector4 a = (points[i] - centroid) * metric; + Vector4 b = weights[i]*a; + + covariance[0] += a.x * b.x; + covariance[1] += a.x * b.y; + covariance[2] += a.x * b.z; + covariance[3] += a.x * b.w; + + covariance[4] += a.y * b.y; + covariance[5] += a.y * b.z; + covariance[6] += a.y * b.w; + + covariance[7] += a.z * b.z; + covariance[8] += a.z * b.w; + + covariance[9] += a.w * b.w; + } + + return centroid; +} + + + +Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points) +{ + float matrix[6]; + computeCovariance(n, points, matrix); + + return firstEigenVector_PowerMethod(matrix); +} + +Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + float matrix[6]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_PowerMethod(matrix); +} + + + +static inline Vector3 firstEigenVector_EigenSolver3(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + return Vector3(0.0f); + } + + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!nv::Fit::eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) + { + return Vector3(0.0f); + } + + return eigenVectors[0]; +} + +Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points) +{ + float matrix[6]; + computeCovariance(n, points, matrix); + + return firstEigenVector_EigenSolver3(matrix); +} + +Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + float matrix[6]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_EigenSolver3(matrix); +} + + + +static inline Vector4 firstEigenVector_EigenSolver4(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[4] == 0 && matrix[7] == 0&& matrix[9] == 0) + { + return Vector4(0.0f); + } + + float eigenValues[4]; + Vector4 eigenVectors[4]; + if (!nv::Fit::eigenSolveSymmetric4(matrix, 
eigenValues, eigenVectors)) + { + return Vector4(0.0f); + } + + return eigenVectors[0]; +} + +Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points) +{ + float matrix[10]; + computeCovariance(n, points, matrix); + + return firstEigenVector_EigenSolver4(matrix); +} + +Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric) +{ + float matrix[10]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_EigenSolver4(matrix); +} + + + +void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R); + +Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points) +{ + // Store the points in an n x n matrix + Array Q; Q.resize(n*n, 0.0f); + for (int i = 0; i < n; ++i) + { + Q[i*n+0] = points[i].x; + Q[i*n+1] = points[i].y; + Q[i*n+2] = points[i].z; + } + + // Alloc space for the SVD outputs + Array diag; diag.resize(n, 0.0f); + Array R; R.resize(n*n, 0.0f); + + ArvoSVD(n, n, &Q[0], &diag[0], &R[0]); + + // Get the principal component + return Vector3(R[0], R[1], R[2]); +} + +Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points) +{ + // Store the points in an n x n matrix + Array Q; Q.resize(n*n, 0.0f); + for (int i = 0; i < n; ++i) + { + Q[i*n+0] = points[i].x; + Q[i*n+1] = points[i].y; + Q[i*n+2] = points[i].z; + Q[i*n+3] = points[i].w; + } + + // Alloc space for the SVD outputs + Array diag; diag.resize(n, 0.0f); + Array R; R.resize(n*n, 0.0f); + + ArvoSVD(n, n, &Q[0], &diag[0], &R[0]); + + // Get the principal component + return Vector4(R[0], R[1], R[2], R[3]); +} + + + +Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points) +{ + // compute the centroid and covariance + float matrix[6]; + Vector3 centroid = computeCovariance(n, points, matrix); + + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + // If no plane defined, then return a horizontal plane. + return Plane(Vector3(0, 0, 1), centroid); + } + + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) { + // If no plane defined, then return a horizontal plane. + return Plane(Vector3(0, 0, 1), centroid); + } + + return Plane(eigenVectors[2], centroid); +} + +bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/) +{ + // compute the centroid and covariance + float matrix[6]; + computeCovariance(n, points, matrix); + + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) { + return false; + } + + return eigenValues[2] < epsilon; +} + + + +// Tridiagonal solver from Charles Bloom. +// Householder transforms followed by QL decomposition. +// Seems to be based on the code from Numerical Recipes in C. 
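+// Editor's aside (illustrative sketch, not part of the NVTT sources): the solver
+// declared below is what eigenSolveSymmetric3 drives, and at the call sites earlier
+// in this file the pieces chain as covariance -> symmetric eigensolve -> principal
+// axis / plane normal. The buffers (pointCount, points) below are hypothetical
+// caller-side data:
+//
+//     float cov[6];
+//     Vector3 centroid = Fit::computeCovariance(pointCount, points, cov);
+//
+//     float   eigenValues[3];
+//     Vector3 eigenVectors[3];
+//     if (Fit::eigenSolveSymmetric3(cov, eigenValues, eigenVectors)) {
+//         Vector3 principal = eigenVectors[0]; // largest eigenvalue: dominant direction
+//         Vector3 normal    = eigenVectors[2]; // smallest eigenvalue: best-fit plane normal
+//     }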
+ +static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd); +static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd); + +bool nv::Fit::eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]) +{ + nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL); + + float subd[3]; + float diag[3]; + float work[3][3]; + + work[0][0] = matrix[0]; + work[0][1] = work[1][0] = matrix[1]; + work[0][2] = work[2][0] = matrix[2]; + work[1][1] = matrix[3]; + work[1][2] = work[2][1] = matrix[4]; + work[2][2] = matrix[5]; + + EigenSolver3_Tridiagonal(work, diag, subd); + if (!EigenSolver3_QLAlgorithm(work, diag, subd)) + { + for (int i = 0; i < 3; i++) { + eigenValues[i] = 0; + eigenVectors[i] = Vector3(0); + } + return false; + } + + for (int i = 0; i < 3; i++) { + eigenValues[i] = (float)diag[i]; + } + + // eigenvectors are the columns; make them the rows : + + for (int i=0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + eigenVectors[j].component[i] = (float) work[i][j]; + } + } + + // shuffle to sort by singular value : + if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1]) + { + swap(eigenValues[0], eigenValues[2]); + swap(eigenVectors[0], eigenVectors[2]); + } + if (eigenValues[1] > eigenValues[0]) + { + swap(eigenValues[0], eigenValues[1]); + swap(eigenVectors[0], eigenVectors[1]); + } + if (eigenValues[2] > eigenValues[1]) + { + swap(eigenValues[1], eigenValues[2]); + swap(eigenVectors[1], eigenVectors[2]); + } + + nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]); + nvDebugCheck(eigenValues[1] >= eigenValues[2]); + + return true; +} + +static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd) +{ + // Householder reduction T = Q^t M Q + // Input: + // mat, symmetric 3x3 matrix M + // Output: + // mat, orthogonal matrix Q + // diag, diagonal entries of T + // subd, subdiagonal entries of T (T is symmetric) + const float epsilon = 1e-08f; + + float a = mat[0][0]; + float b = mat[0][1]; + float c = mat[0][2]; + float d = mat[1][1]; + float e = mat[1][2]; + float f = mat[2][2]; + + diag[0] = a; + subd[2] = 0.f; + if (fabsf(c) >= epsilon) + { + const float ell = sqrtf(b*b+c*c); + b /= ell; + c /= ell; + const float q = 2*b*e+c*(f-d); + diag[1] = d+c*q; + diag[2] = f-c*q; + subd[0] = ell; + subd[1] = e-b*q; + mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; + mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c; + mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b; + } + else + { + diag[1] = d; + diag[2] = f; + subd[0] = b; + subd[1] = e; + mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; + mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0; + mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1; + } +} + +static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd) +{ + // QL iteration with implicit shifting to reduce matrix from tridiagonal + // to diagonal + const int maxiter = 32; + + for (int ell = 0; ell < 3; ell++) + { + int iter; + for (iter = 0; iter < maxiter; iter++) + { + int m; + for (m = ell; m <= 1; m++) + { + float dd = fabsf(diag[m]) + fabsf(diag[m+1]); + if ( fabsf(subd[m]) + dd == dd ) + break; + } + if ( m == ell ) + break; + + float g = (diag[ell+1]-diag[ell])/(2*subd[ell]); + float r = sqrtf(g*g+1); + if ( g < 0 ) + g = diag[m]-diag[ell]+subd[ell]/(g-r); + else + g = diag[m]-diag[ell]+subd[ell]/(g+r); + float s = 1, c = 1, p = 0; + for (int i = m-1; i >= ell; i--) + { + float f = s*subd[i], b = c*subd[i]; + if ( fabsf(f) 
>= fabsf(g) ) + { + c = g/f; + r = sqrtf(c*c+1); + subd[i+1] = f*r; + c *= (s = 1/r); + } + else + { + s = f/g; + r = sqrtf(s*s+1); + subd[i+1] = g*r; + s *= (c = 1/r); + } + g = diag[i+1]-p; + r = (diag[i]-g)*s+2*b*c; + p = s*r; + diag[i+1] = g+p; + g = c*r-b; + + for (int k = 0; k < 3; k++) + { + f = mat[k][i+1]; + mat[k][i+1] = s*mat[k][i]+c*f; + mat[k][i] = c*mat[k][i]-s*f; + } + } + diag[ell] -= p; + subd[ell] = g; + subd[m] = 0; + } + + if ( iter == maxiter ) + // should not get here under normal circumstances + return false; + } + + return true; +} + + + +// Tridiagonal solver for 4x4 symmetric matrices. + +static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd); +static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd); + +bool nv::Fit::eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]) +{ + nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL); + + float subd[4]; + float diag[4]; + float work[4][4]; + + work[0][0] = matrix[0]; + work[0][1] = work[1][0] = matrix[1]; + work[0][2] = work[2][0] = matrix[2]; + work[0][3] = work[3][0] = matrix[3]; + work[1][1] = matrix[4]; + work[1][2] = work[2][1] = matrix[5]; + work[1][3] = work[3][1] = matrix[6]; + work[2][2] = matrix[7]; + work[2][3] = work[3][2] = matrix[8]; + work[3][3] = matrix[9]; + + EigenSolver4_Tridiagonal(work, diag, subd); + if (!EigenSolver4_QLAlgorithm(work, diag, subd)) + { + for (int i = 0; i < 4; i++) { + eigenValues[i] = 0; + eigenVectors[i] = Vector4(0); + } + return false; + } + + for (int i = 0; i < 4; i++) { + eigenValues[i] = (float)diag[i]; + } + + // eigenvectors are the columns; make them the rows + + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + eigenVectors[j].component[i] = (float) work[i][j]; + } + } + + // sort by singular value + + for (int i = 0; i < 3; ++i) + { + for (int j = i+1; j < 4; ++j) + { + if (eigenValues[j] > eigenValues[i]) + { + swap(eigenValues[i], eigenValues[j]); + swap(eigenVectors[i], eigenVectors[j]); + } + } + } + + nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2] && eigenValues[0] >= eigenValues[3]); + nvDebugCheck(eigenValues[1] >= eigenValues[2] && eigenValues[1] >= eigenValues[3]); + nvDebugCheck(eigenValues[2] >= eigenValues[2]); + + return true; +} + +inline float signNonzero(float x) +{ + return (x >= 0.0f) ? 1.0f : -1.0f; +} + +static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd) +{ + // Householder reduction T = Q^t M Q + // Input: + // mat, symmetric 3x3 matrix M + // Output: + // mat, orthogonal matrix Q + // diag, diagonal entries of T + // subd, subdiagonal entries of T (T is symmetric) + + static const int n = 4; + + // Set epsilon relative to size of elements in matrix + static const float relEpsilon = 1e-6f; + float maxElement = FLT_MAX; + for (int i = 0; i < n; ++i) + for (int j = 0; j < n; ++j) + maxElement = max(maxElement, fabsf(mat[i][j])); + float epsilon = relEpsilon * maxElement; + + // Iterative algorithm, works for any size of matrix but might be slower than + // a closed-form solution for symmetric 4x4 matrices. Based on this article: + // http://en.wikipedia.org/wiki/Householder_transformation#Tridiagonalization + + Matrix A, Q(identity); + memcpy(&A, mat, sizeof(float)*n*n); + + // We proceed from left to right, making the off-tridiagonal entries zero in + // one column of the matrix at a time. 
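+ // Editor's aside (not part of the NVTT sources): each pass of the loop below forms a
+ // Householder reflector P = I - 2*v*v^T from the part of column k lying below the first
+ // subdiagonal, then updates A <- P*A*P and Q <- Q*P. Because P is symmetric and
+ // orthogonal, this zeroes the unwanted entries of column k (and, by symmetry, of row k)
+ // without changing the eigenvalues, and Q accumulates the basis change needed to map
+ // eigenvectors of the resulting tridiagonal matrix back to eigenvectors of the input.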
+ for (int k = 0; k < n - 2; ++k) + { + float sum = 0.0f; + for (int j = k+1; j < n; ++j) + sum += A(j,k)*A(j,k); + float alpha = -signNonzero(A(k+1,k)) * sqrtf(sum); + float r = sqrtf(0.5f * (alpha*alpha - A(k+1,k)*alpha)); + + // If r is zero, skip this column - already in tridiagonal form + if (fabsf(r) < epsilon) + continue; + + float v[n] = {}; + v[k+1] = 0.5f * (A(k+1,k) - alpha) / r; + for (int j = k+2; j < n; ++j) + v[j] = 0.5f * A(j,k) / r; + + Matrix P(identity); + for (int i = 0; i < n; ++i) + for (int j = 0; j < n; ++j) + P(i,j) -= 2.0f * v[i] * v[j]; + + A = mul(mul(P, A), P); + Q = mul(Q, P); + } + + nvDebugCheck(fabsf(A(2,0)) < epsilon); + nvDebugCheck(fabsf(A(0,2)) < epsilon); + nvDebugCheck(fabsf(A(3,0)) < epsilon); + nvDebugCheck(fabsf(A(0,3)) < epsilon); + nvDebugCheck(fabsf(A(3,1)) < epsilon); + nvDebugCheck(fabsf(A(1,3)) < epsilon); + + for (int i = 0; i < n; ++i) + diag[i] = A(i,i); + for (int i = 0; i < n - 1; ++i) + subd[i] = A(i+1,i); + subd[n-1] = 0.0f; + + memcpy(mat, &Q, sizeof(float)*n*n); +} + +static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd) +{ + // QL iteration with implicit shifting to reduce matrix from tridiagonal + // to diagonal + const int maxiter = 32; + + for (int ell = 0; ell < 4; ell++) + { + int iter; + for (iter = 0; iter < maxiter; iter++) + { + int m; + for (m = ell; m < 3; m++) + { + float dd = fabsf(diag[m]) + fabsf(diag[m+1]); + if ( fabsf(subd[m]) + dd == dd ) + break; + } + if ( m == ell ) + break; + + float g = (diag[ell+1]-diag[ell])/(2*subd[ell]); + float r = sqrtf(g*g+1); + if ( g < 0 ) + g = diag[m]-diag[ell]+subd[ell]/(g-r); + else + g = diag[m]-diag[ell]+subd[ell]/(g+r); + float s = 1, c = 1, p = 0; + for (int i = m-1; i >= ell; i--) + { + float f = s*subd[i], b = c*subd[i]; + if ( fabsf(f) >= fabsf(g) ) + { + c = g/f; + r = sqrtf(c*c+1); + subd[i+1] = f*r; + c *= (s = 1/r); + } + else + { + s = f/g; + r = sqrtf(s*s+1); + subd[i+1] = g*r; + s *= (c = 1/r); + } + g = diag[i+1]-p; + r = (diag[i]-g)*s+2*b*c; + p = s*r; + diag[i+1] = g+p; + g = c*r-b; + + for (int k = 0; k < 4; k++) + { + f = mat[k][i+1]; + mat[k][i+1] = s*mat[k][i]+c*f; + mat[k][i] = c*mat[k][i]-s*f; + } + } + diag[ell] -= p; + subd[ell] = g; + subd[m] = 0; + } + + if ( iter == maxiter ) + // should not get here under normal circumstances + return false; + } + + return true; +} + + + +int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster) +{ + // Compute principal component. + float matrix[6]; + Vector3 centroid = computeCovariance(n, points, weights, metric, matrix); + Vector3 principal = firstEigenVector_PowerMethod(matrix); + + // Pick initial solution. + int mini, maxi; + mini = maxi = 0; + + float mindps, maxdps; + mindps = maxdps = dot(points[0] - centroid, principal); + + for (int i = 1; i < n; ++i) + { + float dps = dot(points[i] - centroid, principal); + + if (dps < mindps) { + mindps = dps; + mini = i; + } + else { + maxdps = dps; + maxi = i; + } + } + + cluster[0] = centroid + mindps * principal; + cluster[1] = centroid + maxdps * principal; + cluster[2] = (2.0f * cluster[0] + cluster[1]) / 3.0f; + cluster[3] = (2.0f * cluster[1] + cluster[0]) / 3.0f; + + // Now we have to iteratively refine the clusters. + while (true) + { + Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) }; + float total[4] = {0, 0, 0, 0}; + + for (int i = 0; i < n; ++i) + { + // Find nearest cluster. 
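+ // Editor's aside (not part of the NVTT sources): "nearest" is measured in the
+ // weighted space defined by 'metric' -- the difference vector is scaled per
+ // component before taking the squared length -- so channels with a larger metric
+ // weight contribute more to the clustering error, consistent with the weighted
+ // covariance used to seed the clusters along the principal axis.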
+ int nearest = 0; + float mindist = FLT_MAX; + for (int j = 0; j < 4; j++) + { + float dist = lengthSquared((cluster[j] - points[i]) * metric); + if (dist < mindist) + { + mindist = dist; + nearest = j; + } + } + + newCluster[nearest] += weights[i] * points[i]; + total[nearest] += weights[i]; + } + + for (int j = 0; j < 4; j++) + { + if (total[j] != 0) + newCluster[j] /= total[j]; + } + + if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && + equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3])) + { + return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0); + } + + cluster[0] = newCluster[0]; + cluster[1] = newCluster[1]; + cluster[2] = newCluster[2]; + cluster[3] = newCluster[3]; + + // Sort clusters by weight. + for (int i = 0; i < 4; i++) + { + for (int j = i; j > 0 && total[j] > total[j - 1]; j--) + { + swap( total[j], total[j - 1] ); + swap( cluster[j], cluster[j - 1] ); + } + } + } +} + + + +// Adaptation of James Arvo's SVD code, as found in ZOH. + +inline float Sqr(float x) { return x*x; } + +inline float svd_pythag( float a, float b ) +{ + float at = fabsf(a); + float bt = fabsf(b); + if( at > bt ) + return at * sqrtf( 1.0f + Sqr( bt / at ) ); + else if( bt > 0.0f ) + return bt * sqrtf( 1.0f + Sqr( at / bt ) ); + else return 0.0f; +} + +inline float SameSign( float a, float b ) +{ + float t; + if( b >= 0.0f ) t = fabsf( a ); + else t = -fabsf( a ); + return t; +} + +void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R) +{ + static const int MaxIterations = 30; + + int i, j, k, l, p, q, iter; + float c, f, h, s, x, y, z; + float norm = 0.0f; + float g = 0.0f; + float scale = 0.0f; + + Array temp; temp.resize(cols, 0.0f); + + for( i = 0; i < cols; i++ ) + { + temp[i] = scale * g; + scale = 0.0f; + g = 0.0f; + s = 0.0f; + l = i + 1; + + if( i < rows ) + { + for( k = i; k < rows; k++ ) scale += fabsf( Q[k*cols+i] ); + if( scale != 0.0f ) + { + for( k = i; k < rows; k++ ) + { + Q[k*cols+i] /= scale; + s += Sqr( Q[k*cols+i] ); + } + f = Q[i*cols+i]; + g = -SameSign( sqrtf(s), f ); + h = f * g - s; + Q[i*cols+i] = f - g; + if( i != cols - 1 ) + { + for( j = l; j < cols; j++ ) + { + s = 0.0f; + for( k = i; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j]; + f = s / h; + for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i]; + } + } + for( k = i; k < rows; k++ ) Q[k*cols+i] *= scale; + } + } + + diag[i] = scale * g; + g = 0.0f; + s = 0.0f; + scale = 0.0f; + + if( i < rows && i != cols - 1 ) + { + for( k = l; k < cols; k++ ) scale += fabsf( Q[i*cols+k] ); + if( scale != 0.0f ) + { + for( k = l; k < cols; k++ ) + { + Q[i*cols+k] /= scale; + s += Sqr( Q[i*cols+k] ); + } + f = Q[i*cols+l]; + g = -SameSign( sqrtf(s), f ); + h = f * g - s; + Q[i*cols+l] = f - g; + for( k = l; k < cols; k++ ) temp[k] = Q[i*cols+k] / h; + if( i != rows - 1 ) + { + for( j = l; j < rows; j++ ) + { + s = 0.0f; + for( k = l; k < cols; k++ ) s += Q[j*cols+k] * Q[i*cols+k]; + for( k = l; k < cols; k++ ) Q[j*cols+k] += s * temp[k]; + } + } + for( k = l; k < cols; k++ ) Q[i*cols+k] *= scale; + } + } + norm = max( norm, fabsf( diag[i] ) + fabsf( temp[i] ) ); + } + + + for( i = cols - 1; i >= 0; i-- ) + { + if( i < cols - 1 ) + { + if( g != 0.0f ) + { + for( j = l; j < cols; j++ ) R[i*cols+j] = ( Q[i*cols+j] / Q[i*cols+l] ) / g; + for( j = l; j < cols; j++ ) + { + s = 0.0f; + for( k = l; k < cols; k++ ) s += Q[i*cols+k] * R[j*cols+k]; + for( k = l; k < cols; k++ ) R[j*cols+k] += s * R[i*cols+k]; + } + } + for( j = l; j < cols; j++ ) + { + 
R[i*cols+j] = 0.0f; + R[j*cols+i] = 0.0f; + } + } + R[i*cols+i] = 1.0f; + g = temp[i]; + l = i; + } + + + for( i = cols - 1; i >= 0; i-- ) + { + l = i + 1; + g = diag[i]; + if( i < cols - 1 ) for( j = l; j < cols; j++ ) Q[i*cols+j] = 0.0f; + if( g != 0.0f ) + { + g = 1.0f / g; + if( i != cols - 1 ) + { + for( j = l; j < cols; j++ ) + { + s = 0.0f; + for( k = l; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j]; + f = ( s / Q[i*cols+i] ) * g; + for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i]; + } + } + for( j = i; j < rows; j++ ) Q[j*cols+i] *= g; + } + else + { + for( j = i; j < rows; j++ ) Q[j*cols+i] = 0.0f; + } + Q[i*cols+i] += 1.0f; + } + + + for( k = cols - 1; k >= 0; k-- ) + { + for( iter = 1; iter <= MaxIterations; iter++ ) + { + int jump; + + for( l = k; l >= 0; l-- ) + { + q = l - 1; + if( fabsf( temp[l] ) + norm == norm ) { jump = 1; break; } + if( fabsf( diag[q] ) + norm == norm ) { jump = 0; break; } + } + + if( !jump ) + { + c = 0.0f; + s = 1.0f; + for( i = l; i <= k; i++ ) + { + f = s * temp[i]; + temp[i] *= c; + if( fabsf( f ) + norm == norm ) break; + g = diag[i]; + h = svd_pythag( f, g ); + diag[i] = h; + h = 1.0f / h; + c = g * h; + s = -f * h; + for( j = 0; j < rows; j++ ) + { + y = Q[j*cols+q]; + z = Q[j*cols+i]; + Q[j*cols+q] = y * c + z * s; + Q[j*cols+i] = z * c - y * s; + } + } + } + + z = diag[k]; + if( l == k ) + { + if( z < 0.0f ) + { + diag[k] = -z; + for( j = 0; j < cols; j++ ) R[k*cols+j] *= -1.0f; + } + break; + } + if( iter >= MaxIterations ) return; + x = diag[l]; + q = k - 1; + y = diag[q]; + g = temp[q]; + h = temp[k]; + f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0f * h * y ); + g = svd_pythag( f, 1.0f ); + f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x; + c = 1.0f; + s = 1.0f; + for( j = l; j <= q; j++ ) + { + i = j + 1; + g = temp[i]; + y = diag[i]; + h = s * g; + g = c * g; + z = svd_pythag( f, h ); + temp[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y = y * c; + for( p = 0; p < cols; p++ ) + { + x = R[j*cols+p]; + z = R[i*cols+p]; + R[j*cols+p] = x * c + z * s; + R[i*cols+p] = z * c - x * s; + } + z = svd_pythag( f, h ); + diag[j] = z; + if( z != 0.0f ) + { + z = 1.0f / z; + c = f * z; + s = h * z; + } + f = c * g + s * y; + x = c * y - s * g; + for( p = 0; p < rows; p++ ) + { + y = Q[p*cols+j]; + z = Q[p*cols+i]; + Q[p*cols+j] = y * c + z * s; + Q[p*cols+i] = z * c - y * s; + } + } + temp[l] = 0.0f; + temp[k] = f; + diag[k] = x; + } + } + + // Sort the singular values into descending order. + + for( i = 0; i < cols - 1; i++ ) + { + float biggest = diag[i]; // Biggest singular value so far. + int bindex = i; // The row/col it occurred in. + for( j = i + 1; j < cols; j++ ) + { + if( diag[j] > biggest ) + { + biggest = diag[j]; + bindex = j; + } + } + if( bindex != i ) // Need to swap rows and columns. + { + // Swap columns in Q. + for (int j = 0; j < rows; ++j) + swap(Q[j*cols+i], Q[j*cols+bindex]); + + // Swap rows in R. + for (int j = 0; j < rows; ++j) + swap(R[i*cols+j], R[bindex*cols+j]); + + // Swap elements in diag. 
+ swap(diag[i], diag[bindex]); + } + } +} diff --git a/3rdparty/nvtt/nvmath/fitting.h b/3rdparty/nvtt/nvmath/fitting.h new file mode 100644 index 00000000..e8350458 --- /dev/null +++ b/3rdparty/nvtt/nvmath/fitting.h @@ -0,0 +1,49 @@ +// This code is in the public domain -- Ignacio CastaÃąo + +#ifndef NV_MATH_FITTING_H +#define NV_MATH_FITTING_H + +#include "vector.h" +#include "plane.h" + +namespace nv +{ + namespace Fit + { + Vector3 computeCentroid(int n, const Vector3 * points); + Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + + Vector4 computeCentroid(int n, const Vector4 * points); + Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric); + + Vector3 computeCovariance(int n, const Vector3 * points, float * covariance); + Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance); + + Vector4 computeCovariance(int n, const Vector4 * points, float * covariance); + Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance); + + Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points); + Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + + Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points); + Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + + Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points); + Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric); + + Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points); + Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points); + + Plane bestPlane(int n, const Vector3 * points); + bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON); + + bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]); + bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]); + + // Returns number of clusters [1-4]. + int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster); + } + +} // nv namespace + +#endif // NV_MATH_FITTING_H diff --git a/3rdparty/nvtt/nvmath/matrix.h b/3rdparty/nvtt/nvmath/matrix.h new file mode 100644 index 00000000..901a9827 --- /dev/null +++ b/3rdparty/nvtt/nvmath/matrix.h @@ -0,0 +1,112 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#ifndef NV_MATH_MATRIX_H +#define NV_MATH_MATRIX_H + +#include "vector.h" + +// - Matrices are stored in memory in *column major* order. +// - Points are to be though of as column vectors. +// - Transformation of a point p by a matrix M is: p' = M * p + +namespace nv +{ + enum identity_t { identity }; + + // 3x3 matrix. 
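+ // Editor's aside (not part of the NVTT sources): with the column-major storage
+ // convention stated above, element (row, col) of Matrix3 lives at
+ // m_data[col * 3 + row] and element (row, col) of the 4x4 Matrix at
+ // m_data[col * 4 + row]; the accessors in matrix.inl index exactly this way, and
+ // transform() computes p' = M * p by dotting each row of M with the column vector p.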
+ class NVMATH_CLASS Matrix3 + { + public: + Matrix3(); + explicit Matrix3(float f); + explicit Matrix3(identity_t); + Matrix3(const Matrix3 & m); + Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2); + + float data(uint idx) const; + float & data(uint idx); + float get(uint row, uint col) const; + float operator()(uint row, uint col) const; + float & operator()(uint row, uint col); + + Vector3 row(uint i) const; + Vector3 column(uint i) const; + + void operator*=(float s); + void operator/=(float s); + void operator+=(const Matrix3 & m); + void operator-=(const Matrix3 & m); + + void scale(float s); + void scale(Vector3::Arg s); + float determinant() const; + + private: + float m_data[9]; + }; + + // Solve equation system using LU decomposition and back-substitution. + extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x); + + // Solve equation system using Cramer's inverse. + extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x); + + + // 4x4 matrix. + class NVMATH_CLASS Matrix + { + public: + typedef Matrix const & Arg; + + Matrix(); + explicit Matrix(float f); + explicit Matrix(identity_t); + Matrix(const Matrix3 & m); + Matrix(const Matrix & m); + Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3); + //explicit Matrix(const float m[]); // m is assumed to contain 16 elements + + float data(uint idx) const; + float & data(uint idx); + float get(uint row, uint col) const; + float operator()(uint row, uint col) const; + float & operator()(uint row, uint col); + const float * ptr() const; + + Vector4 row(uint i) const; + Vector4 column(uint i) const; + + void zero(); + void identity(); + + void scale(float s); + void scale(Vector3::Arg s); + void translate(Vector3::Arg t); + void rotate(float theta, float v0, float v1, float v2); + float determinant() const; + + void operator+=(const Matrix & m); + void operator-=(const Matrix & m); + + void apply(Matrix::Arg m); + + private: + float m_data[16]; + }; + + // Solve equation system using LU decomposition and back-substitution. + extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x); + + // Solve equation system using Cramer's inverse. + extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x); + + // Compute inverse using LU decomposition. + extern Matrix inverseLU(const Matrix & m); + + // Compute inverse using Gaussian elimination and partial pivoting. + extern Matrix inverse(const Matrix & m); + extern Matrix3 inverse(const Matrix3 & m); + +} // nv namespace + +#endif // NV_MATH_MATRIX_H diff --git a/3rdparty/nvtt/nvmath/matrix.inl b/3rdparty/nvtt/nvmath/matrix.inl new file mode 100644 index 00000000..28fc7a2e --- /dev/null +++ b/3rdparty/nvtt/nvmath/matrix.inl @@ -0,0 +1,1274 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#pragma once +#ifndef NV_MATH_MATRIX_INL +#define NV_MATH_MATRIX_INL + +#include "Matrix.h" + +namespace nv +{ + inline Matrix3::Matrix3() {} + + inline Matrix3::Matrix3(float f) + { + for(int i = 0; i < 9; i++) { + m_data[i] = f; + } + } + + inline Matrix3::Matrix3(identity_t) + { + for(int i = 0; i < 3; i++) { + for(int j = 0; j < 3; j++) { + m_data[3*j+i] = (i == j) ? 
1.0f : 0.0f; + } + } + } + + inline Matrix3::Matrix3(const Matrix3 & m) + { + for(int i = 0; i < 9; i++) { + m_data[i] = m.m_data[i]; + } + } + + inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2) + { + m_data[0] = v0.x; m_data[1] = v0.y; m_data[2] = v0.z; + m_data[3] = v1.x; m_data[4] = v1.y; m_data[5] = v1.z; + m_data[6] = v2.x; m_data[7] = v2.y; m_data[8] = v2.z; + } + + inline float Matrix3::data(uint idx) const + { + nvDebugCheck(idx < 9); + return m_data[idx]; + } + inline float & Matrix3::data(uint idx) + { + nvDebugCheck(idx < 9); + return m_data[idx]; + } + inline float Matrix3::get(uint row, uint col) const + { + nvDebugCheck(row < 3 && col < 3); + return m_data[col * 3 + row]; + } + inline float Matrix3::operator()(uint row, uint col) const + { + nvDebugCheck(row < 3 && col < 3); + return m_data[col * 3 + row]; + } + inline float & Matrix3::operator()(uint row, uint col) + { + nvDebugCheck(row < 3 && col < 3); + return m_data[col * 3 + row]; + } + + inline Vector3 Matrix3::row(uint i) const + { + nvDebugCheck(i < 3); + return Vector3(get(i, 0), get(i, 1), get(i, 2)); + } + inline Vector3 Matrix3::column(uint i) const + { + nvDebugCheck(i < 3); + return Vector3(get(0, i), get(1, i), get(2, i)); + } + + inline void Matrix3::operator*=(float s) + { + for(int i = 0; i < 9; i++) { + m_data[i] *= s; + } + } + + inline void Matrix3::operator/=(float s) + { + float is = 1.0f /s; + for(int i = 0; i < 9; i++) { + m_data[i] *= is; + } + } + + inline void Matrix3::operator+=(const Matrix3 & m) + { + for(int i = 0; i < 9; i++) { + m_data[i] += m.m_data[i]; + } + } + + inline void Matrix3::operator-=(const Matrix3 & m) + { + for(int i = 0; i < 9; i++) { + m_data[i] -= m.m_data[i]; + } + } + + inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b) + { + Matrix3 m = a; + m += b; + return m; + } + + inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b) + { + Matrix3 m = a; + m -= b; + return m; + } + + inline Matrix3 operator*(const Matrix3 & a, float s) + { + Matrix3 m = a; + m *= s; + return m; + } + + inline Matrix3 operator*(float s, const Matrix3 & a) + { + Matrix3 m = a; + m *= s; + return m; + } + + inline Matrix3 operator/(const Matrix3 & a, float s) + { + Matrix3 m = a; + m /= s; + return m; + } + + inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b) + { + Matrix3 m; + + for(int i = 0; i < 3; i++) { + const float ai0 = a(i,0), ai1 = a(i,1), ai2 = a(i,2); + m(i, 0) = ai0 * b(0,0) + ai1 * b(1,0) + ai2 * b(2,0); + m(i, 1) = ai0 * b(0,1) + ai1 * b(1,1) + ai2 * b(2,1); + m(i, 2) = ai0 * b(0,2) + ai1 * b(1,2) + ai2 * b(2,2); + } + + return m; + } + + inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b) + { + return mul(a, b); + } + + // Transform the given 3d vector with the given matrix. 
+ inline Vector3 transform(const Matrix3 & m, const Vector3 & p) + { + return Vector3( + p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2), + p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2), + p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2)); + } + + inline void Matrix3::scale(float s) + { + for (int i = 0; i < 9; i++) { + m_data[i] *= s; + } + } + + inline void Matrix3::scale(Vector3::Arg s) + { + m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x; + m_data[3] *= s.y; m_data[4] *= s.y; m_data[5] *= s.y; + m_data[6] *= s.z; m_data[7] *= s.z; m_data[8] *= s.z; + } + + inline float Matrix3::determinant() const + { + return + get(0,0) * get(1,1) * get(2,2) + + get(0,1) * get(1,2) * get(2,0) + + get(0,2) * get(1,0) * get(2,1) - + get(0,2) * get(1,1) * get(2,0) - + get(0,1) * get(1,0) * get(2,2) - + get(0,0) * get(1,2) * get(2,1); + } + + // Inverse using Cramer's rule. + inline Matrix3 inverseCramer(const Matrix3 & m) + { + const float det = m.determinant(); + if (equal(det, 0.0f, 0.0f)) { + return Matrix3(0); + } + + Matrix3 r; + + r.data(0) = - m.data(5) * m.data(7) + m.data(4) * m.data(8); + r.data(1) = + m.data(5) * m.data(6) - m.data(3) * m.data(8); + r.data(2) = - m.data(4) * m.data(6) + m.data(3) * m.data(7); + + r.data(3) = + m.data(2) * m.data(7) - m.data(1) * m.data(8); + r.data(4) = - m.data(2) * m.data(6) + m.data(0) * m.data(8); + r.data(5) = + m.data(1) * m.data(6) - m.data(0) * m.data(7); + + r.data(6) = - m.data(2) * m.data(4) + m.data(1) * m.data(5); + r.data(7) = + m.data(2) * m.data(3) - m.data(0) * m.data(5); + r.data(8) = - m.data(1) * m.data(3) + m.data(0) * m.data(4); + + r.scale(1.0f / det); + + return r; + } + + + + inline Matrix::Matrix() + { + } + + inline Matrix::Matrix(float f) + { + for(int i = 0; i < 16; i++) { + m_data[i] = 0.0f; + } + } + + inline Matrix::Matrix(identity_t) + { + for(int i = 0; i < 4; i++) { + for(int j = 0; j < 4; j++) { + m_data[4*j+i] = (i == j) ? 
1.0f : 0.0f; + } + } + } + + inline Matrix::Matrix(const Matrix & m) + { + for(int i = 0; i < 16; i++) { + m_data[i] = m.m_data[i]; + } + } + + inline Matrix::Matrix(const Matrix3 & m) + { + for(int i = 0; i < 3; i++) { + for(int j = 0; j < 3; j++) { + operator()(i, j) = m.get(i, j); + } + } + for(int i = 0; i < 4; i++) { + operator()(3, i) = 0; + operator()(i, 3) = 0; + } + } + + inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3) + { + m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w; + m_data[ 4] = v1.x; m_data[ 5] = v1.y; m_data[ 6] = v1.z; m_data[ 7] = v1.w; + m_data[ 8] = v2.x; m_data[ 9] = v2.y; m_data[10] = v2.z; m_data[11] = v2.w; + m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w; + } + + /*inline Matrix::Matrix(const float m[]) + { + for(int i = 0; i < 16; i++) { + m_data[i] = m[i]; + } + }*/ + + + // Accessors + inline float Matrix::data(uint idx) const + { + nvDebugCheck(idx < 16); + return m_data[idx]; + } + inline float & Matrix::data(uint idx) + { + nvDebugCheck(idx < 16); + return m_data[idx]; + } + inline float Matrix::get(uint row, uint col) const + { + nvDebugCheck(row < 4 && col < 4); + return m_data[col * 4 + row]; + } + inline float Matrix::operator()(uint row, uint col) const + { + nvDebugCheck(row < 4 && col < 4); + return m_data[col * 4 + row]; + } + inline float & Matrix::operator()(uint row, uint col) + { + nvDebugCheck(row < 4 && col < 4); + return m_data[col * 4 + row]; + } + + inline const float * Matrix::ptr() const + { + return m_data; + } + + inline Vector4 Matrix::row(uint i) const + { + nvDebugCheck(i < 4); + return Vector4(get(i, 0), get(i, 1), get(i, 2), get(i, 3)); + } + + inline Vector4 Matrix::column(uint i) const + { + nvDebugCheck(i < 4); + return Vector4(get(0, i), get(1, i), get(2, i), get(3, i)); + } + + inline void Matrix::zero() + { + m_data[0] = 0; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0; + m_data[4] = 0; m_data[5] = 0; m_data[6] = 0; m_data[7] = 0; + m_data[8] = 0; m_data[9] = 0; m_data[10] = 0; m_data[11] = 0; + m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 0; + } + + inline void Matrix::identity() + { + m_data[0] = 1; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0; + m_data[4] = 0; m_data[5] = 1; m_data[6] = 0; m_data[7] = 0; + m_data[8] = 0; m_data[9] = 0; m_data[10] = 1; m_data[11] = 0; + m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 1; + } + + // Apply scale. + inline void Matrix::scale(float s) + { + m_data[0] *= s; m_data[1] *= s; m_data[2] *= s; m_data[3] *= s; + m_data[4] *= s; m_data[5] *= s; m_data[6] *= s; m_data[7] *= s; + m_data[8] *= s; m_data[9] *= s; m_data[10] *= s; m_data[11] *= s; + m_data[12] *= s; m_data[13] *= s; m_data[14] *= s; m_data[15] *= s; + } + + // Apply scale. + inline void Matrix::scale(Vector3::Arg s) + { + m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x; m_data[3] *= s.x; + m_data[4] *= s.y; m_data[5] *= s.y; m_data[6] *= s.y; m_data[7] *= s.y; + m_data[8] *= s.z; m_data[9] *= s.z; m_data[10] *= s.z; m_data[11] *= s.z; + } + + // Apply translation. 
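+ // Editor's aside (not part of the NVTT sources): this composes the translation on the
+ // right (M <- M * T), i.e. column 3 becomes M * (t.x, t.y, t.z, 1), so the offset is
+ // expressed in the matrix's local frame rather than simply replacing the existing
+ // translation column.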
+ inline void Matrix::translate(Vector3::Arg t) + { + m_data[12] = m_data[0] * t.x + m_data[4] * t.y + m_data[8] * t.z + m_data[12]; + m_data[13] = m_data[1] * t.x + m_data[5] * t.y + m_data[9] * t.z + m_data[13]; + m_data[14] = m_data[2] * t.x + m_data[6] * t.y + m_data[10] * t.z + m_data[14]; + m_data[15] = m_data[3] * t.x + m_data[7] * t.y + m_data[11] * t.z + m_data[15]; + } + + Matrix rotation(float theta, float v0, float v1, float v2); + + // Apply rotation. + inline void Matrix::rotate(float theta, float v0, float v1, float v2) + { + Matrix R(rotation(theta, v0, v1, v2)); + apply(R); + } + + // Apply transform. + inline void Matrix::apply(Matrix::Arg m) + { + nvDebugCheck(this != &m); + + for(int i = 0; i < 4; i++) { + const float ai0 = get(i,0), ai1 = get(i,1), ai2 = get(i,2), ai3 = get(i,3); + m_data[0 + i] = ai0 * m(0,0) + ai1 * m(1,0) + ai2 * m(2,0) + ai3 * m(3,0); + m_data[4 + i] = ai0 * m(0,1) + ai1 * m(1,1) + ai2 * m(2,1) + ai3 * m(3,1); + m_data[8 + i] = ai0 * m(0,2) + ai1 * m(1,2) + ai2 * m(2,2) + ai3 * m(3,2); + m_data[12+ i] = ai0 * m(0,3) + ai1 * m(1,3) + ai2 * m(2,3) + ai3 * m(3,3); + } + } + + // Get scale matrix. + inline Matrix scale(Vector3::Arg s) + { + Matrix m(identity); + m(0,0) = s.x; + m(1,1) = s.y; + m(2,2) = s.z; + return m; + } + + // Get scale matrix. + inline Matrix scale(float s) + { + Matrix m(identity); + m(0,0) = m(1,1) = m(2,2) = s; + return m; + } + + // Get translation matrix. + inline Matrix translation(Vector3::Arg t) + { + Matrix m(identity); + m(0,3) = t.x; + m(1,3) = t.y; + m(2,3) = t.z; + return m; + } + + // Get rotation matrix. + inline Matrix rotation(float theta, float v0, float v1, float v2) + { + float cost = cosf(theta); + float sint = sinf(theta); + + Matrix m(identity); + + if( 1 == v0 && 0 == v1 && 0 == v2 ) { + m(1,1) = cost; m(2,1) = -sint; + m(1,2) = sint; m(2,2) = cost; + } + else if( 0 == v0 && 1 == v1 && 0 == v2 ) { + m(0,0) = cost; m(2,0) = sint; + m(1,2) = -sint; m(2,2) = cost; + } + else if( 0 == v0 && 0 == v1 && 1 == v2 ) { + m(0,0) = cost; m(1,0) = -sint; + m(0,1) = sint; m(1,1) = cost; + } + else { + float a2, b2, c2; + a2 = v0 * v0; + b2 = v1 * v1; + c2 = v2 * v2; + + float iscale = 1.0f / sqrtf(a2 + b2 + c2); + v0 *= iscale; + v1 *= iscale; + v2 *= iscale; + + float abm, acm, bcm; + float mcos, asin, bsin, csin; + mcos = 1.0f - cost; + abm = v0 * v1 * mcos; + acm = v0 * v2 * mcos; + bcm = v1 * v2 * mcos; + asin = v0 * sint; + bsin = v1 * sint; + csin = v2 * sint; + m(0,0) = a2 * mcos + cost; + m(1,0) = abm - csin; + m(2,0) = acm + bsin; + m(3,0) = abm + csin; + m(1,1) = b2 * mcos + cost; + m(2,1) = bcm - asin; + m(3,1) = acm - bsin; + m(1,2) = bcm + asin; + m(2,2) = c2 * mcos + cost; + } + return m; + } + + //Matrix rotation(float yaw, float pitch, float roll); + //Matrix skew(float angle, Vector3::Arg v1, Vector3::Arg v2); + + // Get frustum matrix. + inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar) + { + Matrix m(0.0f); + + float doubleznear = 2.0f * zNear; + float one_deltax = 1.0f / (xmax - xmin); + float one_deltay = 1.0f / (ymax - ymin); + float one_deltaz = 1.0f / (zFar - zNear); + + m(0,0) = doubleznear * one_deltax; + m(1,1) = doubleznear * one_deltay; + m(0,2) = (xmax + xmin) * one_deltax; + m(1,2) = (ymax + ymin) * one_deltay; + m(2,2) = -(zFar + zNear) * one_deltaz; + m(3,2) = -1.0f; + m(2,3) = -(zFar * doubleznear) * one_deltaz; + + return m; + } + + // Get inverse frustum matrix. 
+ inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar) + { + Matrix m(0.0f); + + float one_doubleznear = 1.0f / (2.0f * zNear); + float one_doubleznearzfar = 1.0f / (2.0f * zNear * zFar); + + m(0,0) = (xmax - xmin) * one_doubleznear; + m(0,3) = (xmax + xmin) * one_doubleznear; + m(1,1) = (ymax - ymin) * one_doubleznear; + m(1,3) = (ymax + ymin) * one_doubleznear; + m(2,3) = -1; + m(3,2) = -(zFar - zNear) * one_doubleznearzfar; + m(3,3) = (zFar + zNear) * one_doubleznearzfar; + + return m; + } + + // Get infinite frustum matrix. + inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear) + { + Matrix m(0.0f); + + float doubleznear = 2.0f * zNear; + float one_deltax = 1.0f / (xmax - xmin); + float one_deltay = 1.0f / (ymax - ymin); + float nudge = 1.0; // 0.999; + + m(0,0) = doubleznear * one_deltax; + m(1,1) = doubleznear * one_deltay; + m(0,2) = (xmax + xmin) * one_deltax; + m(1,2) = (ymax + ymin) * one_deltay; + m(2,2) = -1.0f * nudge; + m(3,2) = -1.0f; + m(2,3) = -doubleznear * nudge; + + return m; + } + + // Get perspective matrix. + inline Matrix perspective(float fovy, float aspect, float zNear, float zFar) + { + float xmax = zNear * tanf(fovy / 2); + float xmin = -xmax; + + float ymax = xmax / aspect; + float ymin = -ymax; + + return frustum(xmin, xmax, ymin, ymax, zNear, zFar); + } + + // Get inverse perspective matrix. + inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar) + { + float xmax = zNear * tanf(fovy / 2); + float xmin = -xmax; + + float ymax = xmax / aspect; + float ymin = -ymax; + + return frustumInverse(xmin, xmax, ymin, ymax, zNear, zFar); + } + + // Get infinite perspective matrix. + inline Matrix perspective(float fovy, float aspect, float zNear) + { + float x = zNear * tanf(fovy / 2); + float y = x / aspect; + return frustum( -x, x, -y, y, zNear ); + } + + // Get matrix determinant. + inline float Matrix::determinant() const + { + return + m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] + + m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] + + m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] + + m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] + + m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] + + m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15]; + } + + inline Matrix transpose(Matrix::Arg m) + { + Matrix r; + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + r(i, j) = m(j, i); + } + } + return r; + } + + // Inverse using Cramer's rule. 
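+ // Editor's aside (not part of the NVTT sources): unlike the Matrix3 inverseCramer
+ // earlier in this file, the 4x4 version below divides by determinant() without a
+ // singularity check. A caller that might pass a degenerate matrix could mirror the
+ // 3x3 guard, e.g. (sketch, not an NVTT API):
+ //
+ //     float det = m.determinant();
+ //     Matrix inv = equal(det, 0.0f, 0.0f) ? Matrix(0.0f) : inverseCramer(m);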
+ inline Matrix inverseCramer(Matrix::Arg m) + { + Matrix r; + r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15); + r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15); + r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15); + r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11); + r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15); + r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15); + r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15); + r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11); + r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15); + r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15); + r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15); + r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11); + r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14); + r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14); + r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14); + r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10); + r.scale(1.0f / m.determinant()); + 
return r; + } + + inline Matrix isometryInverse(Matrix::Arg m) + { + Matrix r(identity); + + // transposed 3x3 upper left matrix + for (int i = 0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + r(i, j) = m(j, i); + } + } + + // translate by the negative offsets + r.translate(-Vector3(m.data(12), m.data(13), m.data(14))); + + return r; + } + + // Transform the given 3d point with the given matrix. + inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p) + { + return Vector3( + p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3), + p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3), + p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3)); + } + + // Transform the given 3d vector with the given matrix. + inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p) + { + return Vector3( + p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2), + p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2), + p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2)); + } + + // Transform the given 4d vector with the given matrix. + inline Vector4 transform(Matrix::Arg m, Vector4::Arg p) + { + return Vector4( + p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3), + p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3), + p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3), + p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3)); + } + + inline Matrix mul(Matrix::Arg a, Matrix::Arg b) + { + // @@ Is this the right order? mul(a, b) = b * a + Matrix m = a; + m.apply(b); + return m; + } + + inline void Matrix::operator+=(const Matrix & m) + { + for(int i = 0; i < 16; i++) { + m_data[i] += m.m_data[i]; + } + } + + inline void Matrix::operator-=(const Matrix & m) + { + for(int i = 0; i < 16; i++) { + m_data[i] -= m.m_data[i]; + } + } + + inline Matrix operator+(const Matrix & a, const Matrix & b) + { + Matrix m = a; + m += b; + return m; + } + + inline Matrix operator-(const Matrix & a, const Matrix & b) + { + Matrix m = a; + m -= b; + return m; + } + + +} // nv namespace + + +#if 0 // old code. +/** @name Special matrices. */ +//@{ +/** Generate a translation matrix. */ +void TranslationMatrix(const Vec3 & v) { + data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0; + data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0; + data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0; + data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1; +} + +/** Rotate theta degrees around v. */ +void RotationMatrix( float theta, float v0, float v1, float v2 ) { + float cost = cos(theta); + float sint = sin(theta); + + if( 1 == v0 && 0 == v1 && 0 == v2 ) { + data[0] = 1.0f; data[1] = 0.0f; data[2] = 0.0f; data[3] = 0.0f; + data[4] = 0.0f; data[5] = cost; data[6] = -sint;data[7] = 0.0f; + data[8] = 0.0f; data[9] = sint; data[10] = cost;data[11] = 0.0f; + data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f; + } + else if( 0 == v0 && 1 == v1 && 0 == v2 ) { + data[0] = cost; data[1] = 0.0f; data[2] = sint; data[3] = 0.0f; + data[4] = 0.0f; data[5] = 1.0f; data[6] = 0.0f; data[7] = 0.0f; + data[8] = -sint;data[9] = 0.0f;data[10] = cost; data[11] = 0.0f; + data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f; + } + else if( 0 == v0 && 0 == v1 && 1 == v2 ) { + data[0] = cost; data[1] = -sint;data[2] = 0.0f; data[3] = 0.0f; + data[4] = sint; data[5] = cost; data[6] = 0.0f; data[7] = 0.0f; + data[8] = 0.0f; data[9] = 0.0f; data[10] = 1.0f;data[11] = 0.0f; + data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f; + } + else { + //we need scale a,b,c to unit length. 
+ float a2, b2, c2; + a2 = v0 * v0; + b2 = v1 * v1; + c2 = v2 * v2; + + float iscale = 1.0f / sqrtf(a2 + b2 + c2); + v0 *= iscale; + v1 *= iscale; + v2 *= iscale; + + float abm, acm, bcm; + float mcos, asin, bsin, csin; + mcos = 1.0f - cost; + abm = v0 * v1 * mcos; + acm = v0 * v2 * mcos; + bcm = v1 * v2 * mcos; + asin = v0 * sint; + bsin = v1 * sint; + csin = v2 * sint; + data[0] = a2 * mcos + cost; + data[1] = abm - csin; + data[2] = acm + bsin; + data[3] = abm + csin; + data[4] = 0.0f; + data[5] = b2 * mcos + cost; + data[6] = bcm - asin; + data[7] = acm - bsin; + data[8] = 0.0f; + data[9] = bcm + asin; + data[10] = c2 * mcos + cost; + data[11] = 0.0f; + data[12] = 0.0f; + data[13] = 0.0f; + data[14] = 0.0f; + data[15] = 1.0f; + } +} + +/* +void SkewMatrix(float angle, const Vec3 & v1, const Vec3 & v2) { +v1.Normalize(); +v2.Normalize(); + +Vec3 v3; +v3.Cross(v1, v2); +v3.Normalize(); + +// Get skew factor. +float costheta = Vec3DotProduct(v1, v2); +float sintheta = Real.Sqrt(1 - costheta * costheta); +float skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta; + +// Build orthonormal matrix. +v1 = FXVector3.Cross(v3, v2); +v1.Normalize(); + +Matrix R = Matrix::Identity; +R[0, 0] = v3.X; // Not sure this is in the correct order... +R[1, 0] = v3.Y; +R[2, 0] = v3.Z; +R[0, 1] = v1.X; +R[1, 1] = v1.Y; +R[2, 1] = v1.Z; +R[0, 2] = v2.X; +R[1, 2] = v2.Y; +R[2, 2] = v2.Z; + +// Build skew matrix. +Matrix S = Matrix::Identity; +S[2, 1] = -skew; + +// Return skew transform. +return R * S * R.Transpose; // Not sure this is in the correct order... +} +*/ + +/** +* Generate rotation matrix for the euler angles. This is the same as computing +* 3 rotation matrices and multiplying them together in our custom order. +* +* @todo Have to recompute this code for our new convention. +**/ +void RotationMatrix( float yaw, float pitch, float roll ) { + float sy = sin(yaw+ToRadian(90)); + float cy = cos(yaw+ToRadian(90)); + float sp = sin(pitch-ToRadian(90)); + float cp = cos(pitch-ToRadian(90)); + float sr = sin(roll); + float cr = cos(roll); + + data[0] = cr*cy + sr*sp*sy; + data[1] = cp*sy; + data[2] = -sr*cy + cr*sp*sy; + data[3] = 0; + + data[4] = -cr*sy + sr*sp*cy; + data[5] = cp*cy; + data[6] = sr*sy + cr*sp*cy; + data[7] = 0; + + data[8] = sr*cp; + data[9] = -sp; + data[10] = cr*cp; + data[11] = 0; + + data[12] = 0; + data[13] = 0; + data[14] = 0; + data[15] = 1; +} + +/** Create a frustum matrix with the far plane at the infinity. */ +void Frustum( float xmin, float xmax, float ymin, float ymax, float zNear, float zFar ) { + float one_deltax, one_deltay, one_deltaz, doubleznear; + + doubleznear = 2.0f * zNear; + one_deltax = 1.0f / (xmax - xmin); + one_deltay = 1.0f / (ymax - ymin); + one_deltaz = 1.0f / (zFar - zNear); + + data[0] = (float)(doubleznear * one_deltax); + data[1] = 0.0f; + data[2] = 0.0f; + data[3] = 0.0f; + data[4] = 0.0f; + data[5] = (float)(doubleznear * one_deltay); + data[6] = 0.f; + data[7] = 0.f; + data[8] = (float)((xmax + xmin) * one_deltax); + data[9] = (float)((ymax + ymin) * one_deltay); + data[10] = (float)(-(zFar + zNear) * one_deltaz); + data[11] = -1.f; + data[12] = 0.f; + data[13] = 0.f; + data[14] = (float)(-(zFar * doubleznear) * one_deltaz); + data[15] = 0.f; +} + +/** Create a frustum matrix with the far plane at the infinity. 
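+This is the zFar -> infinity limit of Frustum() above: the depth terms reduce to
+data[10] = -1 and data[14] = -2*zNear, optionally scaled by the "nudge" factor
+(1.0 here, 0.999 in the commented-out alternative) to keep distant geometry just
+inside the far clip plane.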
*/ +void FrustumInf( float xmin, float xmax, float ymin, float ymax, float zNear ) { + float one_deltax, one_deltay, doubleznear, nudge; + + doubleznear = 2.0f * zNear; + one_deltax = 1.0f / (xmax - xmin); + one_deltay = 1.0f / (ymax - ymin); + nudge = 1.0; // 0.999; + + data[0] = doubleznear * one_deltax; + data[1] = 0.0f; + data[2] = 0.0f; + data[3] = 0.0f; + + data[4] = 0.0f; + data[5] = doubleznear * one_deltay; + data[6] = 0.f; + data[7] = 0.f; + + data[8] = (xmax + xmin) * one_deltax; + data[9] = (ymax + ymin) * one_deltay; + data[10] = -1.0f * nudge; + data[11] = -1.0f; + + data[12] = 0.f; + data[13] = 0.f; + data[14] = -doubleznear * nudge; + data[15] = 0.f; +} + +/** Create an inverse frustum matrix with the far plane at the infinity. */ +void FrustumInfInv( float left, float right, float bottom, float top, float zNear ) { + // this matrix is wrong (not tested floatly) I think it should be transposed. + data[0] = (right - left) / (2 * zNear); + data[1] = 0; + data[2] = 0; + data[3] = (right + left) / (2 * zNear); + data[4] = 0; + data[5] = (top - bottom) / (2 * zNear); + data[6] = 0; + data[7] = (top + bottom) / (2 * zNear); + data[8] = 0; + data[9] = 0; + data[10] = 0; + data[11] = -1; + data[12] = 0; + data[13] = 0; + data[14] = -1 / (2 * zNear); + data[15] = 1 / (2 * zNear); +} + +/** Create an homogeneous projection matrix. */ +void Perspective( float fov, float aspect, float zNear, float zFar ) { + float xmin, xmax, ymin, ymax; + + xmax = zNear * tan( fov/2 ); + xmin = -xmax; + + ymax = xmax / aspect; + ymin = -ymax; + + Frustum(xmin, xmax, ymin, ymax, zNear, zFar); +} + +/** Create a projection matrix with the far plane at the infinity. */ +void PerspectiveInf( float fov, float aspect, float zNear ) { + float x = zNear * tan( fov/2 ); + float y = x / aspect; + FrustumInf( -x, x, -y, y, zNear ); +} + +/** Create an inverse projection matrix with far plane at the infinity. */ +void PerspectiveInfInv( float fov, float aspect, float zNear ) { + float x = zNear * tan( fov/2 ); + float y = x / aspect; + FrustumInfInv( -x, x, -y, y, zNear ); +} + +/** Build bone matrix from quatertion and offset. */ +void BoneMatrix(const Quat & q, const Vec3 & offset) { + float x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz; + + // calculate coefficients + x2 = q.x + q.x; + y2 = q.y + q.y; + z2 = q.z + q.z; + + xx = q.x * x2; xy = q.x * y2; xz = q.x * z2; + yy = q.y * y2; yz = q.y * z2; zz = q.z * z2; + wx = q.w * x2; wy = q.w * y2; wz = q.w * z2; + + data[0] = 1.0f - (yy + zz); + data[1] = xy - wz; + data[2] = xz + wy; + data[3] = 0.0f; + + data[4] = xy + wz; + data[5] = 1.0f - (xx + zz); + data[6] = yz - wx; + data[7] = 0.0f; + + data[8] = xz - wy; + data[9] = yz + wx; + data[10] = 1.0f - (xx + yy); + data[11] = 0.0f; + + data[12] = offset.x; + data[13] = offset.y; + data[14] = offset.z; + data[15] = 1.0f; +} + +//@} + + +/** @name Transformations: */ +//@{ + +/** Apply a general scale. 
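+Folds a diag(x, y, z, 1) scale into the matrix: data[0..3] are scaled by x,
+data[4..7] by y and data[8..11] by z, while data[12..15] (the translation part
+in this convention) are left untouched.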
*/ +void Scale( float x, float y, float z ) { + data[0] *= x; data[4] *= y; data[8] *= z; + data[1] *= x; data[5] *= y; data[9] *= z; + data[2] *= x; data[6] *= y; data[10] *= z; + data[3] *= x; data[7] *= y; data[11] *= z; +} + +/** Apply a rotation of theta degrees around the axis v*/ +void Rotate( float theta, const Vec3 & v ) { + Matrix b; + b.RotationMatrix( theta, v[0], v[1], v[2] ); + Multiply4x3( b ); +} + +/** Apply a rotation of theta degrees around the axis v*/ +void Rotate( float theta, float v0, float v1, float v2 ) { + Matrix b; + b.RotationMatrix( theta, v0, v1, v2 ); + Multiply4x3( b ); +} + +/** +* Translate the matrix by t. This is the same as multiplying by a +* translation matrix with the given offset. +* this = T * this +*/ +void Translate( const Vec3 &t ) { + data[12] = data[0] * t.x + data[4] * t.y + data[8] * t.z + data[12]; + data[13] = data[1] * t.x + data[5] * t.y + data[9] * t.z + data[13]; + data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14]; + data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15]; +} + +/** +* Translate the matrix by x, y, z. This is the same as multiplying by a +* translation matrix with the given offsets. +*/ +void Translate( float x, float y, float z ) { + data[12] = data[0] * x + data[4] * y + data[8] * z + data[12]; + data[13] = data[1] * x + data[5] * y + data[9] * z + data[13]; + data[14] = data[2] * x + data[6] * y + data[10] * z + data[14]; + data[15] = data[3] * x + data[7] * y + data[11] * z + data[15]; +} + +/** Compute the transposed matrix. */ +void Transpose() { + piSwap(data[1], data[4]); + piSwap(data[2], data[8]); + piSwap(data[6], data[9]); + piSwap(data[3], data[12]); + piSwap(data[7], data[13]); + piSwap(data[11], data[14]); +} + +/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */ +void IsometryInverse() { + // transposed 3x3 upper left matrix + piSwap(data[1], data[4]); + piSwap(data[2], data[8]); + piSwap(data[6], data[9]); + + // translate by the negative offsets + Vec3 v(-data[12], -data[13], -data[14]); + data[12] = data[13] = data[14] = 0; + Translate(v); +} + +/** Compute the inverse of the affine portion of this matrix. */ +void AffineInverse() { + data[12] = data[13] = data[14] = 0; + Transpose(); +} +//@} + +/** @name Matrix operations: */ +//@{ + +/** Return the determinant of this matrix. */ +float Determinant() const { + return data[0] * data[5] * data[10] * data[15] + + data[1] * data[6] * data[11] * data[12] + + data[2] * data[7] * data[ 8] * data[13] + + data[3] * data[4] * data[ 9] * data[14] - + data[3] * data[6] * data[ 9] * data[12] - + data[2] * data[5] * data[ 8] * data[15] - + data[1] * data[4] * data[11] * data[14] - + data[0] * data[7] * data[10] * data[12]; +} + + +/** Standard matrix product: this *= B. */ +void Multiply4x4( const Matrix & restrict B ) { + Multiply4x4(*this, B); +} + +/** Standard matrix product: this = A * B. 
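+A == this is allowed because row i of A is copied into locals before row i of
+*this is overwritten; the only aliasing restriction (checked below) is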
this != B*/ +void Multiply4x4( const Matrix & A, const Matrix & restrict B ) { + piDebugCheck(this != &B); + + for(int i = 0; i < 4; i++) { + const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3); + GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0); + GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1); + GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2); + GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3); + } + + /* Unrolled but does not allow this == A + data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3]; + data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3]; + data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3]; + data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3]; + data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7]; + data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7]; + data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7]; + data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7]; + data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11]; + data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11]; + data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11]; + data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11]; + data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15]; + data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15]; + data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15]; + data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15]; + */ +} + +/** Standard matrix product: this *= B. */ +void Multiply4x3( const Matrix & restrict B ) { + Multiply4x3(*this, B); +} + +/** Standard product of matrices, where the last row is [0 0 0 1]. 
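+Only the first three rows are computed; the last row is then forced to
+[0 0 0 1], making this the affine product. As with Multiply4x4, A may alias
+this but B must not.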
*/ +void Multiply4x3( const Matrix & A, const Matrix & restrict B ) { + piDebugCheck(this != &B); + + for(int i = 0; i < 3; i++) { + const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3); + GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0); + GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1); + GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2); + GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3); + } + data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f; + + /* Unrolled but does not allow this == A + data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3]; + data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3]; + data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3]; + data[3] = 0.0f; + data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7]; + data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7]; + data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7]; + data[7] = 0.0f; + data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11]; + data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11]; + data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11]; + data[11]= 0.0f; + data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15]; + data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15]; + data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15]; + data[15]= 1.0f; + */ +} +//@} + + +/** @name Vector operations: */ +//@{ + +/** Transform 3d vector (w=0). */ +void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10]; +} +/** Transform 3d vector by the transpose (w=0). */ +void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2]; + dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6]; + dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10]; +} + +/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */ +void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; +} + +/** Transform a point, normalize it, and return w. 
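+The input is treated as (x, y, z, 1); dest is divided by the transformed w
+(a perspective divide) and the reciprocal 1/w is returned.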
*/ +float TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + float w; + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; + w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]); + *dest *= w; + return w; +} + +/** Transform a point and return w. */ +float TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; + return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]; +} + +/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */ +void TransformVec4(const Vec3 & orig, Vec4 * dest) const { + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; + dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]; +} +//@} + +/** @name Matrix analysis. */ +//@{ + +/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */ +void GetEulerAnglesZYZ(float * s, float * t, float * r) const { + if( GetElem(2,2) < 1.0f ) { + if( GetElem(2,2) > -1.0f ) { + // cs*ct*cr-ss*sr -ss*ct*cr-cs*sr st*cr + // cs*ct*sr+ss*cr -ss*ct*sr+cs*cr st*sr + // -cs*st ss*st ct + *s = atan2(GetElem(1,2), -GetElem(0,2)); + *t = acos(GetElem(2,2)); + *r = atan2(GetElem(2,1), GetElem(2,0)); + } + else { + // -c(s-r) s(s-r) 0 + // s(s-r) c(s-r) 0 + // 0 0 -1 + *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r + *t = PI; + *r = 0; + } + } + else { + // c(s+r) -s(s+r) 0 + // s(s+r) c(s+r) 0 + // 0 0 1 + *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r + *t = 0; + *r = 0; + } +} + +//@} + +MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m ); + +/** Print to debug output. 
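+Prints four lines, each gathering every fourth element of the flat array
+(the first line shows data[0], data[4], data[8], data[12]).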
*/ +void Print() const { + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] ); + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] ); + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] ); + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] ); +} + + +public: + + float data[16]; + +}; +#endif + + +#endif // NV_MATH_MATRIX_INL diff --git a/3rdparty/nvtt/nvmath/nvmath.h b/3rdparty/nvtt/nvmath/nvmath.h new file mode 100644 index 00000000..58353983 --- /dev/null +++ b/3rdparty/nvtt/nvmath/nvmath.h @@ -0,0 +1,56 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#ifndef NV_MATH_H +#define NV_MATH_H + +#include +#include // finite, isnan + +#include "nvcore/utils.h" // max, clamp + +#define NVMATH_API +#define NVMATH_CLASS + +#define PI float(3.1415926535897932384626433833) +#define NV_EPSILON (0.0001f) +#define NV_NORMAL_EPSILON (0.001f) + +namespace nv +{ + inline float toRadian(float degree) { return degree * (PI / 180.0f); } + inline float toDegree(float radian) { return radian * (180.0f / PI); } + + // Robust floating point comparisons: + // http://realtimecollisiondetection.net/blog/?p=89 + inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON) + { + //return fabs(f0-f1) <= epsilon; + return fabs(f0-f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1)); + } + + inline bool isZero(const float f, const float epsilon = NV_EPSILON) + { + return fabsf(f) <= epsilon; + } + + inline bool isFinite(const float f) + { + return _finite(f) != 0; + } + + // Eliminates negative zeros from a float array. + inline void floatCleanup(float * fp, int n) + { + for (int i = 0; i < n; i++) { + //nvDebugCheck(isFinite(fp[i])); + union { float f; uint32 i; } x = { fp[i] }; + if (x.i == 0x80000000) fp[i] = 0.0f; + } + } + + inline float saturate(float f) { + return clamp(f, 0.0f, 1.0f); + } +} + +#endif // NV_MATH_H diff --git a/3rdparty/nvtt/nvmath/plane.h b/3rdparty/nvtt/nvmath/plane.h new file mode 100644 index 00000000..eb544b13 --- /dev/null +++ b/3rdparty/nvtt/nvmath/plane.h @@ -0,0 +1,40 @@ +// This code is in the public domain -- Ignacio Castaņo + +#ifndef NV_MATH_PLANE_H +#define NV_MATH_PLANE_H + +#include "nvmath.h" +#include "vector.h" + +namespace nv +{ + class Matrix; + + class NVMATH_CLASS Plane + { + public: + Plane(); + Plane(float x, float y, float z, float w); + Plane(const Vector4 & v); + Plane(const Vector3 & v, float d); + Plane(const Vector3 & normal, const Vector3 & point); + Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2); + + const Plane & operator=(const Plane & v); + + Vector3 vector() const; + float offset() const; + + void operator*=(float s); + + Vector4 v; + }; + + Plane transformPlane(const Matrix &, const Plane &); + + Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c); + + +} // nv namespace + +#endif // NV_MATH_PLANE_H diff --git a/3rdparty/nvtt/nvmath/plane.inl b/3rdparty/nvtt/nvmath/plane.inl new file mode 100644 index 00000000..73bf712c --- /dev/null +++ b/3rdparty/nvtt/nvmath/plane.inl @@ -0,0 +1,49 @@ +// This code is in the public domain -- Ignacio Castaņo + +#pragma once +#ifndef NV_MATH_PLANE_INL +#define NV_MATH_PLANE_INL + +#include "Plane.h" +#include "Vector.inl" + +namespace nv +{ + inline Plane::Plane() {} + inline Plane::Plane(float x, float y, float z, float w) : v(x, y, z, w) {} + inline Plane::Plane(const Vector4 & v) : v(v) {} + inline Plane::Plane(const Vector3 & 
v, float d) : v(v, d) {} + inline Plane::Plane(const Vector3 & normal, const Vector3 & point) : v(normal, -dot(normal, point)) {} + inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2) { + Vector3 n = cross(v1-v0, v2-v0); + float d = -dot(n, v0); + v = Vector4(n, d); + } + + inline const Plane & Plane::operator=(const Plane & p) { v = p.v; return *this; } + + inline Vector3 Plane::vector() const { return v.xyz(); } + inline float Plane::offset() const { return v.w; } + + // Normalize plane. + inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON) + { + const float len = length(plane.vector()); + const float inv = isZero(len, epsilon) ? 0 : 1.0f / len; + return Plane(plane.v * inv); + } + + // Get the signed distance from the given point to this plane. + inline float distance(const Plane & plane, const Vector3 & point) + { + return dot(plane.vector(), point) + plane.offset(); + } + + inline void Plane::operator*=(float s) + { + v *= s; + } + +} // nv namespace + +#endif // NV_MATH_PLANE_H diff --git a/3rdparty/nvtt/nvmath/vector.h b/3rdparty/nvtt/nvmath/vector.h new file mode 100644 index 00000000..180cfab0 --- /dev/null +++ b/3rdparty/nvtt/nvmath/vector.h @@ -0,0 +1,148 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#ifndef NV_MATH_VECTOR_H +#define NV_MATH_VECTOR_H + +#include "nvmath.h" + +namespace nv +{ + class NVMATH_CLASS Vector2 + { + public: + typedef Vector2 const & Arg; + + Vector2(); + explicit Vector2(float f); + Vector2(float x, float y); + Vector2(Vector2::Arg v); + + //template explicit Vector2(const T & v) : x(v.x), y(v.y) {} + //template operator T() const { return T(x, y); } + + const Vector2 & operator=(Vector2::Arg v); + + const float * ptr() const; + + void set(float x, float y); + + Vector2 operator-() const; + void operator+=(Vector2::Arg v); + void operator-=(Vector2::Arg v); + void operator*=(float s); + void operator*=(Vector2::Arg v); + + friend bool operator==(Vector2::Arg a, Vector2::Arg b); + friend bool operator!=(Vector2::Arg a, Vector2::Arg b); + + union { + struct { + float x, y; + }; + float component[2]; + }; + }; + + class NVMATH_CLASS Vector3 + { + public: + typedef Vector3 const & Arg; + + Vector3(); + explicit Vector3(float x); + //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {} + Vector3(float x, float y, float z); + Vector3(Vector2::Arg v, float z); + Vector3(Vector3::Arg v); + + //template explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {} + //template operator T() const { return T(x, y, z); } + + const Vector3 & operator=(Vector3::Arg v); + + Vector2 xy() const; + + const float * ptr() const; + + void set(float x, float y, float z); + + Vector3 operator-() const; + void operator+=(Vector3::Arg v); + void operator-=(Vector3::Arg v); + void operator*=(float s); + void operator/=(float s); + void operator*=(Vector3::Arg v); + void operator/=(Vector3::Arg v); + + friend bool operator==(Vector3::Arg a, Vector3::Arg b); + friend bool operator!=(Vector3::Arg a, Vector3::Arg b); + + union { + struct { + float x, y, z; + }; + float component[3]; + }; + }; + + class NVMATH_CLASS Vector4 + { + public: + typedef Vector4 const & Arg; + + Vector4(); + explicit Vector4(float x); + Vector4(float x, float y, float z, float w); + Vector4(Vector2::Arg v, float z, float w); + Vector4(Vector2::Arg v, Vector2::Arg u); + Vector4(Vector3::Arg v, float w); + Vector4(Vector4::Arg v); + // Vector4(const Quaternion & v); + + //template explicit Vector4(const T & v) : x(v.x), y(v.y), 
z(v.z), w(v.w) {} + //template operator T() const { return T(x, y, z, w); } + + const Vector4 & operator=(Vector4::Arg v); + + Vector2 xy() const; + Vector2 zw() const; + Vector3 xyz() const; + + const float * ptr() const; + + void set(float x, float y, float z, float w); + + Vector4 operator-() const; + void operator+=(Vector4::Arg v); + void operator-=(Vector4::Arg v); + void operator*=(float s); + void operator/=(float s); + void operator*=(Vector4::Arg v); + void operator/=(Vector4::Arg v); + + friend bool operator==(Vector4::Arg a, Vector4::Arg b); + friend bool operator!=(Vector4::Arg a, Vector4::Arg b); + + union { + struct { + float x, y, z, w; + }; + float component[4]; + }; + }; + +} // nv namespace + +// If we had these functions, they would be ambiguous, the compiler would not know which one to pick: +//template Vector2 to(const T & v) { return Vector2(v.x, v.y); } +//template Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); } +//template Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.z); } + +// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages. + +// Instead we simply have explicit casts: +template T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); } +template T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); } +template T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); } + +#endif // NV_MATH_VECTOR_H diff --git a/3rdparty/nvtt/nvtt.cpp b/3rdparty/nvtt/nvtt.cpp new file mode 100644 index 00000000..adb5ae55 --- /dev/null +++ b/3rdparty/nvtt/nvtt.cpp @@ -0,0 +1,95 @@ +/* + * Copyright 2011-2015 Branimir Karadzic. All rights reserved. + * License: http://www.opensource.org/licenses/BSD-2-Clause + */ + +#include "nvtt.h" + +#include +#include + +#include "bc6h/zoh.h" +#include "bc7/avpcl.h" +#include "nvmath/vector.inl" + +NVCORE_API int nvAbort(const char *, const char *, int , const char *, const char *, ...) 
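+// Editorial note: this appears to be a minimal stub for the nvcore assertion /
+// abort hook; instead of formatting and reporting the message it simply aborts.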
__attribute__((format (printf, 5, 6))) +{ + abort(); + return 0; +} + +namespace nvtt +{ + using namespace nv; + + void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output) + { + const uint8_t* src = (const uint8_t*)_input; + char* dst = (char*)_output; + + for (uint32_t yy = 0; yy < _height; yy += 4) + { + for (uint32_t xx = 0; xx < _width; xx += 4) + { + const Vector4* rgba = (const Vector4*)&src[yy*_stride + xx*sizeof(float)*4]; + + ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16; + ZOH::Tile zohTile(4, 4); + + memset(zohTile.data, 0, sizeof(zohTile.data) ); + memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map) ); + + for (uint32_t blockY = 0; blockY < 4; ++blockY) + { + for (uint32_t blockX = 0; blockX < 4; ++blockX) + { + Vector4 color = rgba[blockY*4 + blockX]; + uint16 rHalf = bx::halfFromFloat(color.x); + uint16 gHalf = bx::halfFromFloat(color.y); + uint16 bHalf = bx::halfFromFloat(color.z); + zohTile.data[blockY][blockX].x = ZOH::Tile::half2float(rHalf); + zohTile.data[blockY][blockX].y = ZOH::Tile::half2float(gHalf); + zohTile.data[blockY][blockX].z = ZOH::Tile::half2float(bHalf); + zohTile.importance_map[blockY][blockX] = 1.0f; + } + } + + ZOH::compress(zohTile, &dst[( (yy*_width) + xx)/4 * 16]); + } + } + } + + void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output) + { + const uint8_t* src = (const uint8_t*)_input; + char* dst = (char*)_output; + + for (uint32_t yy = 0; yy < _height; yy += 4) + { + for (uint32_t xx = 0; xx < _width; xx += 4) + { + const Vector4* rgba = (const Vector4*)&src[yy*_stride + xx*sizeof(float)*4]; + + AVPCL::mode_rgb = false; + AVPCL::flag_premult = false; + AVPCL::flag_nonuniform = false; + AVPCL::flag_nonuniform_ati = false; + + AVPCL::Tile avpclTile(4, 4); + memset(avpclTile.data, 0, sizeof(avpclTile.data) ); + for (uint32_t blockY = 0; blockY < 4; ++blockY) + { + for (uint32_t blockX = 0; blockX < 4; ++blockX) + { + Vector4 color = rgba[blockY*4 + blockX]; + avpclTile.data[blockY][blockX] = color * 255.0f; + avpclTile.importance_map[blockY][blockX] = 1.0f; + } + } + + AVPCL::compress(avpclTile, &dst[( (yy*_width) + xx)/4 * 16]); + } + } + } + +} //namespace nvtt diff --git a/3rdparty/nvtt/nvtt.h b/3rdparty/nvtt/nvtt.h new file mode 100644 index 00000000..a37c7cfb --- /dev/null +++ b/3rdparty/nvtt/nvtt.h @@ -0,0 +1,13 @@ +#ifndef NVTT_H +#define NVTT_H + +#include + +namespace nvtt +{ +void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output); +void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output); + +} // namespace nvtt + +#endif // NVTT_H diff --git a/scripts/texturec.lua b/scripts/texturec.lua index ab60f80b..72e9ab2a 100644 --- a/scripts/texturec.lua +++ b/scripts/texturec.lua @@ -12,6 +12,7 @@ project "texturec" path.join(BGFX_DIR, "include"), path.join(BGFX_DIR, "src"), path.join(BGFX_DIR, "3rdparty"), + path.join(BGFX_DIR, "3rdparty/nvtt"), } files { @@ -20,6 +21,8 @@ project "texturec" path.join(BGFX_DIR, "3rdparty/libsquish/**.h"), path.join(BGFX_DIR, "3rdparty/etc1/**.cpp"), path.join(BGFX_DIR, "3rdparty/etc1/**.h"), + path.join(BGFX_DIR, "3rdparty/nvtt/**.cpp"), + path.join(BGFX_DIR, "3rdparty/nvtt/**.h"), path.join(BGFX_DIR, "tools/texturec/**.cpp"), path.join(BGFX_DIR, "tools/texturec/**.h"), } diff --git a/tools/texturec/texturec.cpp b/tools/texturec/texturec.cpp index c5866b21..dbb8d508 100644 --- a/tools/texturec/texturec.cpp +++ 
b/tools/texturec/texturec.cpp @@ -13,6 +13,7 @@ #include "image.h" #include #include +#include #if 0 # define BX_TRACE(_format, ...) fprintf(stderr, "" _format "\n", ##__VA_ARGS__) @@ -113,6 +114,14 @@ int main(int _argc, const char* _argv[]) { format = TextureFormat::ETC1; } + else if (0 == bx::stricmp(type, "bc6h") ) + { + format = TextureFormat::BC6H; + } + else if (0 == bx::stricmp(type, "bc7") ) + { + format = TextureFormat::BC7; + } } uint32_t size = (uint32_t)bx::getSize(&reader); @@ -154,10 +163,33 @@ int main(int _argc, const char* _argv[]) ); break; + case TextureFormat::BC4: + case TextureFormat::BC5: + break; + + case TextureFormat::BC6H: + nvtt::compressBC6H(rgba, mip.m_width, mip.m_height, 4, output); + break; + + case TextureFormat::BC7: + nvtt::compressBC7(rgba, mip.m_width, mip.m_height, 4, output); + break; + case TextureFormat::ETC1: etc1_encode_image(rgba, mip.m_width, mip.m_height, 4, mip.m_width*4, output); break; + case TextureFormat::ETC2: + case TextureFormat::ETC2A: + case TextureFormat::ETC2A1: + case TextureFormat::PTC12: + case TextureFormat::PTC14: + case TextureFormat::PTC12A: + case TextureFormat::PTC14A: + case TextureFormat::PTC22: + case TextureFormat::PTC24: + break; + default: break; }
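+			// Editorial note: the BC4/BC5 and ETC2/PTC cases above are accepted but not
+			// encoded yet; they leave 'output' untouched. BC6H and BC7 write one 16-byte
+			// block per 4x4 texel tile, ETC1 writes 8-byte blocks.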