basis_universal: Update to upstream commit from Apr 16, 2021

BinomialLLC/basis_universal@ba1c3e40f1.
2021-05-07 17:00:41 +02:00 · 2021-05-07 17:00:41 +02:00 · 2d133177e9
parent 8976594f4b
commit 2d133177e9
74 changed files with 36770 additions and 9225 deletions
--- a/modules/basis_universal/SCsub
+++ b/modules/basis_universal/SCsub
@ -11,40 +11,45 @@ thirdparty_obj = []

 # Not unbundled so far since not widespread as shared library
 thirdparty_dir = "#thirdparty/basis_universal/"
-tool_sources = [
+# Sync list with upstream CMakeLists.txt
+encoder_sources = [
+    "apg_bmp.c",
    "basisu_astc_decomp.cpp",
    "basisu_backend.cpp",
    "basisu_basis_file.cpp",
+    "basisu_bc7enc.cpp",
    "basisu_comp.cpp",
    "basisu_enc.cpp",
    "basisu_etc.cpp",
    "basisu_frontend.cpp",
    "basisu_global_selector_palette_helpers.cpp",
    "basisu_gpu_texture.cpp",
+    "basisu_kernels_sse.cpp",
    "basisu_pvrtc1_4.cpp",
-    "basisu_resample_filters.cpp",
    "basisu_resampler.cpp",
+    "basisu_resample_filters.cpp",
    "basisu_ssim.cpp",
+    "basisu_uastc_enc.cpp",
+    "jpgd.cpp",
    "lodepng.cpp",
 ]
-tool_sources = [thirdparty_dir + file for file in tool_sources]
+encoder_sources = [thirdparty_dir + "encoder/" + file for file in encoder_sources]
 transcoder_sources = [thirdparty_dir + "transcoder/basisu_transcoder.cpp"]

 # Treat Basis headers as system headers to avoid raising warnings. Not supported on MSVC.
 if not env.msvc:
-    env_basisu.Append(
-        CPPFLAGS=["-isystem", Dir(thirdparty_dir).path, "-isystem", Dir(thirdparty_dir + "transcoder").path]
-    )
+    env_basisu.Append(CPPFLAGS=["-isystem", Dir(thirdparty_dir).path])
 else:
-    env_basisu.Prepend(CPPPATH=[thirdparty_dir, thirdparty_dir + "transcoder"])
+    env_basisu.Prepend(CPPPATH=[thirdparty_dir])

 if env["target"] == "debug":
-    env_basisu.Append(CPPFLAGS=["-DBASISU_DEVEL_MESSAGES=1", "-DBASISD_ENABLE_DEBUG_FLAGS=1"])
+    env_basisu.Append(CPPDEFINES=[("BASISU_DEVEL_MESSAGES", 1), ("BASISD_ENABLE_DEBUG_FLAGS", 1)])

 env_thirdparty = env_basisu.Clone()
 env_thirdparty.disable_warnings()
 if env["tools"]:
-    env_thirdparty.add_source_files(thirdparty_obj, tool_sources)
+    env_thirdparty.Append(CPPDEFINES=["BASISU_NO_IMG_LOADERS"])
+    env_thirdparty.add_source_files(thirdparty_obj, encoder_sources)
 env_thirdparty.add_source_files(thirdparty_obj, transcoder_sources)
 env.modules_sources += thirdparty_obj

--- a/modules/basis_universal/register_types.cpp
+++ b/modules/basis_universal/register_types.cpp
@ -35,7 +35,7 @@
 #include "texture_basisu.h"

 #ifdef TOOLS_ENABLED
-#include <basisu_comp.h>
+#include <encoder/basisu_comp.h>
 #endif

 #include <transcoder/basisu_transcoder.h>
@ -233,7 +233,7 @@ static Ref<Image> basis_universal_unpacker(const Vector<uint8_t> &p_buffer) {
 	basist::basisu_image_info info;
 	tr.get_image_info(ptr, size, info, 0);

-	int block_size = basist::basis_get_bytes_per_block(format);
+	int block_size = basist::basis_get_bytes_per_block_or_pixel(format);
 	Vector<uint8_t> gpudata;
 	gpudata.resize(info.m_total_blocks * block_size);

--- a/modules/basis_universal/texture_basisu.cpp
+++ b/modules/basis_universal/texture_basisu.cpp
@ -33,7 +33,7 @@
 #include "core/os/os.h"

 #ifdef TOOLS_ENABLED
-#include <basisu_comp.h>
+#include <encoder/basisu_comp.h>
 #endif

 #include <transcoder/basisu_transcoder.h>
--- a/modules/basis_universal/texture_basisu.h
+++ b/modules/basis_universal/texture_basisu.h
@ -34,7 +34,7 @@
 #include "scene/resources/texture.h"

 #ifdef TOOLS_ENABLED
-#include <basisu_comp.h>
+#include <encoder/basisu_comp.h>
 #endif

 #include <transcoder/basisu_transcoder.h>
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@ -8,13 +8,12 @@ readability.
 ## basis_universal

 - Upstream: https://github.com/BinomialLLC/basis_universal
- Version: git (895ee8ee7e04f22267f8d16d46de04d5a01d63ac, 2020)
+- Version: git (ba1c3e40f1d434ebaf9a167b44e9b11d2bf0f765, 2021)
 - License: Apache 2.0

 Files extracted from upstream source:

- `.cpp` and `.h` files in root folder except for `basisu_tool.cpp` (contains `main` and can cause link error)
- `.cpp`, `.h` and `.inc` files in `transcoder/`, keeping folder structure
+- `encoder/` and `transcoder/` folders
 - `LICENSE`


--- a/thirdparty/basis_universal/basisu_comp.cpp
+++ b/thirdparty/basis_universal/basisu_comp.cpp
--- a/thirdparty/basis_universal/basisu_etc.cpp
+++ b/thirdparty/basis_universal/basisu_etc.cpp
--- a/thirdparty/basis_universal/basisu_pvrtc1_4.cpp
+++ b/thirdparty/basis_universal/basisu_pvrtc1_4.cpp
@ -1,269 +0,0 @@
-// basisu_pvrtc1_4.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "basisu_pvrtc1_4.h"
-
-namespace basisu
-{
-	uint32_t pvrtc4_swizzle_uv(uint32_t width, uint32_t height, uint32_t x, uint32_t y)
-	{
-		assert((x < width) && (y < height) && basisu::is_pow2(height) && basisu::is_pow2(width));
-				
-		uint32_t min_d = width, max_v = y;
-		if (height < width)
-		{
-			min_d = height;
-			max_v = x;
-		}
-
-		// Interleave the XY LSB's
-		uint32_t shift_ofs = 0, swizzled = 0;
-		for (uint32_t s_bit = 1, d_bit = 1; s_bit < min_d; s_bit <<= 1, d_bit <<= 2, ++shift_ofs)
-		{
-			if (y & s_bit) swizzled |= d_bit;
-			if (x & s_bit) swizzled |= (2 * d_bit);
-		}
-
-		max_v >>= shift_ofs;
-		
-		// OR in the rest of the bits from the largest dimension
-		swizzled |= (max_v << (2 * shift_ofs));
-
-		return swizzled;
-	}
-
-	color_rgba pvrtc4_block::get_endpoint(uint32_t endpoint_index, bool unpack) const
-	{
-		assert(endpoint_index < 2);
-		const uint32_t packed = m_endpoints >> (endpoint_index * 16);
-
-		uint32_t r, g, b, a;
-		if (packed & 0x8000)
-		{
-			// opaque 554 or 555
-			if (!endpoint_index)
-			{
-				r = (packed >> 10) & 31;
-				g = (packed >> 5) & 31;
-				b = (packed >> 1) & 15;
-					
-				if (unpack)
-				{
-					b = (b << 1) | (b >> 3);
-				}
-			}
-			else
-			{
-				r = (packed >> 10) & 31;
-				g = (packed >> 5) & 31;
-				b = packed & 31;
-			}
-
-			a = unpack ? 255 : 7;
-		}
-		else
-		{
-			// translucent 4433 or 4443
-			if (!endpoint_index)
-			{
-				a = (packed >> 12) & 7;
-				r = (packed >> 8) & 15;
-				g = (packed >> 4) & 15;
-				b = (packed >> 1) & 7;
-
-				if (unpack)
-				{
-					a = (a << 1);
-					a = (a << 4) | a;
-						
-					r = (r << 1) | (r >> 3);
-					g = (g << 1) | (g >> 3);
-					b = (b << 2) | (b >> 1);
-				}
-			}
-			else
-			{
-				a = (packed >> 12) & 7;
-				r = (packed >> 8) & 15;
-				g = (packed >> 4) & 15;
-				b = packed & 15;
-
-				if (unpack)
-				{
-					a = (a << 1);
-					a = (a << 4) | a;
-
-					r = (r << 1) | (r >> 3);
-					g = (g << 1) | (g >> 3);
-					b = (b << 1) | (b >> 3);
-				}
-			}
-		}
-
-		if (unpack)
-		{
-			r = (r << 3) | (r >> 2);
-			g = (g << 3) | (g >> 2);
-			b = (b << 3) | (b >> 2);
-		}
-
-		assert((r < 256) && (g < 256) && (b < 256) && (a < 256));
-
-		return color_rgba(r, g, b, a);
-	}
-
-	color_rgba pvrtc4_block::get_endpoint_5554(uint32_t endpoint_index) const
-	{
-		assert(endpoint_index < 2);
-		const uint32_t packed = m_endpoints >> (endpoint_index * 16);
-
-		uint32_t r, g, b, a;
-		if (packed & 0x8000)
-		{
-			// opaque 554 or 555
-			if (!endpoint_index)
-			{
-				r = (packed >> 10) & 31;
-				g = (packed >> 5) & 31;
-				b = (packed >> 1) & 15;
-
-				b = (b << 1) | (b >> 3);
-			}
-			else
-			{
-				r = (packed >> 10) & 31;
-				g = (packed >> 5) & 31;
-				b = packed & 31;
-			}
-
-			a = 15;
-		}
-		else
-		{
-			// translucent 4433 or 4443
-			if (!endpoint_index)
-			{
-				a = (packed >> 12) & 7;
-				r = (packed >> 8) & 15;
-				g = (packed >> 4) & 15;
-				b = (packed >> 1) & 7;
-
-				a = a << 1;
-						
-				r = (r << 1) | (r >> 3);
-				g = (g << 1) | (g >> 3);
-				b = (b << 2) | (b >> 1);
-			}
-			else
-			{
-				a = (packed >> 12) & 7;
-				r = (packed >> 8) & 15;
-				g = (packed >> 4) & 15;
-				b = packed & 15;
-
-				a = a << 1;
-						
-				r = (r << 1) | (r >> 3);
-				g = (g << 1) | (g >> 3);
-				b = (b << 1) | (b >> 3);
-			}
-		}
-						
-		assert((r < 32) && (g < 32) && (b < 32) && (a < 16));
-
-		return color_rgba(r, g, b, a);
-	}
-
-	bool pvrtc4_image::get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const
-	{
-		assert((x < m_width) && (y < m_height));
-
-		int block_x0 = (static_cast<int>(x) - 2) >> 2;
-		int block_x1 = block_x0 + 1;
-		int block_y0 = (static_cast<int>(y) - 2) >> 2;
-		int block_y1 = block_y0 + 1;
-		
-		block_x0 = posmod(block_x0, m_block_width);
-		block_x1 = posmod(block_x1, m_block_width);
-		block_y0 = posmod(block_y0, m_block_height);
-		block_y1 = posmod(block_y1, m_block_height);
-		
-		pColors[0] = interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0));
-		pColors[3] = interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1));
-
-		if (get_block_uses_transparent_modulation(x >> 2, y >> 2))
-		{
-			for (uint32_t c = 0; c < 4; c++)
-			{
-				uint32_t m = (pColors[0][c] + pColors[3][c]) / 2;
-				pColors[1][c] = static_cast<uint8_t>(m);
-				pColors[2][c] = static_cast<uint8_t>(m);
-			}
-			pColors[2][3] = 0;
-			return true;
-		}
-
-		for (uint32_t c = 0; c < 4; c++)
-		{
-			pColors[1][c] = static_cast<uint8_t>((pColors[0][c] * 5 + pColors[3][c] * 3) / 8);
-			pColors[2][c] = static_cast<uint8_t>((pColors[0][c] * 3 + pColors[3][c] * 5) / 8);
-		}
-
-		return false;
-	}
-		
-	color_rgba pvrtc4_image::get_pixel(uint32_t x, uint32_t y, uint32_t m) const
-	{
-		assert((x < m_width) && (y < m_height));
-
-		int block_x0 = (static_cast<int>(x) - 2) >> 2;
-		int block_x1 = block_x0 + 1;
-		int block_y0 = (static_cast<int>(y) - 2) >> 2;
-		int block_y1 = block_y0 + 1;
-		
-		block_x0 = posmod(block_x0, m_block_width);
-		block_x1 = posmod(block_x1, m_block_width);
-		block_y0 = posmod(block_y0, m_block_height);
-		block_y1 = posmod(block_y1, m_block_height);
-		
-		if (get_block_uses_transparent_modulation(x >> 2, y >> 2))
-		{
-			if (m == 0)
-				return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0));
-			else if (m == 3)
-				return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1));
-
-			color_rgba l(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)));
-			color_rgba h(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)));
-
-			return color_rgba((l[0] + h[0]) / 2, (l[1] + h[1]) / 2, (l[2] + h[2]) / 2, (m == 2) ? 0 : (l[3] + h[3]) / 2);
-		}
-		else
-		{
-			if (m == 0)
-				return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0));
-			else if (m == 3)
-				return interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1));
-
-			color_rgba l(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(0), m_blocks(block_x1, block_y0).get_endpoint_5554(0), m_blocks(block_x0, block_y1).get_endpoint_5554(0), m_blocks(block_x1, block_y1).get_endpoint_5554(0)));
-			color_rgba h(interpolate(x, y, m_blocks(block_x0, block_y0).get_endpoint_5554(1), m_blocks(block_x1, block_y0).get_endpoint_5554(1), m_blocks(block_x0, block_y1).get_endpoint_5554(1), m_blocks(block_x1, block_y1).get_endpoint_5554(1)));
-
-			if (m == 2)
-				return color_rgba((l[0] * 3 + h[0] * 5) / 8, (l[1] * 3 + h[1] * 5) / 8, (l[2] * 3 + h[2] * 5) / 8, (l[3] * 3 + h[3] * 5) / 8);
-			else
-				return color_rgba((l[0] * 5 + h[0] * 3) / 8, (l[1] * 5 + h[1] * 3) / 8, (l[2] * 5 + h[2] * 3) / 8, (l[3] * 5 + h[3] * 3) / 8);
-		}
-	}
-
-} // basisu
--- a/thirdparty/basis_universal/encoder/apg_bmp.c
+++ b/thirdparty/basis_universal/encoder/apg_bmp.c
@ -0,0 +1,541 @@
+/*
+BMP File Reader/Writer Implementation
+Anton Gerdelan
+Version: 3
+Licence: see apg_bmp.h
+C99
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS 1
+#endif
+
+#include "apg_bmp.h"
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Maximum pixel dimensions of width or height of an image. Should accommodate max used in graphics APIs.
+   NOTE: 65536*65536 is 2^32, which is already one more than the largest value storable in 32 bits.
+   The pixel count is further multiplied by n_channels, so memory sizes and indices must be size_t (not uint32_t) to avoid overflow.
+   Note that libraries using 32-bit sizes (stb_image_write et al) will crash at the maximum size, so reduce the max size to accommodate them. */
+#define _BMP_MAX_DIMS 65536
+#define _BMP_FILE_HDR_SZ 14
+#define _BMP_MIN_DIB_HDR_SZ 40
+#define _BMP_MIN_HDR_SZ ( _BMP_FILE_HDR_SZ + _BMP_MIN_DIB_HDR_SZ )
+#define _BMP_MAX_IMAGE_FILE_SIZE (1024ULL*1024ULL*1024ULL)
+
+#pragma pack( push, 1 ) // supported on GCC in addition to individual packing attribs
+/* All BMP files, regardless of type, start with this file header.
+   The surrounding #pragma pack( push, 1 ) removes padding so the fields below add up to
+   _BMP_FILE_HDR_SZ (14) bytes and the struct can be overlaid directly on the file contents. */
+typedef struct _bmp_file_header_t {
+  char file_type[2];          // magic bytes; the reader requires 'B','M'
+  uint32_t file_sz;           // the writer stores file header + DIB header + padded pixel data size here
+  uint16_t reserved1;         // written as 0
+  uint16_t reserved2;         // written as 0
+  uint32_t image_data_offset; // byte offset from the start of the file to the pixel (or palette index) data
+} _bmp_file_header_t;
+
+/* Following the file header is the BMP type header. this is the most commonly used format.
+   It is overlaid on the file contents at byte offset _BMP_FILE_HDR_SZ, and is packed (no padding)
+   by the surrounding #pragma pack( push, 1 ). */
+typedef struct _bmp_dib_BITMAPINFOHEADER_t {
+  uint32_t this_header_sz;        // size of the DIB header in bytes; the writer always stores _BMP_MIN_DIB_HDR_SZ (40)
+  int32_t w;                      // in older headers w & h these are shorts and may be unsigned
+  int32_t h;                      //
+  uint16_t n_planes;              // must be 1
+  uint16_t bpp;                   // bits per pixel. 1,4,8,16,24,32.
+  uint32_t compression_method;    // 16 and 32-bit images must have a value of 3 here
+  uint32_t image_uncompressed_sz; // not consistently used in the wild, so ignored here.
+  int32_t horiz_pixels_per_meter; // not used.
+  int32_t vert_pixels_per_meter;  // not used.
+  uint32_t n_colours_in_palette;  //
+  uint32_t n_important_colours;   // not used.
+  /* NOTE(Anton) a DIB header may end here at 40-bytes. be careful using sizeof() */
+  /* if 'compression' value, above, is set to 3 ie the image is 16 or 32-bit, then these colour channel masks follow the headers.
+  these are big-endian order bit masks to assign bits of each pixel to different colours. bits used must be contiguous and not overlap. */
+  uint32_t bitmask_r; // the reader only uses the three masks when compression_method is BI_BITFIELDS or BI_ALPHABITFIELDS
+  uint32_t bitmask_g;
+  uint32_t bitmask_b;
+} _bmp_dib_BITMAPINFOHEADER_t;
+#pragma pack( pop )
+
+typedef enum _bmp_compression_t { // values compared against compression_method in the DIB header
+  BI_RGB            = 0,  // accepted by the reader; written for 3-channel output
+  BI_RLE8           = 1,
+  BI_RLE4           = 2,
+  BI_BITFIELDS      = 3,  // accepted by the reader (channel bitmasks follow the header); written for 4-channel output
+  BI_JPEG           = 4,
+  BI_PNG            = 5,
+  BI_ALPHABITFIELDS = 6,  // accepted by the reader, handled the same way as BI_BITFIELDS
+  BI_CMYK           = 11,
+  BI_CMYKRLE8       = 12,
+  BI_CMYRLE4        = 13  // every value not marked as accepted above is rejected by _validate_dib_hdr()
+} _bmp_compression_t;
+
+/* convenience struct and file->memory function.
+   Filled in by _read_entire_file(); the holder of the record is responsible for free()ing data. */
+typedef struct _entire_file_t {
+  void* data; // malloc'd copy of the complete file contents
+  size_t sz;  // number of bytes in data
+} _entire_file_t;
+
+/*
+Reads the whole of `filename` into a freshly malloc'd buffer.
+
+PARAMETERS
+- filename - path handed to fopen() in "rb" mode.
+- record   - receives the buffer (data) and its size in bytes (sz).
+
+RETURNS
+- true on success. record->data is allocated memory and must be freed by the caller.
+- false on any error (open/seek/tell/read failure, empty file, file larger than
+  _BMP_MAX_IMAGE_FILE_SIZE, or out of memory). Any allocated memory is freed if false
+  is returned, and record is left as { NULL, 0 }. */
+static bool _read_entire_file( const char* filename, _entire_file_t* record ) {
+  record->data = NULL;
+  record->sz   = 0;
+
+  FILE* fp = fopen( filename, "rb" );
+  if ( !fp ) { return false; }
+
+  // ftell() reports failure as -1L; inspect it before a cast to size_t hides the sign.
+  long file_sz = -1L;
+  if ( 0 == fseek( fp, 0L, SEEK_END ) ) { file_sz = ftell( fp ); }
+
+  // Immediately bail on errors, empty files, and anything larger than _BMP_MAX_IMAGE_FILE_SIZE.
+  if ( file_sz <= 0 || (unsigned long long)file_sz > _BMP_MAX_IMAGE_FILE_SIZE ) {
+    fclose( fp );
+    return false;
+  }
+
+  void* data = malloc( (size_t)file_sz );
+  if ( !data ) {
+    fclose( fp );
+    return false;
+  }
+  rewind( fp );
+  size_t nr = fread( data, (size_t)file_sz, 1, fp );
+  fclose( fp );
+  if ( 1 != nr ) {
+    free( data ); // a short read used to leak this buffer, contradicting the contract above
+    return false;
+  }
+
+  record->data = data;
+  record->sz   = (size_t)file_sz;
+  return true;
+}
+
+static bool _validate_file_hdr( _bmp_file_header_t* file_hdr_ptr, size_t file_sz ) { // true if the file header is plausible for a file of file_sz bytes
+  if ( !file_hdr_ptr ) { return false; }
+  if ( file_hdr_ptr->file_type[0] != 'B' || file_hdr_ptr->file_type[1] != 'M' ) { return false; } // magic must be "BM"
+  if ( file_hdr_ptr->image_data_offset > file_sz ) { return false; } // pixel data may not start past the end of the file
+  return true;
+}
+
+/* Sanity-checks the DIB header of a file that is file_sz bytes long.
+RETURNS
+- true if the header fits inside the file, uses a compression method the reader supports
+  (BI_RGB, BI_BITFIELDS or BI_ALPHABITFIELDS; bitfields are mandatory for 16 and 32 bpp),
+  and both dimensions are non-zero with a magnitude of at most _BMP_MAX_DIMS.
+- false otherwise, including when dib_hdr_ptr is NULL. */
+static bool _validate_dib_hdr( _bmp_dib_BITMAPINFOHEADER_t* dib_hdr_ptr, size_t file_sz ) {
+  if ( !dib_hdr_ptr ) { return false; }
+  // widen before adding so that a huge this_header_sz cannot wrap around 32 bits and slip past the check
+  if ( (uint64_t)_BMP_FILE_HDR_SZ + (uint64_t)dib_hdr_ptr->this_header_sz > (uint64_t)file_sz ) { return false; }
+
+  const uint32_t compression = dib_hdr_ptr->compression_method;
+  const bool uses_bitfields  = ( BI_BITFIELDS == compression || BI_ALPHABITFIELDS == compression );
+  // 16 and 32-bit images must come with channel bitmasks
+  if ( ( 32 == dib_hdr_ptr->bpp || 16 == dib_hdr_ptr->bpp ) && !uses_bitfields ) { return false; }
+  // every other compression method (RLE, JPEG, PNG, CMYK...) is unsupported
+  if ( BI_RGB != compression && !uses_bitfields ) { return false; }
+
+  // NOTE: plain range comparisons are used instead of abs()/labs(). Negating INT32_MIN is undefined
+  // behaviour wherever long is 32 bits wide, which could let an invalid dimension through.
+  const int32_t img_w = dib_hdr_ptr->w;
+  const int32_t img_h = dib_hdr_ptr->h;
+  if ( 0 == img_w || 0 == img_h ) { return false; }
+  if ( img_w > _BMP_MAX_DIMS || img_w < -_BMP_MAX_DIMS ) { return false; }
+  if ( img_h > _BMP_MAX_DIMS || img_h < -_BMP_MAX_DIMS ) { return false; }
+
+  /* NOTE(Anton) if images reliably used n_colours_in_palette we could have done a palette/file size integrity check here.
+  because some always set 0 then we have to check every palette indexing as we read them */
+  return true;
+}
+
+/* Returns the zero-based index of the least-significant set bit in dword,
+   or (uint32_t)-1 when dword is 0 (no bit is set) - callers must not use that value as a shift count.
+   NOTE(Anton) this could have ifdef branches on different compilers for the intrinsics versions for perf */
+static uint32_t _bitscan( uint32_t dword ) {
+  for ( uint32_t i = 0; i < 32; i++ ) {
+    if ( 1 & dword ) { return i; }
+    dword = dword >> 1;
+  }
+  return (uint32_t)-1;
+}
+
+/*
+Reads a BMP file from disk and decodes it into a tightly-packed heap buffer.
+
+PARAMETERS
+- filename - path of the BMP file to load.
+- w, h     - receive the absolute image width and height in pixels.
+- n_chans  - receives the number of bytes per output pixel: 4 for 32bpp sources, otherwise 3.
+
+RETURNS
+- NULL on any error: a NULL argument, a file that cannot be read, failed header validation,
+  an unsupported bit depth (e.g. 2bpp or 16bpp), truncated pixel data, or out of memory.
+- Otherwise a buffer of (*w) * (*h) * (*n_chans) bytes that the caller releases with apg_bmp_free().
+  Some mid-decode failures in paletted images (e.g. a palette index that points outside the file)
+  return the pixels decoded so far; the rest of the buffer is zero-filled.
+
+All size arithmetic on file-controlled values is done in 64 bits: width and height may each be
+65536, so products such as width * height wrap around in 32 bits and would defeat the bounds checks.
+*/
+unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int* n_chans ) {
+  if ( !filename || !w || !h || !n_chans ) { return NULL; }
+
+  // read in the whole file into memory first - much faster than parsing on-the-fly
+  _entire_file_t record;
+  if ( !_read_entire_file( filename, &record ) ) { return NULL; }
+  if ( record.sz < _BMP_MIN_HDR_SZ ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // grab and validate the first, file, header
+  _bmp_file_header_t* file_hdr_ptr = (_bmp_file_header_t*)record.data;
+  if ( !_validate_file_hdr( file_hdr_ptr, record.sz ) ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // grab and validate the second, DIB, header
+  _bmp_dib_BITMAPINFOHEADER_t* dib_hdr_ptr = (_bmp_dib_BITMAPINFOHEADER_t*)( (uint8_t*)record.data + _BMP_FILE_HDR_SZ );
+  if ( !_validate_dib_hdr( dib_hdr_ptr, record.sz ) ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // bitmaps can have negative dims to indicate the image should be flipped
+  uint32_t width = *w = abs( dib_hdr_ptr->w );
+  uint32_t height = *h = abs( dib_hdr_ptr->h );
+
+  // TODO(Anton) flip image memory at the end if this is true. because doing it per row was making me write bugs.
+  // bool vertically_flip = dib_hdr_ptr->h > 0 ? false : true;
+
+  // channel count and palette are not well defined in the header so we make a good guess here
+  uint32_t n_dst_chans = 3, n_src_chans = 3;
+  bool has_palette = false;
+  switch ( dib_hdr_ptr->bpp ) {
+  case 32: n_dst_chans = n_src_chans = 4; break; // technically can be RGB but not supported
+  case 24: n_dst_chans = n_src_chans = 3; break; // technically can be RGBA but not supported
+  case 8:                                        // seems to always use a BGR0 palette, even for greyscale
+    n_dst_chans = 3;
+    has_palette = true;
+    n_src_chans = 1;
+    break;
+  case 4: // always has a palette - needed for a MS-saved BMP
+    n_dst_chans = 3;
+    has_palette = true;
+    n_src_chans = 1;
+    break;
+  case 1: // 1-bpp means the palette has 3 colour channels with 2 colours i.e. monochrome but not always black & white
+    n_dst_chans = 3;
+    has_palette = true;
+    n_src_chans = 1;
+    break;
+  default: // this includes 2bpp and 16bpp
+    free( record.data );
+    return NULL;
+  } // endswitch
+  *n_chans = n_dst_chans;
+  // NOTE(Anton) some image formats are not allowed a palette - could check for a bad header spec here also
+  if ( dib_hdr_ptr->n_colours_in_palette > 0 ) { has_palette = true; }
+
+#ifdef APG_BMP_DEBUG_OUTPUT
+  printf( "apg_bmp_debug: reading image\n|-filename `%s`\n|-dims %ux%u pixels\n|-bpp %u\n|-n_src_chans %u\n|-n_dst_chans %u\n", filename, *w, *h,
+    dib_hdr_ptr->bpp, n_src_chans, n_dst_chans );
+#endif
+
+  uint32_t palette_offset = _BMP_FILE_HDR_SZ + dib_hdr_ptr->this_header_sz;
+  bool has_bitmasks       = false;
+  if ( BI_BITFIELDS == dib_hdr_ptr->compression_method || BI_ALPHABITFIELDS == dib_hdr_ptr->compression_method ) {
+    has_bitmasks = true;
+    palette_offset += 12;
+  }
+  if ( palette_offset > record.sz ) {
+    free( record.data );
+    return NULL;
+  }
+  // the bitmask fields are read straight out of the header struct below, so the file must actually contain them
+  if ( has_bitmasks && record.sz < _BMP_FILE_HDR_SZ + sizeof( _bmp_dib_BITMAPINFOHEADER_t ) ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // work out if any padding how much to skip at end of each row
+  uint32_t unpadded_row_sz = width * n_src_chans;
+  // bit-encoded palette indices have different padding properties
+  if ( 4 == dib_hdr_ptr->bpp ) {
+    unpadded_row_sz = width % 2 > 0 ? width / 2 + 1 : width / 2; // find how many whole bytes required for this bit width
+  }
+  if ( 1 == dib_hdr_ptr->bpp ) {
+    unpadded_row_sz = width % 8 > 0 ? width / 8 + 1 : width / 8; // find how many whole bytes required for this bit width
+  }
+  uint32_t row_padding_sz = 0 == unpadded_row_sz % 4 ? 0 : 4 - ( unpadded_row_sz % 4 ); // rows are padded up to a multiple of 4 bytes
+
+  // 64-bit copies of the values used in size checks (see the header comment)
+  const uint64_t src_data_offset = file_hdr_ptr->image_data_offset;
+  const uint64_t n_pixels        = (uint64_t)width * (uint64_t)height;
+  const uint64_t dst_sz          = n_pixels * (uint64_t)n_dst_chans;
+
+  // another file size integrity check: partially validate source image data size
+  // 'image_data_offset' is by row padded to 4 bytes and is either colour data or palette indices.
+  if ( src_data_offset + (uint64_t)( unpadded_row_sz + row_padding_sz ) * (uint64_t)height > (uint64_t)record.sz ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // find which bit number each colour channel starts at, so we can separate colours out
+  uint32_t bitshift_rgba[4] = {0, 0, 0, 0}; // NOTE(Anton) noticed this was int and not uint32_t so changed it. 17 Mar 2020
+  uint32_t bitmask_a        = 0;
+  if ( has_bitmasks ) {
+    bitmask_a        = ~( dib_hdr_ptr->bitmask_r | dib_hdr_ptr->bitmask_g | dib_hdr_ptr->bitmask_b );
+    bitshift_rgba[0] = _bitscan( dib_hdr_ptr->bitmask_r );
+    bitshift_rgba[1] = _bitscan( dib_hdr_ptr->bitmask_g );
+    bitshift_rgba[2] = _bitscan( dib_hdr_ptr->bitmask_b );
+    bitshift_rgba[3] = _bitscan( bitmask_a );
+    // _bitscan() returns (uint32_t)-1 for an empty mask and shifting by that is undefined behaviour.
+    // An empty mask always gives 0 after the AND, so a shift of 0 produces the same, well-defined, result.
+    for ( uint32_t i = 0; i < 4; i++ ) {
+      if ( bitshift_rgba[i] > 31 ) { bitshift_rgba[i] = 0; }
+    }
+  }
+
+  // allocate memory for the output pixels block. the size is computed in 64 bits and checked against SIZE_MAX
+  // so it cannot wrap on platforms with a 32-bit size_t. calloc is used so that the early "partial image"
+  // returns below never hand uninitialised heap memory back to the caller.
+  if ( dst_sz > (uint64_t)SIZE_MAX ) {
+    free( record.data );
+    return NULL;
+  }
+  unsigned char* dst_img_ptr = calloc( (size_t)dst_sz, 1 );
+  if ( !dst_img_ptr ) {
+    free( record.data );
+    return NULL;
+  }
+
+  uint8_t* palette_data_ptr = (uint8_t*)record.data + palette_offset;
+  uint8_t* src_img_ptr      = (uint8_t*)record.data + file_hdr_ptr->image_data_offset;
+  size_t dst_stride_sz      = width * n_dst_chans;
+
+  //   == 32-bpp -> 32-bit RGBA. == 32-bit and 16-bit require bitmasks
+  if ( 32 == dib_hdr_ptr->bpp ) {
+    // check source image has enough data in it to read from
+    if ( src_data_offset + n_pixels * (uint64_t)n_src_chans > (uint64_t)record.sz ) {
+      free( record.data );
+      free( dst_img_ptr );
+      return NULL;
+    }
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = r * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        uint32_t pixel;
+        memcpy( &pixel, &src_img_ptr[src_byte_idx], 4 );
+        // NOTE(Anton) the below assumes 32-bits is always RGBA 1 byte per channel. 10,10,10 RGB exists though and isn't handled.
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_r ) >> bitshift_rgba[0] );
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_g ) >> bitshift_rgba[1] );
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_b ) >> bitshift_rgba[2] );
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & bitmask_a ) >> bitshift_rgba[3] );
+        src_byte_idx += 4;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+
+    // == 8-bpp -> 24-bit RGB ==
+  } else if ( 8 == dib_hdr_ptr->bpp && has_palette ) {
+    // validate indices (body of image data) fits in file
+    if ( src_data_offset + n_pixels > (uint64_t)record.sz ) {
+      free( record.data );
+      free( dst_img_ptr );
+      return NULL;
+    }
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        // "most palettes are 4 bytes in RGB0 order but 3 for..." - it was actually BRG0 in old images -- Anton
+        uint8_t index = src_img_ptr[src_byte_idx]; // 8-bit index value per pixel
+
+        if ( palette_offset + index * 4 + 2 >= record.sz ) { // palette entry lies outside the file: return what we have
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 0];
+        src_byte_idx++;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+
+    // == 4-bpp (16-colour) -> 24-bit RGB ==
+  } else if ( 4 == dib_hdr_ptr->bpp && has_palette ) {
+    // NOTE(review): for odd widths the low nibble of a row's last byte appears to be consumed as the first
+    // pixel of the next row - confirm against the BMP row-padding rules before relying on odd-width 4bpp input.
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        // '>=' because the byte at offset == file size is already one past the end of the buffer
+        if ( src_data_offset + (uint64_t)src_byte_idx >= (uint64_t)record.sz ) {
+          free( record.data );
+          free( dst_img_ptr );
+          return NULL;
+        }
+        // handle 2 pixels at a time
+        uint8_t pixel_duo = src_img_ptr[src_byte_idx];
+        uint8_t a_index   = ( 0xFF & pixel_duo ) >> 4;
+        uint8_t b_index   = 0xF & pixel_duo;
+
+        if ( palette_offset + a_index * 4 + 2 >= record.sz ) { // invalid src image
+          free( record.data );
+          return dst_img_ptr;
+        }
+        if ( (uint64_t)dst_pixels_idx + 3 > dst_sz ) { // done
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 0];
+        if ( ++c >= width ) { // advance a column
+          c = 0;
+          r++;
+          if ( r >= height ) { // done. no need to get second pixel. eg a 1x1 pixel image.
+            free( record.data );
+            return dst_img_ptr;
+          }
+          dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+        }
+
+        if ( palette_offset + b_index * 4 + 2 >= record.sz ) { // invalid src image
+          free( record.data );
+          return dst_img_ptr;
+        }
+        if ( (uint64_t)dst_pixels_idx + 3 > dst_sz ) { // done. probably redundant check since checking r >= height.
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 0];
+        src_byte_idx++;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+
+    // == 1-bpp -> 24-bit RGB ==
+  } else if ( 1 == dib_hdr_ptr->bpp && has_palette ) {
+    /* encoding method for monochrome is not well documented.
+    a 2x2 pixel image is stored as 4 1-bit palette indexes
+    the palette is stored as any 2 RGB0 colours (not necessarily B&W)
+    so for an image with indexes like so:
+    1 1
+    0 1
+    it is bit-encoded as follows, starting at MSB:
+    01000000 00000000 00000000 00000000 (first byte val  64)
+    11000000 00000000 00000000 00000000 (first byte val 192)
+    data is still split by row and each row padded to 4 byte multiples
+     */
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      uint8_t bit_idx       = 0; // used in monochrome
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        if ( 8 == bit_idx ) { // start reading from the next byte
+          src_byte_idx++;
+          bit_idx = 0;
+        }
+        // '>=' because the byte at offset == file size is already one past the end of the buffer
+        if ( src_data_offset + (uint64_t)src_byte_idx >= (uint64_t)record.sz ) {
+          free( record.data );
+          return dst_img_ptr;
+        }
+        uint8_t pixel_oct   = src_img_ptr[src_byte_idx];
+        uint8_t bit         = 128 >> bit_idx;
+        uint8_t masked      = pixel_oct & bit;
+        uint8_t palette_idx = masked > 0 ? 1 : 0;
+
+        if ( palette_offset + palette_idx * 4 + 2 >= record.sz ) {
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 0];
+        bit_idx++;
+      }
+      src_byte_idx += ( row_padding_sz + 1 ); // 1bpp is special here
+    }
+
+    // == 24-bpp -> 24-bit RGB == (but also should handle some other n_chans cases)
+  } else {
+    // NOTE(Anton) this only supports 1 byte per channel
+    if ( src_data_offset + dst_sz > (uint64_t)record.sz ) {
+      free( record.data );
+      free( dst_img_ptr );
+      return NULL;
+    }
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        // re-orders from BGR to RGB
+        if ( n_dst_chans > 3 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 3]; }
+        if ( n_dst_chans > 2 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 2]; }
+        if ( n_dst_chans > 1 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 1]; }
+        dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx];
+        src_byte_idx += n_src_chans;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+  } // endif bpp
+
+  free( record.data );
+  return dst_img_ptr;
+}
+
+void apg_bmp_free( unsigned char* pixels_ptr ) { // releases a pixel buffer with free(), the allocator family apg_bmp_read() uses
+  if ( !pixels_ptr ) { return; } // NULL is accepted and ignored
+  free( pixels_ptr );
+}
+
+unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int w, int h, unsigned int n_chans ) { // writes a 24bpp (3 channels) or 32bpp (4 channels) BMP; returns 1 on success, 0 on any failure
+  if ( !filename || !pixels_ptr ) { return 0; }
+  if ( 0 == w || 0 == h ) { return 0; }
+  if ( labs( w ) > _BMP_MAX_DIMS || labs( h ) > _BMP_MAX_DIMS ) { return 0; }
+  if ( n_chans != 3 && n_chans != 4 ) { return 0; }
+
+  uint32_t height = (uint32_t)labs( h );
+  uint32_t width  = (uint32_t)labs( w );
+  // work out if any padding how much to skip at end of each row
+  const size_t unpadded_row_sz      = width * n_chans;
+  const size_t row_padding_sz       = 0 == unpadded_row_sz % 4 ? 0 : 4 - unpadded_row_sz % 4;
+  const size_t row_sz               = unpadded_row_sz + row_padding_sz;
+  const size_t dst_pixels_padded_sz = row_sz * height;
+
+  const size_t dib_hdr_sz = sizeof( _bmp_dib_BITMAPINFOHEADER_t ); // includes the three bitmask fields that follow the 40-byte header
+  _bmp_file_header_t file_hdr;
+  {
+    file_hdr.file_type[0]      = 'B';
+    file_hdr.file_type[1]      = 'M';
+    file_hdr.file_sz           = _BMP_FILE_HDR_SZ + (uint32_t)dib_hdr_sz + (uint32_t)dst_pixels_padded_sz;
+    file_hdr.reserved1         = 0;
+    file_hdr.reserved2         = 0;
+    file_hdr.image_data_offset = _BMP_FILE_HDR_SZ + (uint32_t)dib_hdr_sz;
+  }
+  _bmp_dib_BITMAPINFOHEADER_t dib_hdr;
+  {
+    dib_hdr.this_header_sz         = _BMP_MIN_DIB_HDR_SZ; // NOTE: must be 40 and not include the bitmask memory in size here
+    dib_hdr.w                      = w; // the signed values are stored as given, not the absolute ones used for sizing
+    dib_hdr.h                      = h;
+    dib_hdr.n_planes               = 1;
+    dib_hdr.bpp                    = 3 == n_chans ? 24 : 32;
+    dib_hdr.compression_method     = 3 == n_chans ? BI_RGB : BI_BITFIELDS;
+    dib_hdr.image_uncompressed_sz  = 0;
+    dib_hdr.horiz_pixels_per_meter = 0;
+    dib_hdr.vert_pixels_per_meter  = 0;
+    dib_hdr.n_colours_in_palette   = 0;
+    dib_hdr.n_important_colours    = 0;
+    // big-endian masks. only used in BI_BITFIELDS and BI_ALPHABITFIELDS ( 16 and 32-bit images )
+    // important note: GIMP stores BMP data in this array order for 32-bit: [A][B][G][R]
+    dib_hdr.bitmask_r = 0xFF000000;
+    dib_hdr.bitmask_g = 0x00FF0000;
+    dib_hdr.bitmask_b = 0x0000FF00;
+  }
+
+  uint8_t* dst_pixels_ptr = malloc( dst_pixels_padded_sz ); // staging buffer holding every padded output row
+  if ( !dst_pixels_ptr ) { return 0; }
+  {
+    size_t dst_byte_idx = 0;
+    uint8_t padding[4]  = {0, 0, 0, 0};
+    uint8_t rgba[4]     = {0, 0, 0, 0};
+    uint8_t bgra[4]     = {0, 0, 0, 0};
+
+    for ( uint32_t row = 0; row < height; row++ ) {
+      size_t src_byte_idx = ( height - 1 - row ) * n_chans * width; // input rows are emitted in reverse order
+      for ( uint32_t col = 0; col < width; col++ ) {
+        for ( uint32_t chan = 0; chan < n_chans; chan++ ) { rgba[chan] = pixels_ptr[src_byte_idx++]; }
+        if ( 3 == n_chans ) {
+          bgra[0] = rgba[2];
+          bgra[1] = rgba[1];
+          bgra[2] = rgba[0];
+        } else {
+          /* NOTE(Anton) RGBA with alpha channel would be better supported with an extended DIB header */
+          bgra[0] = rgba[3];
+          bgra[1] = rgba[2];
+          bgra[2] = rgba[1];
+          bgra[3] = rgba[0]; // alpha
+        }
+        memcpy( &dst_pixels_ptr[dst_byte_idx], bgra, n_chans );
+        dst_byte_idx += (size_t)n_chans;
+      } // endfor col
+      if ( row_padding_sz > 0 ) {
+        memcpy( &dst_pixels_ptr[dst_byte_idx], padding, row_padding_sz );
+        dst_byte_idx += row_padding_sz;
+      }
+    } // endfor row
+  }
+  {
+    FILE* fp = fopen( filename, "wb" );
+    if ( !fp ) {
+      free( dst_pixels_ptr );
+      return 0;
+    }
+    if ( 1 != fwrite( &file_hdr, _BMP_FILE_HDR_SZ, 1, fp ) ) {
+      free( dst_pixels_ptr );
+      fclose( fp );
+      return 0;
+    }
+    if ( 1 != fwrite( &dib_hdr, dib_hdr_sz, 1, fp ) ) {
+      free( dst_pixels_ptr );
+      fclose( fp );
+      return 0;
+    }
+    if ( 1 != fwrite( dst_pixels_ptr, dst_pixels_padded_sz, 1, fp ) ) {
+      free( dst_pixels_ptr );
+      fclose( fp );
+      return 0;
+    }
+    fclose( fp );
+  }
+  free( dst_pixels_ptr );
+
+  return 1;
+}
--- a/thirdparty/basis_universal/encoder/apg_bmp.h
+++ b/thirdparty/basis_universal/encoder/apg_bmp.h
@ -0,0 +1,123 @@
+/*
+BMP File Reader/Writer Implementation
+Anton Gerdelan
+Version: 3.1 18 March 2020.
+Licence: see bottom of file.
+C89 ( Implementation is C99 )
+
+Contributors:
+- Anton Gerdelan - Initial code.
+- Saija Sorsa    - Fuzz testing.
+
+Instructions:
+- Just drop this header, and the matching .c file into your project.
+- To get debug printouts during parsing define APG_BMP_DEBUG_OUTPUT.
+
+Advantages:
+- The implementation is fast, simple, and supports more formats than most BMP reader libraries.
+- The reader function is fuzzed with AFL https://lcamtuf.coredump.cx/afl/.
+- The reader is robust to large files and malformed files, and will return any valid partial data in an image.
+- Reader supports 32bpp (with alpha channel), 24bpp, 8bpp, 4bpp, and 1bpp monochrome BMP images.
+- Reader handles indexed BMP images using a colour palette.
+- Writer supports 32bpp RGBA and 24bpp uncompressed RGB images.
+
+Current Limitations:
+- 16-bit images not supported (don't have any samples to test on).
+- No support for interleaved channel bit layouts eg RGB101010 RGB555 RGB565.
+- No support for compressed BMP images, although in practice these are not used.
+- Output images with alpha channel are written in BITMAPINFOHEADER format.
+  For better alpha support in other apps the 124-byte v5 header could be used instead,
+  at the cost of some backward compatibility and bloat.
+
+To Do:
+- FUZZING
+  - create a unique fuzz test set for (8,4,1 BPP).
+- (maybe) FEATURE Flipping the image based on negative width and height in header, and/or function arguments. 
+- (maybe) PERF ifdef intrinsics/asm for bitscan. Platform-specific code so won't include unless necessary.
+- (maybe) FEATURE Add parameter for padding output memory to eg 4-byte alignment or n channels.
+- (maybe) FEATURE Improved apps support in alpha channel writing (using v5 header).
+*/
+
+#ifndef APG_BMP_H_
+#define APG_BMP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* CPP */
+
+/* Reads a bitmap from a file, allocates memory for the raw image data, and returns it.
+PARAMS
+  * w,h,     - Retrieves the width and height of the BMP in pixels.
+  * n_chans  - Retrieves the number of channels in the BMP.
+RETURNS
+  * Tightly-packed pixel memory in RGBA order. The caller must call free() on the memory.
+  * NULL on any error. Any allocated memory is freed before returning NULL. */
+unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int* n_chans );
+
+/* Calls free() on memory created by apg_bmp_read */
+void apg_bmp_free( unsigned char* pixels_ptr );
+
+/* Writes a bitmap to a file.
+PARAMS
+  * filename   - e.g. "my_bitmap.bmp". Must not be NULL.
+  * pixels_ptr - Pointer to tightly-packed pixel memory in RGBA order. Must not be NULL. There must be abs(w)*abs(h)*n_chans bytes in the memory pointed to.
+  * w,h,       - Width and height of the image in pixels.
+  * n_chans    - The number of channels in the BMP. 3 or 4 supported for writing, which means RGB or RGBA memory, respectively.
+RETURNS
+  * Zero on any error, non zero on success. */
+unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int w, int h, unsigned int n_chans );
+
+#ifdef __cplusplus
+}
+#endif /* CPP */
+
+#endif /* APG_BMP_H_ */
+
+/*
+-------------------------------------------------------------------------------------
+This software is available under two licences - you may use it under either licence.
+-------------------------------------------------------------------------------------
+FIRST LICENCE OPTION
+
+>                                  Apache License
+>                            Version 2.0, January 2004
+>                         http://www.apache.org/licenses/
+>    Copyright 2019 Anton Gerdelan.
+>    Licensed under the Apache License, Version 2.0 (the "License");
+>    you may not use this file except in compliance with the License.
+>    You may obtain a copy of the License at
+>        http://www.apache.org/licenses/LICENSE-2.0
+>    Unless required by applicable law or agreed to in writing, software
+>    distributed under the License is distributed on an "AS IS" BASIS,
+>    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+>    See the License for the specific language governing permissions and
+>    limitations under the License.
+-------------------------------------------------------------------------------------
+SECOND LICENCE OPTION
+
+> This is free and unencumbered software released into the public domain.
+>
+> Anyone is free to copy, modify, publish, use, compile, sell, or
+> distribute this software, either in source code form or as a compiled
+> binary, for any purpose, commercial or non-commercial, and by any
+> means.
+>
+> In jurisdictions that recognize copyright laws, the author or authors
+> of this software dedicate any and all copyright interest in the
+> software to the public domain. We make this dedication for the benefit
+> of the public at large and to the detriment of our heirs and
+> successors. We intend this dedication to be an overt act of
+> relinquishment in perpetuity of all present and future rights to this
+> software under copyright law.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+> IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+> OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+> ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+> OTHER DEALINGS IN THE SOFTWARE.
+>
+> For more information, please refer to <http://unlicense.org>
+-------------------------------------------------------------------------------------
+*/
--- a/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp
@ -50,6 +50,13 @@ typedef uint64_t deUint64;

 #define DE_ASSERT assert

+#ifdef _MSC_VER
+#pragma warning (disable:4505) // unreferenced local function has been removed
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
 namespace basisu_astc
 {
 	static bool inBounds(int v, int l, int h)
@ -150,7 +157,7 @@ namespace basisu_astc

 		UVec4 asUint() const
 		{
-			return UVec4(std::max(0, m_c[0]), std::max(0, m_c[1]), std::max(0, m_c[2]), std::max(0, m_c[3]));
+			return UVec4(basisu::maximum(0, m_c[0]), basisu::maximum(0, m_c[1]), basisu::maximum(0, m_c[2]), basisu::maximum(0, m_c[3]));
 		}

 		int32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
@ -1256,7 +1263,7 @@ void interpolateWeights (TexelWeightPair* dst, const deUint32 (&unquantizedWeigh
 	const int		numWeightsPerTexel	= blockMode.isDualPlane ? 2 : 1;
 	const deUint32	scaleX				= (1024 + blockWidth/2) / (blockWidth-1);
 	const deUint32	scaleY				= (1024 + blockHeight/2) / (blockHeight-1);
-	DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= DE_LENGTH_OF_ARRAY(unquantizedWeights));
+	DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= (int)DE_LENGTH_OF_ARRAY(unquantizedWeights));
 	for (int texelY = 0; texelY < blockHeight; texelY++)
 	{
 		for (int texelX = 0; texelX < blockWidth; texelX++)
@ -1548,3 +1555,7 @@ bool decompress(uint8_t *pDst, const uint8_t * data, bool isSRGB, int blockWidth

 } // astc
 } // basisu_astc
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
--- a/thirdparty/basis_universal/encoder/basisu_astc_decomp.h
+++ b/thirdparty/basis_universal/encoder/basisu_astc_decomp.h
@ -23,7 +23,7 @@
 * \brief ASTC Utilities.
 *//*--------------------------------------------------------------------*/

-#include "transcoder/basisu.h" // to pick up the iterator debug level madness
+#include "../transcoder/basisu.h" // to pick up the iterator debug level madness
 #include <vector>
 #include <stdint.h>

--- a/thirdparty/basis_universal/encoder/basisu_backend.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_backend.cpp
@ -1,5 +1,5 @@
 // basisu_backend.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -17,6 +17,11 @@
 //
 #include "basisu_backend.h"

+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
 #define BASISU_FASTER_SELECTOR_REORDERING 0
 #define BASISU_BACKEND_VERIFY(c) verify(c, __LINE__);

@ -176,64 +181,117 @@ namespace basisu
 	void basisu_backend::reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec& all_endpoint_indices)
 	{
 		basisu_frontend& r = *m_pFront_end;
-		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+		//const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;

-		if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0))
+		if (m_params.m_used_global_codebooks)
 		{
-			// We're changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed)
-			uint_vec new_block_endpoints(get_total_blocks());
-
-			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+			m_endpoint_remap_table_old_to_new.clear();
+			m_endpoint_remap_table_old_to_new.resize(r.get_total_endpoint_clusters());
+			for (uint32_t i = 0; i < r.get_total_endpoint_clusters(); i++)
+				m_endpoint_remap_table_old_to_new[i] = i;
+		}
+		else
+		{
+			//if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0))
+			if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 1))
 			{
-				const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
-				const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
-				const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+				// We've changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed)
+				uint_vec new_block_endpoints(get_total_blocks());

-				for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
-					for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
-						new_block_endpoints[first_block_index + block_x + block_y * num_blocks_x] = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index;
-			}
-
-			int_vec old_to_new_endpoint_indices;
-			r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true);
-
-			create_endpoint_palette();
-
-			for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
-			{
-				const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
-
-				const uint32_t width = m_slices[slice_index].m_width;
-				const uint32_t height = m_slices[slice_index].m_height;
-				const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
-				const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
-
-				for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+				for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 				{
-					for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+					const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+					const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+					const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+					for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+						for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+							new_block_endpoints[first_block_index + block_x + block_y * num_blocks_x] = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index;
+				}
+
+				int_vec old_to_new_endpoint_indices;
+				r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true);
+
+				create_endpoint_palette();
+
+				for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+				{
+					//const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+
+					//const uint32_t width = m_slices[slice_index].m_width;
+					//const uint32_t height = m_slices[slice_index].m_height;
+					const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+					const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+					for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
 					{
-						const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+						for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+						{
+							//const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;

-						encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+							encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);

-						m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index];
-					} // block_x
-				} // block_y
-			} // slice_index
+							m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index];
+						} // block_x
+					} // block_y
+				} // slice_index

-			for (uint32_t i = 0; i < all_endpoint_indices.size(); i++)
-				all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]];
+				for (uint32_t i = 0; i < all_endpoint_indices.size(); i++)
+					all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]];

-		} //if (total_block_endpoints_remapped)
+			} //if (total_block_endpoints_remapped)

-		// Sort endpoint codebook
-		palette_index_reorderer reorderer;
-		reorderer.init((uint32_t)all_endpoint_indices.size(), &all_endpoint_indices[0], r.get_total_endpoint_clusters(), nullptr, nullptr, 0);
-		m_endpoint_remap_table_old_to_new = reorderer.get_remap_table();
+			// Sort endpoint codebook
+			palette_index_reorderer reorderer;
+			reorderer.init((uint32_t)all_endpoint_indices.size(), &all_endpoint_indices[0], r.get_total_endpoint_clusters(), nullptr, nullptr, 0);
+			m_endpoint_remap_table_old_to_new = reorderer.get_remap_table();
+		}

+		// For endpoints, old_to_new[] may not be bijective! 
+		// Some "old" entries may be unused and don't get remapped into the "new" array.
+
+		m_old_endpoint_was_used.clear();
+		m_old_endpoint_was_used.resize(r.get_total_endpoint_clusters());
+		uint32_t first_old_entry_index = UINT32_MAX;
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					const uint32_t old_endpoint_index = m.m_endpoint_index;
+
+					m_old_endpoint_was_used[old_endpoint_index] = true;
+					first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index);
+				} // block_x
+			} // block_y
+		} // slice_index
+
+		debug_printf("basisu_backend::reoptimize_and_sort_endpoints_codebook: First old entry index: %u\n", first_old_entry_index);
+						
+		m_new_endpoint_was_used.clear();
+		m_new_endpoint_was_used.resize(r.get_total_endpoint_clusters());
+
+		m_endpoint_remap_table_new_to_old.clear();
 		m_endpoint_remap_table_new_to_old.resize(r.get_total_endpoint_clusters());
-		for (uint32_t i = 0; i < m_endpoint_remap_table_old_to_new.size(); i++)
-			m_endpoint_remap_table_new_to_old[m_endpoint_remap_table_old_to_new[i]] = i;
+		
+		// Set unused entries in the new array to point to the first used entry in the old array.
+		m_endpoint_remap_table_new_to_old.set_all(first_old_entry_index);
+
+		for (uint32_t old_index = 0; old_index < m_endpoint_remap_table_old_to_new.size(); old_index++)
+		{
+			if (m_old_endpoint_was_used[old_index])
+			{
+				const uint32_t new_index = m_endpoint_remap_table_old_to_new[old_index];
+				
+				m_new_endpoint_was_used[new_index] = true;
+
+				m_endpoint_remap_table_new_to_old[new_index] = old_index;
+			}
+		}
 	}

 	void basisu_backend::sort_selector_codebook()
@ -242,7 +300,7 @@ namespace basisu

 		m_selector_remap_table_new_to_old.resize(r.get_total_selector_clusters());

-		if (m_params.m_compression_level == 0)
+		if ((m_params.m_compression_level == 0) || (m_params.m_used_global_codebooks))
 		{
 			for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++)
 				m_selector_remap_table_new_to_old[i] = i;
@ -336,10 +394,10 @@ namespace basisu
 		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 		{
 			const bool is_iframe = m_slices[slice_index].m_iframe;
-			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+			//const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;

-			const uint32_t width = m_slices[slice_index].m_width;
-			const uint32_t height = m_slices[slice_index].m_height;
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
 			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
 			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
 			const int prev_frame_slice_index = find_video_frame(slice_index, -1);
@ -393,6 +451,7 @@ namespace basisu

 		BASISU_BACKEND_VERIFY(total_invalid_crs == 0);
 	}
+
 	void basisu_backend::create_encoder_blocks()
 	{
 		basisu_frontend& r = *m_pFront_end;
@ -411,8 +470,8 @@ namespace basisu
 			const bool is_iframe = m_slices[slice_index].m_iframe;
 			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;

-			const uint32_t width = m_slices[slice_index].m_width;
-			const uint32_t height = m_slices[slice_index].m_height;
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
 			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
 			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;

@ -590,7 +649,7 @@ namespace basisu
 	{
 		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 		{
-			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+			//const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
 			const uint32_t width = m_slices[slice_index].m_width;
 			const uint32_t height = m_slices[slice_index].m_height;
 			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
@ -603,7 +662,7 @@ namespace basisu
 			{
 				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
 				{
-					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+					//const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;

 					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);

@ -662,7 +721,7 @@ namespace basisu
 		histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE + 1);
 		histogram selector_history_buf_rle_histogram(1 << basist::SELECTOR_HISTORY_BUF_RLE_COUNT_BITS);

-		std::vector<uint_vec> selector_syms(m_slices.size());
+		basisu::vector<uint_vec> selector_syms(m_slices.size());

 		const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = r.get_total_selector_clusters();
 		const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + basist::MAX_SELECTOR_HISTORY_BUF_SIZE;
@ -672,7 +731,7 @@ namespace basisu
 		histogram delta_endpoint_histogram(r.get_total_endpoint_clusters());

 		histogram endpoint_pred_histogram(basist::ENDPOINT_PRED_TOTAL_SYMBOLS);
-		std::vector<uint_vec> endpoint_pred_syms(m_slices.size());
+		basisu::vector<uint_vec> endpoint_pred_syms(m_slices.size());

 		uint32_t total_endpoint_indices_remapped = 0;

@ -680,11 +739,11 @@ namespace basisu

 		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 		{
-			const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1;
-			const int next_frame_slice_index = is_video ? find_video_frame(slice_index, 1) : -1;
+			//const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1;
+			//const int next_frame_slice_index = is_video ? find_video_frame(slice_index, 1) : -1;
 			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
-			const uint32_t width = m_slices[slice_index].m_width;
-			const uint32_t height = m_slices[slice_index].m_height;
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
 			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
 			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;

@ -702,7 +761,7 @@ namespace basisu
 			{
 				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
 				{
-					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+					//const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;

 					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);

@ -723,6 +782,7 @@ namespace basisu

 				}  // block_x
 			} // block_y
+
 			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
 			{
 				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
@ -821,6 +881,10 @@ namespace basisu
 									if (trial_idx == new_endpoint_index)
 										continue;

+									// Skip it if this new endpoint palette entry is actually never used.
+									if (!m_new_endpoint_was_used[trial_idx])
+										continue;
+
 									const etc1_endpoint_palette_entry& p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]];
 									trial_etc_blk.set_block_color5_etc1s(p.m_color5);
 									trial_etc_blk.set_inten_tables_etc1s(p.m_inten5);
@ -884,23 +948,32 @@ namespace basisu
 						{
 							const pixel_block& src_pixels = r.get_source_pixel_block(block_index);

-							etc_block etc_blk(r.get_output_block(block_index));
+							const etc_block& etc_blk = r.get_output_block(block_index);

 							color_rgba etc_blk_unpacked[16];
 							unpack_etc1(etc_blk, etc_blk_unpacked);

 							uint64_t cur_err = 0;
-							for (uint32_t p = 0; p < 16; p++)
-								cur_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
-
+							if (r.get_params().m_perceptual)
+							{
+								for (uint32_t p = 0; p < 16; p++)
+									cur_err += color_distance(true, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+							}
+							else
+							{
+								for (uint32_t p = 0; p < 16; p++)
+									cur_err += color_distance(false, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+							}
+														
 							uint64_t best_trial_err = UINT64_MAX;
 							int best_trial_idx = 0;
 							uint32_t best_trial_history_buf_idx = 0;

-
 							const float selector_remap_thresh = maximum(1.0f, m_params.m_selector_rdo_quality_thresh); //2.5f;
 							const bool use_strict_search = (m_params.m_compression_level == 0) && (selector_remap_thresh == 1.0f);

+							const uint64_t limit_err = (uint64_t)ceilf(cur_err * selector_remap_thresh);
+							
 							for (uint32_t j = 0; j < selector_history_buf.size(); j++)
 							{
 								const int trial_idx = selector_history_buf[j];
@ -917,30 +990,42 @@ namespace basisu
 								}
 								else
 								{
-									for (uint32_t sy = 0; sy < 4; sy++)
-										for (uint32_t sx = 0; sx < 4; sx++)
-											etc_blk.set_selector(sx, sy, m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](sx, sy));
-
-									// TODO: Optimize this
-									unpack_etc1(etc_blk, etc_blk_unpacked);
-
 									uint64_t trial_err = 0;
-									const uint64_t thresh_err = minimum((uint64_t)ceilf(cur_err * selector_remap_thresh), best_trial_err);
-									for (uint32_t p = 0; p < 16; p++)
+									const uint64_t thresh_err = minimum(limit_err, best_trial_err);
+
+									color_rgba block_colors[4];
+									etc_blk.get_block_colors(block_colors, 0);
+
+									const uint8_t* pSelectors = &m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](0, 0);
+									
+									if (r.get_params().m_perceptual)
 									{
-										trial_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
-										if (trial_err > thresh_err)
-											break;
+										for (uint32_t p = 0; p < 16; p++)
+										{
+											uint32_t sel = pSelectors[p];
+											trial_err += color_distance(true, src_pixels.get_ptr()[p], block_colors[sel], false);
+											if (trial_err > thresh_err)
+												break;
+										}
+									}
+									else
+									{
+										for (uint32_t p = 0; p < 16; p++)
+										{
+											uint32_t sel = pSelectors[p];
+											trial_err += color_distance(false, src_pixels.get_ptr()[p], block_colors[sel], false);
+											if (trial_err > thresh_err)
+												break;
+										}
 									}

-									if (trial_err <= cur_err * selector_remap_thresh)
+									if ((trial_err < best_trial_err) && (trial_err <= thresh_err))
 									{
-										if (trial_err < best_trial_err)
-										{
-											best_trial_err = trial_err;
-											best_trial_idx = trial_idx;
-											best_trial_history_buf_idx = j;
-										}
+										assert(trial_err <= limit_err);
+										
+										best_trial_err = trial_err;
+										best_trial_idx = trial_idx;
+										best_trial_history_buf_idx = j;
 									}
 								}
 							}
@ -1086,7 +1171,8 @@ namespace basisu
 			total_selector_indices_remapped, total_selector_indices_remapped * 100.0f / get_total_blocks(),
 			total_used_selector_history_buf, total_used_selector_history_buf * 100.0f / get_total_blocks());

-		if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0))
+		//if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0))
+		if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 1) && (!m_params.m_used_global_codebooks))
 		{
 			int_vec unused;
 			r.reoptimize_remapped_endpoints(block_endpoint_indices, unused, false, &block_selector_indices);
@ -1168,8 +1254,8 @@ namespace basisu

 		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
 		{
-			const uint32_t width = m_slices[slice_index].m_width;
-			const uint32_t height = m_slices[slice_index].m_height;
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
 			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
 			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;

@ -1296,10 +1382,53 @@ namespace basisu
 	{
 		const basisu_frontend& r = *m_pFront_end;

+		// The endpoint indices may have been changed by the backend's RDO step, so go and figure out which ones are actually used again.
+		bool_vec old_endpoint_was_used(r.get_total_endpoint_clusters());
+		uint32_t first_old_entry_index = UINT32_MAX;
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					const uint32_t old_endpoint_index = m.m_endpoint_index;
+
+					old_endpoint_was_used[old_endpoint_index] = true;
+					first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index);
+				} // block_x
+			} // block_y
+		} // slice_index
+
+		debug_printf("basisu_backend::encode_endpoint_palette: first_old_entry_index: %u\n", first_old_entry_index);
+
 		// Maps NEW to OLD endpoints
-		uint_vec endpoint_remap_table_inv(r.get_total_endpoint_clusters());
+		uint_vec endpoint_remap_table_new_to_old(r.get_total_endpoint_clusters());
+		endpoint_remap_table_new_to_old.set_all(first_old_entry_index);
+
+		bool_vec new_endpoint_was_used(r.get_total_endpoint_clusters());
+
 		for (uint32_t old_endpoint_index = 0; old_endpoint_index < m_endpoint_remap_table_old_to_new.size(); old_endpoint_index++)
-			endpoint_remap_table_inv[m_endpoint_remap_table_old_to_new[old_endpoint_index]] = old_endpoint_index;
+		{
+			if (old_endpoint_was_used[old_endpoint_index])
+			{
+				const uint32_t new_endpoint_index = m_endpoint_remap_table_old_to_new[old_endpoint_index];
+				
+				new_endpoint_was_used[new_endpoint_index] = true;
+
+				endpoint_remap_table_new_to_old[new_endpoint_index] = old_endpoint_index;
+			}
+		}
+
+		// TODO: Some new endpoint palette entries may actually be unused and aren't worth coding. Fix that.
+
+		uint32_t total_unused_new_entries = 0;
+		for (uint32_t i = 0; i < new_endpoint_was_used.size(); i++)
+			if (!new_endpoint_was_used[i])
+				total_unused_new_entries++;
+		debug_printf("basisu_backend::encode_endpoint_palette: total_unused_new_entries: %u out of %u\n", total_unused_new_entries, new_endpoint_was_used.size());

 		bool is_grayscale = true;
 		for (uint32_t old_endpoint_index = 0; old_endpoint_index < (uint32_t)m_endpoint_palette.size(); old_endpoint_index++)
@ -1324,7 +1453,7 @@ namespace basisu

 		for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++)
 		{
-			const uint32_t old_endpoint_index = endpoint_remap_table_inv[new_endpoint_index];
+			const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index];

 			int delta_inten = m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten;
 			inten_delta_hist.inc(delta_inten & 7);
@ -1390,7 +1519,7 @@ namespace basisu

 		for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++)
 		{
-			const uint32_t old_endpoint_index = endpoint_remap_table_inv[new_endpoint_index];
+			const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index];

 			int delta_inten = (m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten) & 7;
 			coder.put_code(delta_inten, inten_delta_model);
@ -1644,9 +1773,11 @@ namespace basisu

 	uint32_t basisu_backend::encode()
 	{
-		const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+		//const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
 		m_output.m_slice_desc = m_slices;
 		m_output.m_etc1s = m_params.m_etc1s;
+		m_output.m_uses_global_codebooks = m_params.m_used_global_codebooks;
+		m_output.m_srgb = m_pFront_end->get_params().m_perceptual;

 		create_endpoint_palette();
 		create_selector_palette();
--- a/thirdparty/basis_universal/encoder/basisu_backend.h
+++ b/thirdparty/basis_universal/encoder/basisu_backend.h
@ -1,5 +1,5 @@
 // basisu_backend.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -14,10 +14,10 @@
 // limitations under the License.
 #pragma once

-#include "transcoder/basisu.h"
+#include "../transcoder/basisu.h"
 #include "basisu_enc.h"
-#include "transcoder/basisu_transcoder_internal.h"
-#include "transcoder/basisu_global_selector_palette.h"
+#include "../transcoder/basisu_transcoder_internal.h"
+#include "../transcoder/basisu_global_selector_palette.h"
 #include "basisu_frontend.h"

 namespace basisu
@ -49,7 +49,7 @@ namespace basisu
 		}
 	};

-	typedef std::vector<encoder_block> encoder_block_vec;
+	typedef basisu::vector<encoder_block> encoder_block_vec;
 	typedef vector2D<encoder_block> encoder_block_vec2D;

 	struct etc1_endpoint_palette_entry
@ -69,7 +69,7 @@ namespace basisu
 		}
 	};

-	typedef std::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec;
+	typedef basisu::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec;

 	struct basisu_backend_params
 	{
@ -84,6 +84,8 @@ namespace basisu
 		uint32_t m_global_sel_codebook_mod_bits;
 		bool m_use_hybrid_sel_codebooks;

+		bool m_used_global_codebooks;
+
 		basisu_backend_params()
 		{
 			clear();
@ -102,6 +104,7 @@ namespace basisu
 			m_global_sel_codebook_pal_bits = ETC1_GLOBAL_SELECTOR_CODEBOOK_MAX_PAL_BITS;
 			m_global_sel_codebook_mod_bits = basist::etc1_global_palette_entry_modifier::cTotalBits;
 			m_use_hybrid_sel_codebooks = false;
+			m_used_global_codebooks = false;
 		}
 	};

@ -111,10 +114,12 @@ namespace basisu
 		{
 			clear();
 		}
+
 		void clear()
 		{
 			clear_obj(*this);
 		}
+
 		uint32_t m_first_block_index;

 		uint32_t m_orig_width;
@ -135,11 +140,15 @@ namespace basisu
 		bool m_iframe;
 	};

-	typedef std::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;
+	typedef basisu::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;

 	struct basisu_backend_output
 	{
+		basist::basis_tex_format m_tex_format;
+
 		bool m_etc1s;
+		bool m_uses_global_codebooks;
+		bool m_srgb;

 		uint32_t m_num_endpoints;
 		uint32_t m_num_selectors;
@ -150,7 +159,7 @@ namespace basisu
 		basisu_backend_slice_desc_vec m_slice_desc;

 		uint8_vec m_slice_image_tables;
-		std::vector<uint8_vec> m_slice_image_data;
+		basisu::vector<uint8_vec> m_slice_image_data;
 		uint16_vec m_slice_image_crcs;

 		basisu_backend_output()
@ -160,7 +169,10 @@ namespace basisu

 		void clear()
 		{
+			m_tex_format = basist::basis_tex_format::cETC1S;
 			m_etc1s = false;
+			m_uses_global_codebooks = false;
+			m_srgb = true;

 			m_num_endpoints = 0;
 			m_num_selectors = 0;
@ -198,6 +210,7 @@ namespace basisu
 		uint32_t encode();

 		const basisu_backend_output &get_output() const { return m_output; }
+		const basisu_backend_params& get_params() const { return m_params; }

 	private:
 		basisu_frontend *m_pFront_end;
@ -216,15 +229,17 @@ namespace basisu
 			bool m_was_used;
 		};

-		typedef std::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec;
+		typedef basisu::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec;

 		etc1_global_selector_cb_entry_desc_vec m_global_selector_palette_desc;

-		std::vector<encoder_block_vec2D> m_slice_encoder_blocks;
+		basisu::vector<encoder_block_vec2D> m_slice_encoder_blocks;

 		// Maps OLD to NEW endpoint/selector indices
 		uint_vec m_endpoint_remap_table_old_to_new;
 		uint_vec m_endpoint_remap_table_new_to_old;
+		bool_vec m_old_endpoint_was_used;
+		bool_vec m_new_endpoint_was_used;

 		uint_vec m_selector_remap_table_old_to_new;

--- a/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
@ -1,5 +1,5 @@
 // basisu_basis_file.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "basisu_basis_file.h"
-#include "transcoder/basisu_transcoder.h"
+#include "../transcoder/basisu_transcoder.h"

 // The output file version. Keep in sync with BASISD_SUPPORTED_BASIS_VERSION.
 #define BASIS_FILE_VERSION (0x13)
@ -31,15 +31,26 @@ namespace basisu
 		m_header.m_total_images = 0;
 		for (uint32_t i = 0; i < encoder_output.m_slice_desc.size(); i++)
 			m_header.m_total_images = maximum<uint32_t>(m_header.m_total_images, encoder_output.m_slice_desc[i].m_source_file_index + 1);
-				
-		m_header.m_format = 0;// basist::block_format::cETC1;
+		
+		m_header.m_tex_format = (int)encoder_output.m_tex_format;
 		m_header.m_flags = 0;
 		
 		if (encoder_output.m_etc1s)
+		{
+			assert(encoder_output.m_tex_format == basist::basis_tex_format::cETC1S);
 			m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagETC1S;
+		}
+		else
+		{
+			assert(encoder_output.m_tex_format != basist::basis_tex_format::cETC1S);
+		}

 		if (y_flipped)
 			m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagYFlipped;
+		if (encoder_output.m_uses_global_codebooks)
+			m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagUsesGlobalCodebook;
+		if (encoder_output.m_srgb)
+			m_header.m_flags = m_header.m_flags | basist::cBASISHeaderFlagSRGB;
 				
 		for (uint32_t i = 0; i < encoder_output.m_slice_desc.size(); i++)
 		{
@ -57,12 +68,26 @@ namespace basisu
 		m_header.m_userdata1 = userdata1;

 		m_header.m_total_endpoints = encoder_output.m_num_endpoints;
-		m_header.m_endpoint_cb_file_ofs = m_endpoint_cb_file_ofs;
-		m_header.m_endpoint_cb_file_size = (uint32_t)encoder_output.m_endpoint_palette.size();
+		if (!encoder_output.m_uses_global_codebooks)
+		{
+			m_header.m_endpoint_cb_file_ofs = m_endpoint_cb_file_ofs;
+			m_header.m_endpoint_cb_file_size = (uint32_t)encoder_output.m_endpoint_palette.size();
+		}
+		else
+		{
+			assert(!m_endpoint_cb_file_ofs);
+		}

 		m_header.m_total_selectors = encoder_output.m_num_selectors;
-		m_header.m_selector_cb_file_ofs = m_selector_cb_file_ofs;
-		m_header.m_selector_cb_file_size = (uint32_t)encoder_output.m_selector_palette.size();
+		if (!encoder_output.m_uses_global_codebooks)
+		{
+			m_header.m_selector_cb_file_ofs = m_selector_cb_file_ofs;
+			m_header.m_selector_cb_file_size = (uint32_t)encoder_output.m_selector_palette.size();
+		}
+		else
+		{
+			assert(!m_selector_cb_file_ofs);
+		}

 		m_header.m_tables_file_ofs = m_tables_file_ofs;
 		m_header.m_tables_file_size = (uint32_t)encoder_output.m_slice_image_tables.size();
@ -85,7 +110,7 @@ namespace basisu
 			m_images_descs[i].m_level_index = slice_descs[i].m_mip_index;
 			
 			if (slice_descs[i].m_alpha)
-				m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsIsAlphaData;
+				m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsHasAlpha;
 			if (slice_descs[i].m_iframe)
 				m_images_descs[i].m_flags = m_images_descs[i].m_flags | basist::cSliceDescFlagsFrameIsIFrame;

@ -127,14 +152,26 @@ namespace basisu
 		assert(m_comp_data.size() == m_slice_descs_file_ofs);
 		append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&m_images_descs[0]), m_images_descs.size() * sizeof(m_images_descs[0]));

-		assert(m_comp_data.size() == m_endpoint_cb_file_ofs);
-		append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_endpoint_palette[0]), encoder_output.m_endpoint_palette.size());
+		if (!encoder_output.m_uses_global_codebooks)
+		{
+			if (encoder_output.m_endpoint_palette.size())
+			{
+				assert(m_comp_data.size() == m_endpoint_cb_file_ofs);
+				append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_endpoint_palette[0]), encoder_output.m_endpoint_palette.size());
+			}

-		assert(m_comp_data.size() == m_selector_cb_file_ofs);
-		append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_selector_palette[0]), encoder_output.m_selector_palette.size());
+			if (encoder_output.m_selector_palette.size())
+			{
+				assert(m_comp_data.size() == m_selector_cb_file_ofs);
+				append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_selector_palette[0]), encoder_output.m_selector_palette.size());
+			}
+		}

-		assert(m_comp_data.size() == m_tables_file_ofs);
-		append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_slice_image_tables[0]), encoder_output.m_slice_image_tables.size());
+		if (encoder_output.m_slice_image_tables.size())
+		{
+			assert(m_comp_data.size() == m_tables_file_ofs);
+			append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_slice_image_tables[0]), encoder_output.m_slice_image_tables.size());
+		}

 		assert(m_comp_data.size() == m_first_image_file_ofs);
 		for (uint32_t i = 0; i < slice_descs.size(); i++)
@ -163,8 +200,17 @@ namespace basisu
 		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;

 		// The Basis file uses 32-bit fields for lots of stuff, so make sure it's not too large.
-		uint64_t check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() + 
+		uint64_t check_size = 0;
+		if (!encoder_output.m_uses_global_codebooks)
+		{
+			check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() +
 			(uint64_t)encoder_output.m_endpoint_palette.size() + (uint64_t)encoder_output.m_selector_palette.size() + (uint64_t)encoder_output.m_slice_image_tables.size();
+		}
+		else
+		{
+			check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() +
+				(uint64_t)encoder_output.m_slice_image_tables.size();
+		}
 		if (check_size >= 0xFFFF0000ULL)
 		{
 			error_printf("basisu_file::init: File is too large!\n");
@ -173,10 +219,29 @@ namespace basisu

 		m_header_file_ofs = 0;
 		m_slice_descs_file_ofs = sizeof(basist::basis_file_header);
-		m_endpoint_cb_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size();
-		m_selector_cb_file_ofs = m_endpoint_cb_file_ofs + (uint32_t)encoder_output.m_endpoint_palette.size();
-		m_tables_file_ofs = m_selector_cb_file_ofs + (uint32_t)encoder_output.m_selector_palette.size();
-		m_first_image_file_ofs = m_tables_file_ofs + (uint32_t)encoder_output.m_slice_image_tables.size();
+		if (encoder_output.m_tex_format == basist::basis_tex_format::cETC1S)
+		{
+			if (encoder_output.m_uses_global_codebooks)
+			{
+				m_endpoint_cb_file_ofs = 0;
+				m_selector_cb_file_ofs = 0;
+				m_tables_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size();
+			}
+			else
+			{
+				m_endpoint_cb_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size();
+				m_selector_cb_file_ofs = m_endpoint_cb_file_ofs + (uint32_t)encoder_output.m_endpoint_palette.size();
+				m_tables_file_ofs = m_selector_cb_file_ofs + (uint32_t)encoder_output.m_selector_palette.size();
+			}
+			m_first_image_file_ofs = m_tables_file_ofs + (uint32_t)encoder_output.m_slice_image_tables.size();
+		}
+		else
+		{
+			m_endpoint_cb_file_ofs = 0;
+			m_selector_cb_file_ofs = 0;
+			m_tables_file_ofs = 0;
+			m_first_image_file_ofs = m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size();
+		}
 				
 		uint64_t total_file_size = m_first_image_file_ofs;
 		for (uint32_t i = 0; i < encoder_output.m_slice_image_data.size(); i++)
--- a/thirdparty/basis_universal/encoder/basisu_basis_file.h
+++ b/thirdparty/basis_universal/encoder/basisu_basis_file.h
@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include "transcoder/basisu_file_headers.h"
+#include "../transcoder/basisu_file_headers.h"
 #include "basisu_backend.h"

 namespace basisu
@ -49,7 +49,7 @@ namespace basisu

 	private:
 		basist::basis_file_header m_header;
-		std::vector<basist::basis_slice_desc> m_images_descs;
+		basisu::vector<basist::basis_slice_desc> m_images_descs;

 		uint8_vec m_comp_data;

--- a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
--- a/thirdparty/basis_universal/encoder/basisu_bc7enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.h
@ -0,0 +1,131 @@
+// File: basisu_bc7enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_enc.h"
+#include "../transcoder/basisu_transcoder_uastc.h"
+
+namespace basisu
+{
+
+#define BC7ENC_MAX_PARTITIONS1 (64)
+#define BC7ENC_MAX_UBER_LEVEL (4)
+
+	typedef uint8_t bc7enc_bool;
+
+#define BC7ENC_TRUE (1)
+#define BC7ENC_FALSE (0)
+		
+	typedef struct { float m_c[4]; } bc7enc_vec4F;
+
+	extern const float g_bc7_weights1x[2 * 4];
+	extern const float g_bc7_weights2x[4 * 4];
+	extern const float g_bc7_weights3x[8 * 4];
+	extern const float g_bc7_weights4x[16 * 4];
+	extern const float g_astc_weights4x[16 * 4];
+	extern const float g_astc_weights5x[32 * 4];
+	extern const float g_astc_weights_3levelsx[3 * 4];
+			
+	extern basist::astc_quant_bin g_astc_sorted_order_unquant[basist::BC7ENC_TOTAL_ASTC_RANGES][256]; // [sorted unquantized order]
+	
+	struct color_cell_compressor_params
+	{
+		uint32_t m_num_pixels;
+		const basist::color_quad_u8* m_pPixels;
+
+		uint32_t m_num_selector_weights;
+		const uint32_t* m_pSelector_weights;
+
+		const bc7enc_vec4F* m_pSelector_weightsx;
+		uint32_t m_comp_bits;
+
+		const uint8_t *m_pForce_selectors;
+
+		// Non-zero m_astc_endpoint_range enables ASTC mode. m_comp_bits and m_has_pbits are always false. We only support 2, 3, or 4 bit weight encodings.
+		uint32_t m_astc_endpoint_range;
+
+		uint32_t m_weights[4];
+		bc7enc_bool m_has_alpha;
+		bc7enc_bool m_has_pbits;
+		bc7enc_bool m_endpoints_share_pbit;
+		bc7enc_bool m_perceptual;
+	};
+
+	struct color_cell_compressor_results
+	{
+		uint64_t m_best_overall_err;
+		basist::color_quad_u8 m_low_endpoint;
+		basist::color_quad_u8 m_high_endpoint;
+		uint32_t m_pbits[2];
+		uint8_t* m_pSelectors;
+		uint8_t* m_pSelectors_temp;
+
+		// Encoded ASTC indices, if ASTC mode is enabled
+		basist::color_quad_u8 m_astc_low_endpoint;
+		basist::color_quad_u8 m_astc_high_endpoint;
+	};
+
+	struct bc7enc_compress_block_params
+	{
+		// m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality.
+		uint32_t m_max_partitions_mode1;
+
+		// Relative RGBA or YCbCrA weights.
+		uint32_t m_weights[4];
+
+		// m_uber_level may range from 0 to BC7ENC_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality.
+		uint32_t m_uber_level;
+
+		// If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB.
+		bc7enc_bool m_perceptual;
+
+		uint32_t m_least_squares_passes;
+	};
+
+	uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, const bc7enc_compress_block_params* pComp_params);
+		
+	uint64_t color_cell_compression_est_astc(
+		uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeight_table,
+		uint32_t num_pixels, const basist::color_quad_u8* pPixels,
+		uint64_t best_err_so_far, const uint32_t weights[4]);
+		
+	inline void bc7enc_compress_block_params_init_linear_weights(bc7enc_compress_block_params* p)
+	{
+		p->m_perceptual = BC7ENC_FALSE;
+		p->m_weights[0] = 1;
+		p->m_weights[1] = 1;
+		p->m_weights[2] = 1;
+		p->m_weights[3] = 1;
+	}
+
+	inline void bc7enc_compress_block_params_init_perceptual_weights(bc7enc_compress_block_params* p)
+	{
+		p->m_perceptual = BC7ENC_TRUE;
+		p->m_weights[0] = 128;
+		p->m_weights[1] = 64;
+		p->m_weights[2] = 16;
+		p->m_weights[3] = 32;
+	}
+
+	inline void bc7enc_compress_block_params_init(bc7enc_compress_block_params* p)
+	{
+		p->m_max_partitions_mode1 = BC7ENC_MAX_PARTITIONS1;
+		p->m_least_squares_passes = 1;
+		p->m_uber_level = 0;
+		bc7enc_compress_block_params_init_perceptual_weights(p);
+	}
+
+	// bc7enc_compress_block_init() MUST be called before calling bc7enc_compress_block() (or you'll get artifacts).
+	void bc7enc_compress_block_init();
+				
+} // namespace basisu
--- a/thirdparty/basis_universal/encoder/basisu_comp.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp
--- a/thirdparty/basis_universal/encoder/basisu_comp.h
+++ b/thirdparty/basis_universal/encoder/basisu_comp.h
@ -1,5 +1,5 @@
 // basisu_comp.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -16,8 +16,23 @@
 #include "basisu_frontend.h"
 #include "basisu_backend.h"
 #include "basisu_basis_file.h"
-#include "transcoder/basisu_global_selector_palette.h"
-#include "transcoder/basisu_transcoder.h"
+#include "../transcoder/basisu_global_selector_palette.h"
+#include "../transcoder/basisu_transcoder.h"
+#include "basisu_uastc_enc.h"
+
+#define BASISU_LIB_VERSION 115
+#define BASISU_LIB_VERSION_STRING "1.15"
+
+#ifndef BASISD_SUPPORT_KTX2
+	#error BASISD_SUPPORT_KTX2 is undefined
+#endif
+#ifndef BASISD_SUPPORT_KTX2_ZSTD
+	#error BASISD_SUPPORT_KTX2_ZSTD is undefined
+#endif
+
+#if !BASISD_SUPPORT_KTX2
+	#error BASISD_SUPPORT_KTX2 must be enabled when building the encoder. To reduce code size if KTX2 support is not needed, set BASISD_SUPPORT_KTX2_ZSTD to 0
+#endif

 namespace basisu
 {
@ -40,6 +55,10 @@ namespace basisu

 	const uint32_t BASISU_MAX_SLICES = 0xFFFFFF;

+	const int BASISU_RDO_UASTC_DICT_SIZE_DEFAULT = 4096; // 32768;
+	const int BASISU_RDO_UASTC_DICT_SIZE_MIN = 64;
+	const int BASISU_RDO_UASTC_DICT_SIZE_MAX = 65536;
+
 	struct image_stats
 	{
 		image_stats()
@ -52,43 +71,52 @@ namespace basisu
 			m_filename.clear();
 			m_width = 0;
 			m_height = 0;
+						
+			m_basis_rgb_avg_psnr = 0.0f;
+			m_basis_rgba_avg_psnr = 0.0f;
+			m_basis_a_avg_psnr = 0.0f;
+			m_basis_luma_709_psnr = 0.0f;
+			m_basis_luma_601_psnr = 0.0f;
+			m_basis_luma_709_ssim = 0.0f;

-			m_basis_etc1s_rgb_avg_psnr = 0.0f;
-			m_basis_etc1s_luma_709_psnr = 0.0f;
-			m_basis_etc1s_luma_601_psnr = 0.0f;
-			m_basis_etc1s_luma_709_ssim = 0.0f;
-
-			m_basis_bc1_rgb_avg_psnr = 0.0f;
-			m_basis_bc1_luma_709_psnr = 0.0f;
-			m_basis_bc1_luma_601_psnr = 0.0f;
-			m_basis_bc1_luma_709_ssim = 0.0f;
-
-			m_best_rgb_avg_psnr = 0.0f;
-			m_best_luma_709_psnr = 0.0f;
-			m_best_luma_601_psnr = 0.0f;
-			m_best_luma_709_ssim = 0.0f;
+			m_bc7_rgb_avg_psnr = 0.0f;
+			m_bc7_rgba_avg_psnr = 0.0f;
+			m_bc7_a_avg_psnr = 0.0f;
+			m_bc7_luma_709_psnr = 0.0f;
+			m_bc7_luma_601_psnr = 0.0f;
+			m_bc7_luma_709_ssim = 0.0f;
+						
+			m_best_etc1s_rgb_avg_psnr = 0.0f;
+			m_best_etc1s_luma_709_psnr = 0.0f;
+			m_best_etc1s_luma_601_psnr = 0.0f;
+			m_best_etc1s_luma_709_ssim = 0.0f;
 		}

 		std::string m_filename;
 		uint32_t m_width;
 		uint32_t m_height;

-		// .basis compressed
-		float m_basis_etc1s_rgb_avg_psnr;
-		float m_basis_etc1s_luma_709_psnr;
-		float m_basis_etc1s_luma_601_psnr;
-		float m_basis_etc1s_luma_709_ssim;
-		
-		float m_basis_bc1_rgb_avg_psnr;
-		float m_basis_bc1_luma_709_psnr;
-		float m_basis_bc1_luma_601_psnr;
-		float m_basis_bc1_luma_709_ssim;
+		// .basis compressed (ETC1S or UASTC statistics)
+		float m_basis_rgb_avg_psnr;
+		float m_basis_rgba_avg_psnr;
+		float m_basis_a_avg_psnr;
+		float m_basis_luma_709_psnr;
+		float m_basis_luma_601_psnr;
+		float m_basis_luma_709_ssim;

-		// Normal (highest quality) compressed ETC1S
-		float m_best_rgb_avg_psnr;
-		float m_best_luma_709_psnr;
-		float m_best_luma_601_psnr;
-		float m_best_luma_709_ssim;
+		// BC7 statistics
+		float m_bc7_rgb_avg_psnr;
+		float m_bc7_rgba_avg_psnr;
+		float m_bc7_a_avg_psnr;
+		float m_bc7_luma_709_psnr;
+		float m_bc7_luma_601_psnr;
+		float m_bc7_luma_709_ssim;
+		
+		// Highest achievable quality ETC1S statistics
+		float m_best_etc1s_rgb_avg_psnr;
+		float m_best_etc1s_luma_709_psnr;
+		float m_best_etc1s_luma_601_psnr;
+		float m_best_etc1s_luma_709_ssim;
 	};

 	template<bool def>
@ -175,18 +203,30 @@ namespace basisu
 	struct basis_compressor_params
 	{
 		basis_compressor_params() :
+			m_pSel_codebook(NULL),
+			m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL),
+			m_selector_rdo_thresh(BASISU_DEFAULT_SELECTOR_RDO_THRESH, 0.0f, 1e+10f),
+			m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f),
 			m_hybrid_sel_cb_quality_thresh(BASISU_DEFAULT_HYBRID_SEL_CB_QUALITY_THRESH, 0.0f, 1e+10f),
 			m_global_pal_bits(8, 0, ETC1_GLOBAL_SELECTOR_CODEBOOK_MAX_PAL_BITS),
 			m_global_mod_bits(8, 0, basist::etc1_global_palette_entry_modifier::cTotalBits),
-			m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f),
-			m_selector_rdo_thresh(BASISU_DEFAULT_SELECTOR_RDO_THRESH, 0.0f, 1e+10f),
-			m_pSel_codebook(NULL),
+			m_mip_scale(1.0f, .000125f, 4.0f),
+			m_mip_smallest_dimension(1, 1, 16384),
 			m_max_endpoint_clusters(512),
 			m_max_selector_clusters(512),
 			m_quality_level(-1),
-			m_mip_scale(1.0f, .000125f, 4.0f),
-			m_mip_smallest_dimension(1, 1, 16384),
-			m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL),
+			m_pack_uastc_flags(cPackUASTCLevelDefault),
+			m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f),
+			m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX),
+			m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f),
+			m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f),
+			m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f),
+			m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f),
+			m_resample_width(0, 1, 16384),
+			m_resample_height(0, 1, 16384),
+			m_resample_factor(0.0f, .00125f, 100.0f),
+			m_ktx2_uastc_supercompression(basist::KTX2_SS_NONE),
+			m_ktx2_zstd_supercompression_level(6, INT_MIN, INT_MAX),
 			m_pJob_pool(nullptr)
 		{
 			clear();
@ -196,15 +236,20 @@ namespace basisu
 		{
 			m_pSel_codebook = NULL;

+			m_uastc.clear();
+			m_status_output.clear();
+
 			m_source_filenames.clear();
 			m_source_alpha_filenames.clear();

 			m_source_images.clear();
+			m_source_mipmap_images.clear();

 			m_out_filename.clear();

 			m_y_flip.clear();
 			m_debug.clear();
+			m_validate.clear();
 			m_debug_images.clear();
 			m_global_sel_pal.clear();
 			m_auto_global_sel_pal.clear();
@ -219,7 +264,11 @@ namespace basisu
 			m_check_for_alpha.clear();
 			m_force_alpha.clear();
 			m_multithreading.clear();
-			m_seperate_rg_to_color_alpha.clear();
+			m_swizzle[0] = 0;
+			m_swizzle[1] = 1;
+			m_swizzle[2] = 2;
+			m_swizzle[3] = 3;
+			m_renormalize.clear();
 			m_hybrid_sel_cb_quality_thresh.clear();
 			m_global_pal_bits.clear();
 			m_global_mod_bits.clear();
@ -236,6 +285,7 @@ namespace basisu
 			m_mip_premultiplied.clear();
 			m_mip_renormalize.clear();
 			m_mip_wrapping.clear();
+			m_mip_fast.clear();
 			m_mip_smallest_dimension.clear();

 			m_max_endpoint_clusters = 0;
@ -247,30 +297,63 @@ namespace basisu
 			m_userdata1 = 0;
 			m_us_per_frame = 0;

+			m_pack_uastc_flags = cPackUASTCLevelDefault;
+			m_rdo_uastc.clear();
+			m_rdo_uastc_quality_scalar.clear();
+			m_rdo_uastc_max_smooth_block_error_scale.clear();
+			m_rdo_uastc_smooth_block_max_std_dev.clear();
+			m_rdo_uastc_max_allowed_rms_increase_ratio.clear();
+			m_rdo_uastc_skip_block_rms_thresh.clear();
+			m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear();
+			m_rdo_uastc_multithreading.clear();
+
+			m_resample_width.clear();
+			m_resample_height.clear();
+			m_resample_factor.clear();
+
+			m_pGlobal_codebooks = nullptr;
+
+			m_create_ktx2_file.clear();
+			m_ktx2_uastc_supercompression = basist::KTX2_SS_NONE;
+			m_ktx2_key_values.clear();
+			m_ktx2_zstd_supercompression_level.clear();
+			m_ktx2_srgb_transfer_func.clear();
+
 			m_pJob_pool = nullptr;
 		}
-
+				
 		// Pointer to the global selector codebook, or nullptr to not use a global selector codebook
 		const basist::etc1_global_selector_codebook *m_pSel_codebook;

+		// True to generate UASTC .basis file data, otherwise ETC1S.
+		bool_param<false> m_uastc;
+
 		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read. 
 		// Otherwise, the compressor processes the images in m_source_images.
-		std::vector<std::string> m_source_filenames;
-		std::vector<std::string> m_source_alpha_filenames;
+		basisu::vector<std::string> m_source_filenames;
+		basisu::vector<std::string> m_source_alpha_filenames;
 		
-		std::vector<image> m_source_images;
-		// TODO: Allow caller to supply their own mipmaps
+		basisu::vector<image> m_source_images;
+		
+		// Stores mipmaps starting from level 1. Level 0 is still stored in m_source_images, as usual.
+		// If m_source_mipmap_images isn't empty, automatic mipmap generation isn't done. m_source_mipmap_images.size() MUST equal m_source_images.size() or the compressor returns an error.
+		// The compressor applies the user-provided swizzling (in m_swizzle) to these images.
+		basisu::vector< basisu::vector<image> > m_source_mipmap_images;
 						
 		// Filename of the output basis file
-		std::string m_out_filename;	
+		std::string m_out_filename;

 		// The params are done this way so we can detect when the user has explicitly changed them.

 		// Flip images across Y axis
 		bool_param<false> m_y_flip;
+
+		// If true, the compressor will print basis status to stdout during compression.
+		bool_param<true> m_status_output;
 		
 		// Output debug information during compression
 		bool_param<false> m_debug;
+		bool_param<false> m_validate;
 		
 		// m_debug_images is pretty slow
 		bool_param<false> m_debug_images;
@ -284,7 +367,7 @@ namespace basisu
 		// Frontend/backend codec parameters
 		bool_param<false> m_no_hybrid_sel_cb;
 		
-		// Use perceptual sRGB colorspace metrics (for normal maps, etc.)
+		// Use perceptual sRGB colorspace metrics instead of linear
 		bool_param<true> m_perceptual;

 		// Disable selector RDO, for faster compression but larger files
@ -299,7 +382,7 @@ namespace basisu

 		// Write the output basis file to disk using m_out_filename
 		bool_param<false> m_write_output_basis_files;
-				
+								
 		// Compute and display image metrics 
 		bool_param<false> m_compute_stats;
 		
@ -311,7 +394,9 @@ namespace basisu
 		bool_param<true> m_multithreading;
 		
 		// User-provided channel swizzle applied by the compressor (identity {0, 1, 2, 3} after clear()).
-		bool_param<false> m_seperate_rg_to_color_alpha;
+		char m_swizzle[4];
+
+		bool_param<false> m_renormalize;

 		bool_param<false> m_disable_hierarchical_endpoint_codebooks;

@ -328,10 +413,11 @@ namespace basisu
 		bool_param<true> m_mip_premultiplied; // not currently supported
 		bool_param<false> m_mip_renormalize; 
 		bool_param<true> m_mip_wrapping;
+		bool_param<true> m_mip_fast;
 		param<int> m_mip_smallest_dimension;
 				
 		// Codebook size (quality) control. 
-		// If m_quality_level != -1, it controls the quality level. It ranges from [0,255].
+		// If m_quality_level != -1, it controls the quality level. It ranges from [0,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX].
 		// Otherwise m_max_endpoint_clusters/m_max_selector_clusters controls the codebook sizes directly.
 		uint32_t m_max_endpoint_clusters;
 		uint32_t m_max_selector_clusters;
@ -343,6 +429,31 @@ namespace basisu
 		uint32_t m_userdata1;
 		uint32_t m_us_per_frame;

+		// cPackUASTCLevelDefault, etc.
+		uint32_t m_pack_uastc_flags;
+		bool_param<false> m_rdo_uastc;
+		param<float> m_rdo_uastc_quality_scalar;
+		param<int> m_rdo_uastc_dict_size;
+		param<float> m_rdo_uastc_max_smooth_block_error_scale;
+		param<float> m_rdo_uastc_smooth_block_max_std_dev;
+		param<float> m_rdo_uastc_max_allowed_rms_increase_ratio;
+		param<float> m_rdo_uastc_skip_block_rms_thresh;
+		bool_param<true> m_rdo_uastc_favor_simpler_modes_in_rdo_mode;
+		bool_param<true> m_rdo_uastc_multithreading;
+
+		param<int> m_resample_width;
+		param<int> m_resample_height;
+		param<float> m_resample_factor;
+		const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks;
+
+		// KTX2 specific parameters.
+		// Internally, the compressor always creates a .basis file, then losslessly converts it to KTX2.
+		bool_param<false> m_create_ktx2_file;
+		basist::ktx2_supercompression m_ktx2_uastc_supercompression;
+		basist::ktx2_transcoder::key_value_vec m_ktx2_key_values;
+		param<int> m_ktx2_zstd_supercompression_level;
+		bool_param<false> m_ktx2_srgb_transfer_func;
+
 		job_pool *m_pJob_pool;
 	};
 	
@ -360,35 +471,41 @@ namespace basisu
 			cECSuccess = 0,
 			cECFailedReadingSourceImages,
 			cECFailedValidating,
+			cECFailedEncodeUASTC,
 			cECFailedFrontEnd,
 			cECFailedFontendExtract,
 			cECFailedBackend,
 			cECFailedCreateBasisFile,
-			cECFailedWritingOutput
+			cECFailedWritingOutput,
+			cECFailedUASTCRDOPostProcess,
+			cECFailedCreateKTX2File
 		};

 		error_code process();

+		// The output .basis file will always be valid if process() succeeded.
 		const uint8_vec &get_output_basis_file() const { return m_output_basis_file; }
-		const etc_block_vec &get_output_blocks() const { return m_output_blocks; }
+		
+		// The output .ktx2 file will only be valid if m_create_ktx2_file was true and process() succeeded.
+		const uint8_vec& get_output_ktx2_file() const { return m_output_ktx2_file; }

-		const std::vector<image_stats> &get_stats() const { return m_stats; }
+		const basisu::vector<image_stats> &get_stats() const { return m_stats; }

 		uint32_t get_basis_file_size() const { return m_basis_file_size; }
 		double get_basis_bits_per_texel() const { return m_basis_bits_per_texel; }
-
+		
 		bool get_any_source_image_has_alpha() const { return m_any_source_image_has_alpha; }
-
+								
 	private:
 		basis_compressor_params m_params;
 		
-		std::vector<image> m_slice_images;
+		basisu::vector<image> m_slice_images;

-		std::vector<image_stats> m_stats;
+		basisu::vector<image_stats> m_stats;

 		uint32_t m_basis_file_size;
 		double m_basis_bits_per_texel;
-		
+						
 		basisu_backend_slice_desc_vec m_slice_descs;

 		uint32_t m_total_blocks;
@ -397,33 +514,41 @@ namespace basisu
 		basisu_frontend m_frontend;
 		pixel_block_vec m_source_blocks;

-		std::vector<gpu_image> m_frontend_output_textures;
+		basisu::vector<gpu_image> m_frontend_output_textures;

-		std::vector<gpu_image> m_best_etc1s_images;
-		std::vector<image> m_best_etc1s_images_unpacked;
+		basisu::vector<gpu_image> m_best_etc1s_images;
+		basisu::vector<image> m_best_etc1s_images_unpacked;

 		basisu_backend m_backend;

 		basisu_file m_basis_file;

-		std::vector<gpu_image> m_decoded_output_textures;
-		std::vector<image> m_decoded_output_textures_unpacked;
-		std::vector<gpu_image> m_decoded_output_textures_bc1;
-		std::vector<image> m_decoded_output_textures_unpacked_bc1;
+		basisu::vector<gpu_image> m_decoded_output_textures;
+		basisu::vector<image> m_decoded_output_textures_unpacked;
+		basisu::vector<gpu_image> m_decoded_output_textures_bc7;
+		basisu::vector<image> m_decoded_output_textures_unpacked_bc7;

 		uint8_vec m_output_basis_file;
-		etc_block_vec m_output_blocks;
+		uint8_vec m_output_ktx2_file;
+		
+		basisu::vector<gpu_image> m_uastc_slice_textures;
+		basisu_backend_output m_uastc_backend_output;

 		bool m_any_source_image_has_alpha;

 		bool read_source_images();
+		bool extract_source_blocks();
 		bool process_frontend();
 		bool extract_frontend_texture_data();
 		bool process_backend();
 		bool create_basis_file_and_transcode();
 		bool write_output_files_and_compute_stats();
-		bool generate_mipmaps(const image &img, std::vector<image> &mips, bool has_alpha);
+		error_code encode_slices_to_uastc();
+		bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha);
 		bool validate_texture_type_constraints();
+		bool validate_ktx2_constraints();
+		void get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr);
+		bool create_ktx2_file();
 	};

 } // namespace basisu
--- a/thirdparty/basis_universal/encoder/basisu_enc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
--- a/thirdparty/basis_universal/encoder/basisu_enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_enc.h
@ -1,5 +1,5 @@
 // basisu_enc.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -13,8 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include "transcoder/basisu.h"
-#include "transcoder/basisu_transcoder_internal.h"
+#include "../transcoder/basisu.h"
+#include "../transcoder/basisu_transcoder_internal.h"

 #include <mutex>
 #include <atomic>
@ -28,13 +28,29 @@
 #include <libgen.h>
 #endif

+// This module is really just a huge grab bag of classes and helper functions needed by the encoder.
+
+// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.
+#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)
+
 namespace basisu
 {
 	extern uint8_t g_hamming_dist[256];
+	extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8];

-	// Encoder library initialization
+	// Encoder library initialization.
+	// This function MUST be called before encoding anything!
 	void basisu_encoder_init();

+	// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
+	extern void detect_sse41();
+
+#if BASISU_SUPPORT_SSE
+	extern bool g_cpu_supports_sse41;
+#else
+	const bool g_cpu_supports_sse41 = false;
+#endif
+
 	void error_printf(const char *pFmt, ...);

 	// Helpers
@ -43,7 +59,68 @@ namespace basisu
 	{
 		return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);
 	}
-	
+
+	inline int32_t clampi(int32_t value, int32_t low, int32_t high) // Returns value limited to the inclusive range [low, high].
+	{ 
+		if (value < low) 
+			value = low; 
+		else if (value > high) // only tested when value >= low
+			value = high; 
+		return value; 
+	}
+
+	inline uint8_t mul_8(uint32_t v, uint32_t a) // Scales v by a/255 with rounding, using shifts instead of a divide. Presumably both inputs are 0..255 - TODO confirm against callers.
+	{
+		v = v * a + 128; 
+		return (uint8_t)((v + (v >> 8)) >> 8); // result is truncated to 8 bits
+	}
+
+	inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) // Reads codesize (<= 64) bits from pBuf starting at bit_offset and advances bit_offset past them. No buffer length is passed, so the caller must ensure the bits exist.
+	{
+		assert(codesize <= 64);
+		uint64_t bits = 0;
+		uint32_t total_bits = 0; // number of bits gathered so far
+
+		while (total_bits < codesize)
+		{
+			uint32_t byte_bit_offset = bit_offset & 7; // position inside the current byte
+			uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); // never crosses a byte boundary
+
+			uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; // bits are taken from the low end of each byte upward
+			byte_bits &= ((1 << bits_to_read) - 1);
+
+			bits |= ((uint64_t)(byte_bits) << total_bits); // earlier bits land in lower positions of the result
+
+			total_bits += bits_to_read;
+			bit_offset += bits_to_read;
+		}
+
+		return bits;
+	}
+
+	inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) // 32-bit variant of read_bits(): reads codesize (<= 32) bits from pBuf at bit_offset and advances bit_offset. No buffer length is passed, so the caller must ensure the bits exist.
+	{
+		assert(codesize <= 32);
+		uint32_t bits = 0;
+		uint32_t total_bits = 0; // number of bits gathered so far
+
+		while (total_bits < codesize)
+		{
+			uint32_t byte_bit_offset = bit_offset & 7; // position inside the current byte
+			uint32_t bits_to_read = minimum<int>(codesize - total_bits, 8 - byte_bit_offset); // never crosses a byte boundary
+
+			uint32_t byte_bits = pBuf[bit_offset >> 3] >> byte_bit_offset; // bits are taken from the low end of each byte upward
+			byte_bits &= ((1 << bits_to_read) - 1);
+
+			bits |= (byte_bits << total_bits); // total_bits < codesize <= 32 here, so the shift stays in range
+
+			total_bits += bits_to_read;
+			bit_offset += bits_to_read;
+		}
+
+		return bits;
+	}
+				
 	// Hashing
 	
 	inline uint32_t bitmix32c(uint32_t v) 
@ -69,6 +146,16 @@ namespace basisu
 		return v;
 	}

+	inline uint32_t wang_hash(uint32_t seed) // 32-bit integer hash: mixes the bits of seed with xor-shift and multiply steps and returns the mixed value.
+	{
+		 seed = (seed ^ 61) ^ (seed >> 16);
+		 seed *= 9;
+		 seed = seed ^ (seed >> 4);
+		 seed *= 0x27d4eb2d; // unsigned multiply, wraps modulo 2^32
+		 seed = seed ^ (seed >> 15);
+		 return seed;
+	}
+
 	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);

 	template <typename Key>
@ -80,6 +167,72 @@ namespace basisu
 		}
 	};

+	// Single-pass accumulator for the count, mean, sample variance, minimum and maximum
+	// of a stream of doubles. The mean and the sum of squared deviations are updated
+	// incrementally on each push() (Welford-style recurrence), so no samples are stored.
+	class running_stat
+	{
+	public:
+		// Starts empty. m_min/m_max are initialized too, so get_min()/get_max() never
+		// read an indeterminate value when called before the first push().
+		running_stat() :
+			m_n(0),
+			m_old_m(0), m_new_m(0), m_old_s(0), m_new_s(0),
+			m_min(0), m_max(0)
+		{
+		}
+
+		// Discards every sample pushed so far and returns to the freshly constructed state.
+		void clear()
+		{
+			m_n = 0;
+			m_old_m = m_new_m = 0.0;
+			m_old_s = m_new_s = 0.0;
+			// Reset the extrema as well so an emptied accumulator does not report stale values.
+			m_min = m_max = 0.0;
+		}
+
+		// Adds one sample.
+		void push(double x)
+		{
+			m_n++;
+			if (m_n == 1)
+			{
+				// First sample: it is the mean, the minimum and the maximum; no spread yet.
+				m_old_m = m_new_m = x;
+				m_old_s = m_new_s = 0.0;
+				m_min = x;
+				m_max = x;
+			}
+			else
+			{
+				// Incremental update of the mean and of the sum of squared deviations.
+				m_new_m = m_old_m + (x - m_old_m) / m_n;
+				m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m);
+				m_old_m = m_new_m;
+				m_old_s = m_new_s;
+				m_min = basisu::minimum(x, m_min);
+				m_max = basisu::maximum(x, m_max);
+			}
+		}
+
+		// Number of samples pushed since construction or the last clear().
+		uint32_t get_num() const
+		{
+			return m_n;
+		}
+
+		// Mean of the samples, or 0.0 when empty.
+		double get_mean() const
+		{
+			return (m_n > 0) ? m_new_m : 0.0;
+		}
+
+		// Sample variance (divides by n - 1), or 0.0 with fewer than two samples.
+		double get_variance() const
+		{
+			return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0);
+		}
+
+		// Square root of get_variance().
+		double get_std_dev() const
+		{
+			return sqrt(get_variance());
+		}
+
+		// Smallest sample pushed, or 0.0 when empty.
+		double get_min() const
+		{
+			return m_min;
+		}
+
+		// Largest sample pushed, or 0.0 when empty.
+		double get_max() const
+		{
+			return m_max;
+		}
+
+	private:
+		uint32_t m_n;
+		double m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;
+	};
+
 	// Linear algebra

 	template <uint32_t N, typename T>
@ -118,7 +271,7 @@ namespace basisu
 		inline vec &set(const vec<OtherN, OtherT> &other)
 		{
 			uint32_t i;
-			if (static_cast<void *>(&other) == static_cast<void *>(this))
+			if ((const void *)(&other) == (const void *)(this))
 				return *this;
 			const uint32_t m = minimum(OtherN, N);
 			for (i = 0; i < m; i++)
@ -358,6 +511,7 @@ namespace basisu
 		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);

 	public:
+		// num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc.
 		job_pool(uint32_t num_threads);
 		~job_pool();
 				
@ -370,7 +524,7 @@ namespace basisu
 		
 	private:
 		std::vector<std::thread> m_threads;
-        std::vector<std::function<void()> > m_queue;
+		std::vector<std::function<void()> > m_queue;
 		
 		std::mutex m_mutex;
 		std::condition_variable m_has_work;
@ -420,7 +574,7 @@ namespace basisu
 			return *this;
 		}
 	};
-		
+				
 	class color_rgba
 	{
 	public:
@ -440,6 +594,25 @@ namespace basisu
 		inline color_rgba()
 		{
 			static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");
+			static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)");
+		}
+
+		// Not too hot about this idea.
+		inline color_rgba(const basist::color32& other) : // Converting constructor: copies r, g, b and a from a basist::color32.
+			r(other.r),
+			g(other.g),
+			b(other.b),
+			a(other.a)
+		{
+		}
+
+		color_rgba& operator= (const basist::color32& rhs) // Assigns r, g, b and a from a basist::color32 and returns *this.
+		{
+			r = rhs.r;
+			g = rhs.g;
+			b = rhs.b;
+			a = rhs.a;
+			return *this;
+		}

 		inline color_rgba(int y)
@ -563,11 +736,20 @@ namespace basisu
 		inline int get_601_luma() const { return (19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2] + 32768U) >> 16U; }
 		inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; } 
 		inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); }
+
+		inline basist::color32 get_color32() const // Returns this color as a basist::color32 built from r, g, b, a.
+		{
+			return basist::color32(r, g, b, a);
+		}
+
+		static color_rgba comp_min(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); } // Per-component minimum of a and b over all four components.
+		static color_rgba comp_max(const color_rgba& a, const color_rgba& b) { return color_rgba(basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); } // Per-component maximum of a and b over all four components.
 	};

-	typedef std::vector<color_rgba> color_rgba_vec;
+	typedef basisu::vector<color_rgba> color_rgba_vec;

 	const color_rgba g_black_color(0, 0, 0, 255);
+	const color_rgba g_black_trans_color(0, 0, 0, 0);
 	const color_rgba g_white_color(255, 255, 255, 255);

 	inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1)
@ -595,6 +777,7 @@ namespace basisu
 	{
 		if (perceptual)
 		{
+#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE
 			const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;
 			const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;

@ -617,11 +800,61 @@ namespace basisu
 			}

 			return d;
+#elif 1
+			// Fixed-point perceptual distance: a weighted luma delta plus two chroma deltas,
+			// all scaled by 128, combined as weighted squares.
+			int dr = e1.r - e2.r;
+			int dg = e1.g - e2.g;
+			int db = e1.b - e2.b;
+
+			int delta_l = dr * 27 + dg * 92 + db * 9;
+			int delta_cr = dr * 128 - delta_l;
+			int delta_cb = db * 128 - delta_l;
+
+			// Square in unsigned arithmetic. With 8-bit channels |delta_cr| can reach 51510 and
+			// |delta_cb| 60690; their squares exceed INT_MAX, so squaring as int is signed overflow
+			// (undefined behavior), while the true squares still fit in 32 unsigned bits.
+			// Unsigned wraparound makes the square of a negative delta equal the square of its magnitude.
+			const uint32_t sq_l = (uint32_t)delta_l * (uint32_t)delta_l;
+			const uint32_t sq_cr = (uint32_t)delta_cr * (uint32_t)delta_cr;
+			const uint32_t sq_cb = (uint32_t)delta_cb * (uint32_t)delta_cb;
+
+			uint32_t id = (sq_l >> 7U) +
+				(((sq_cr >> 7U) * 26U) >> 7U) +
+				(((sq_cb >> 7U) * 3U) >> 7U);
+
+			if (alpha)
+			{
+				// Multiply rather than shift: left-shifting a negative int is undefined before C++20.
+				int da = (e1.a - e2.a) * 128;
+				id += ((uint32_t)da * (uint32_t)da) >> 7U;
+			}
+
+			return id;
+#else
+			int dr = e1.r - e2.r;
+			int dg = e1.g - e2.g;
+			int db = e1.b - e2.b;
+
+			int64_t delta_l = dr * 27 + dg * 92 + db * 9;
+			int64_t delta_cr = dr * 128 - delta_l;
+			int64_t delta_cb = db * 128 - delta_l;
+
+			int64_t id = ((delta_l * delta_l) * 128) +
+				((delta_cr * delta_cr) * 26) +
+				((delta_cb * delta_cb) * 3);
+
+			if (alpha)
+			{
+				int64_t da = (e1.a - e2.a);
+				id += (da * da) * 128;
+			}
+
+			int d = (id + 8192) >> 14;
+
+			return d;
+#endif
 		}
 		else
 			return color_distance(e1, e2, alpha);
 	}

+	static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b) // Squared distance over the r and a components only; g and b are ignored. Presumably r carries luma here - TODO confirm against callers.
+	{
+		const int dl = a.r - b.r;
+		const int da = a.a - b.a;
+		return dl * dl + da * da;
+	}
+
 	// String helpers

 	inline int string_find_right(const std::string& filename, char c)
@ -929,7 +1162,7 @@ namespace basisu
 			float m_priority;
 		};

-		std::vector<entry> m_heap;
+		basisu::vector<entry> m_heap;
 		uint32_t m_size;

 		// Push down entry at index
@ -961,7 +1194,7 @@ namespace basisu
 	public:
 		typedef TrainingVectorType training_vec_type;
 		typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
-		typedef std::vector< training_vec_with_weight > array_of_weighted_training_vecs;
+		typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;

 		tree_vector_quant() :
 			m_next_codebook_index(0)
@ -981,7 +1214,7 @@ namespace basisu
 		const array_of_weighted_training_vecs &get_training_vecs() const	{ return m_training_vecs; }
 				array_of_weighted_training_vecs &get_training_vecs()			{ return m_training_vecs; }

-		void retrieve(std::vector< std::vector<uint32_t> > &codebook) const
+		void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
 		{
 			for (uint32_t i = 0; i < m_nodes.size(); i++)
 			{
@ -994,7 +1227,7 @@ namespace basisu
 			}
 		}

-		void retrieve(std::vector<TrainingVectorType> &codebook) const
+		void retrieve(basisu::vector<TrainingVectorType> &codebook) const
 		{
 			for (uint32_t i = 0; i < m_nodes.size(); i++)
 			{
@ -1007,7 +1240,7 @@ namespace basisu
 			}
 		}

-		void retrieve(uint32_t max_clusters, std::vector<uint_vec> &codebook) const
+		void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
      {
 			uint_vec node_stack;
         node_stack.reserve(512);
@ -1054,7 +1287,7 @@ namespace basisu
 			priority_queue var_heap;
 			var_heap.init(max_size, 0, m_nodes[0].m_var);

-			std::vector<uint32_t> l_children, r_children;
+			basisu::vector<uint32_t> l_children, r_children;

 			// Now split the worst nodes
 			l_children.reserve(m_training_vecs.size() + 1);
@ -1092,7 +1325,7 @@ namespace basisu
 			inline tsvq_node() : m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }

 			// vecs is erased
-			inline void set(const TrainingVectorType &org, uint64_t weight, float var, std::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }
+			inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }

 			inline bool is_leaf() const { return m_left_index < 0; }

@ -1100,11 +1333,11 @@ namespace basisu
 			uint64_t m_weight;
 			TrainingVectorType m_origin;
 			int32_t m_left_index, m_right_index;
-			std::vector<uint32_t> m_training_vecs;
+			basisu::vector<uint32_t> m_training_vecs;
 			int m_codebook_index;
 		};

-		typedef std::vector<tsvq_node> tsvq_node_vec;
+		typedef basisu::vector<tsvq_node> tsvq_node_vec;
 		tsvq_node_vec m_nodes;

 		array_of_weighted_training_vecs m_training_vecs;
@ -1139,7 +1372,7 @@ namespace basisu
 			return root;
 		}

-		bool split_node(uint32_t node_index, priority_queue &var_heap, std::vector<uint32_t> &l_children, std::vector<uint32_t> &r_children)
+		bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)
 		{
 			TrainingVectorType l_child_org, r_child_org;
 			uint64_t l_weight = 0, r_weight = 0;
@ -1239,7 +1472,7 @@ namespace basisu

 		bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
 		{
-			const uint32_t N = TrainingVectorType::num_elements;
+			//const uint32_t N = TrainingVectorType::num_elements;

 			if (2 == node.m_training_vecs.size())
 			{
@ -1304,7 +1537,7 @@ namespace basisu
 				if (largest_axis_index < 0)
 					return false;

-				std::vector<float> keys(node.m_training_vecs.size());
+				basisu::vector<float> keys(node.m_training_vecs.size());
 				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
 					keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];

@ -1352,8 +1585,8 @@ namespace basisu
 		}

 		bool refine_split(const tsvq_node &node,
-			TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, std::vector<uint32_t> &l_children,
-			TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, std::vector<uint32_t> &r_children) const
+			TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,
+			TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const
 		{
 			l_children.reserve(node.m_training_vecs.size());
 			r_children.reserve(node.m_training_vecs.size());
@ -1466,8 +1699,8 @@ namespace basisu
 	template<typename Quantizer>
 	bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,
 		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
-		std::vector<uint_vec>& codebook,
-		std::vector<uint_vec>& parent_codebook,
+		basisu::vector<uint_vec>& codebook,
+		basisu::vector<uint_vec>& parent_codebook,
 		uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)
 	{
 		codebook.resize(0);
@ -1493,7 +1726,7 @@ namespace basisu
 		if (!q.generate(max_threads))
 			return false;

-		std::vector<uint_vec> initial_codebook;
+		basisu::vector<uint_vec> initial_codebook;

 		q.retrieve(initial_codebook);

@ -1512,12 +1745,14 @@ namespace basisu
 		bool success_flags[cMaxThreads];
 		clear_obj(success_flags);

-		std::vector<uint_vec> local_clusters[cMaxThreads];
-		std::vector<uint_vec> local_parent_clusters[cMaxThreads];
+		basisu::vector<uint_vec> local_clusters[cMaxThreads];
+		basisu::vector<uint_vec> local_parent_clusters[cMaxThreads];

 		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
 		{
+#ifndef __EMSCRIPTEN__
 			pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {
+#endif

 				Quantizer& lq = quantizers[thread_iter];
 				uint_vec& cluster_indices = initial_codebook[thread_iter];
@ -1558,11 +1793,15 @@ namespace basisu
 					}
 				}

+#ifndef __EMSCRIPTEN__
 			} );
+#endif

 		} // thread_iter

+#ifndef __EMSCRIPTEN__
 		pJob_pool->wait_for_all();
+#endif

 		uint32_t total_clusters = 0, total_parent_clusters = 0;

@ -1598,8 +1837,8 @@ namespace basisu
 	template<typename Quantizer>
 	bool generate_hierarchical_codebook_threaded(Quantizer& q,
 		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
-		std::vector<uint_vec>& codebook,
-		std::vector<uint_vec>& parent_codebook,
+		basisu::vector<uint_vec>& codebook,
+		basisu::vector<uint_vec>& parent_codebook,
 		uint32_t max_threads, job_pool *pJob_pool)
 	{
 		typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;
@ -1629,7 +1868,7 @@ namespace basisu

 		Quantizer group_quant;
 		typedef typename group_hash::const_iterator group_hash_const_iter;
-		std::vector<group_hash_const_iter> unique_vec_iters;
+		basisu::vector<group_hash_const_iter> unique_vec_iters;
 		unique_vec_iters.reserve(unique_vecs.size());

 		for (auto iter = unique_vecs.begin(); iter != unique_vecs.end(); ++iter)
@ -1644,7 +1883,7 @@ namespace basisu

 		debug_printf("Limit clusterizers: %u\n", limit_clusterizers);

-		std::vector<uint_vec> group_codebook, group_parent_codebook;
+		basisu::vector<uint_vec> group_codebook, group_parent_codebook;
 		bool status = generate_hierarchical_codebook_threaded_internal(group_quant,
 			max_codebook_size, max_parent_codebook_size,
 			group_codebook,
@ -1693,7 +1932,7 @@ namespace basisu

 	class histogram
 	{
-		std::vector<uint32_t> m_hist;
+		basisu::vector<uint32_t> m_hist;

 	public:
 		histogram(uint32_t size = 0) { init(size); }
@ -1754,7 +1993,8 @@ namespace basisu
 		
 	struct sym_freq
 	{
-		uint16_t m_key, m_sym_index;
+		uint32_t m_key;
+		uint16_t m_sym_index;
 	};

 	sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1);
@ -1835,7 +2075,7 @@ namespace basisu
 		{
 			if (m_bit_buffer_size)
 			{
-				m_total_bits += 8;
+				m_total_bits += 8 - (m_bit_buffer_size & 7);
 				append_byte(static_cast<uint8_t>(m_bit_buffer));

 				m_bit_buffer = 0;
@ -2107,6 +2347,12 @@ namespace basisu
 			resize(w, h, p);
 		}

+		image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) : // Builds an image from a raw 8-bit pixel buffer by delegating to init().
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			init(pImage, width, height, comps);
+		}
+
 		image(const image &other) :
 			m_width(0), m_height(0), m_pitch(0)
 		{
@ -2155,6 +2401,47 @@ namespace basisu
 			return *this;
 		}

+		void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) // Resizes to width x height and fills every pixel from pImage, which holds comps (1-4) bytes per pixel.
+		{
+			assert(comps >= 1 && comps <= 4);
+			
+			resize(width, height);
+
+			for (uint32_t y = 0; y < height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					const uint8_t *pSrc = &pImage[(x + y * width) * comps]; // source rows are indexed as tightly packed (row stride = width * comps)
+					color_rgba &dst = (*this)(x, y);
+
+					if (comps == 1) // single value replicated to r, g, b; opaque alpha
+					{
+						dst.r = pSrc[0];
+						dst.g = pSrc[0];
+						dst.b = pSrc[0];
+						dst.a = 255;
+					}
+					else if (comps == 2) // first value replicated to r, g, b; second value is alpha
+					{
+						dst.r = pSrc[0];
+						dst.g = pSrc[0];
+						dst.b = pSrc[0];
+						dst.a = pSrc[1];
+					}
+					else // 3 or 4 components: r, g, b taken in order
+					{
+						dst.r = pSrc[0];
+						dst.g = pSrc[1];
+						dst.b = pSrc[2];
+						if (comps == 4)
+							dst.a = pSrc[3];
+						else
+							dst.a = 255; // no alpha in the source, so opaque
+					}
+				}
+			}
+		}
+
 		image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c)
 		{
 			for (uint32_t iy = 0; iy < h; iy++)
@ -2163,6 +2450,14 @@ namespace basisu
 			return *this;
 		}

+		image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c) // Calls set_clipped_alpha() with c for every pixel of the w x h box whose top-left corner is (x, y); returns *this.
+		{
+			for (uint32_t iy = 0; iy < h; iy++)
+				for (uint32_t ix = 0; ix < w; ix++)
+					set_clipped_alpha(x + ix, y + iy, c);
+			return *this;
+		}
+
 		image &crop_dup_borders(uint32_t w, uint32_t h)
 		{
 			const uint32_t orig_w = m_width, orig_h = m_height;
@ -2252,6 +2547,13 @@ namespace basisu
 			return *this;
 		}

+		inline image& set_clipped_alpha(int x, int y, const color_rgba& c) // Copies only component 3 of c into pixel (x, y); does nothing when (x, y) is outside the image. Returns *this.
+		{
+			if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height)) // the unsigned cast turns negative coordinates into large values, so they fail the test too
+				(*this)(x, y).m_comps[3] = c.m_comps[3];
+			return *this;
+		}
+
 		// Very straightforward blit with full clipping. Not fast, but it works.
 		image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
 		{
@ -2376,6 +2678,8 @@ namespace basisu
 			}
 			return *this;
 		}
+
+		void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);
 				
 	private:
 		uint32_t m_width, m_height, m_pitch;  // all in pixels
@ -2384,7 +2688,7 @@ namespace basisu

 	// Float images

-	typedef std::vector<vec4F> vec4F_vec;
+	typedef basisu::vector<vec4F> vec4F_vec;

 	class imagef
 	{
@ -2635,10 +2939,27 @@ namespace basisu
 	};

 	// Image saving/loading/resampling
-
+	
+	bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
 	bool load_png(const char* pFilename, image& img);
 	inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }

+	bool load_bmp(const char* pFilename, image& img);
+	inline bool load_bmp(const std::string &filename, image &img) { return load_bmp(filename.c_str(), img); }
+		
+	bool load_tga(const char* pFilename, image& img);
+	inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }
+
+	bool load_jpg(const char *pFilename, image& img);
+	inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }
+	
+	// Currently loads .BMP, .PNG, or .TGA.
+	bool load_image(const char* pFilename, image& img);
+	inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }
+
+	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
+	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);
+		
 	enum
 	{
 		cImageSaveGrayscale = 1,
@ -2697,7 +3018,7 @@ namespace basisu
 	template<typename T>
 	class vector2D
 	{
-		typedef std::vector<T> TVec;
+		typedef basisu::vector<T> TVec;

 		uint32_t m_width, m_height;
 		TVec m_values;
@ -2800,7 +3121,7 @@ namespace basisu
 	}

 	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);
-
+		
 } // namespace basisu


--- a/thirdparty/basis_universal/encoder/basisu_etc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_etc.cpp
--- a/thirdparty/basis_universal/encoder/basisu_etc.h
+++ b/thirdparty/basis_universal/encoder/basisu_etc.h
@ -1,5 +1,5 @@
 // basis_etc.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -13,9 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include "transcoder/basisu.h"
+#include "../transcoder/basisu.h"
 #include "basisu_enc.h"
-#include <set>

 namespace basisu
 {
@ -116,7 +115,7 @@ namespace basisu
 		{
 			assert((ofs + num) <= 64U);
 			assert(num && (num < 32U));
-			return (read_be64(&m_uint64) >> ofs) & ((1UL << num) - 1UL);
+			return (uint32_t)(read_be64(&m_uint64) >> ofs) & ((1UL << num) - 1UL);
 		}

 		inline void set_general_bits(uint32_t ofs, uint32_t num, uint32_t bits)
@ -266,6 +265,27 @@ namespace basisu
 			p[-2] |= (msb << byte_bit_ofs);
 		}

+		// Selector "etc1_val" ranges from 0-3 and is a direct (raw) ETC1 selector.
+		inline void set_raw_selector(uint32_t x, uint32_t y, uint32_t etc1_val) // Stores the 2-bit value for pixel (x, y): its low bit and high bit go into two different bytes of m_bytes.
+		{
+			assert((x | y | etc1_val) < 4);
+			const uint32_t bit_index = x * 4 + y; // 0..15
+
+			uint8_t* p = &m_bytes[7 - (bit_index >> 3)]; // m_bytes[7] for bit_index 0-7, m_bytes[6] for 8-15
+
+			const uint32_t byte_bit_ofs = bit_index & 7;
+			const uint32_t mask = 1 << byte_bit_ofs;
+						
+			const uint32_t lsb = etc1_val & 1;
+			const uint32_t msb = etc1_val >> 1;
+
+			p[0] &= ~mask; // clear, then set, the low-bit plane
+			p[0] |= (lsb << byte_bit_ofs);
+
+			p[-2] &= ~mask; // the high-bit plane sits two bytes earlier (m_bytes[5] or m_bytes[4])
+			p[-2] |= (msb << byte_bit_ofs);
+		}
+
 		inline uint32_t get_raw_selector_bits() const
 		{
 			return m_bytes[4] | (m_bytes[5] << 8) | (m_bytes[6] << 16) | (m_bytes[7] << 24);
@ -622,6 +642,23 @@ namespace basisu
 			return true;
 		}

+		bool set_block_color5_clamp(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled) // Sets the diff bit, stores c0 as the base5 color and stores (c1 - c0) as the delta3 color, clamping each delta component. Always returns true.
+		{
+			set_diff_bit(true);
+			set_base5_color(pack_color5(c0_unscaled, false));
+
+			int dr = c1_unscaled.r - c0_unscaled.r;
+			int dg = c1_unscaled.g - c0_unscaled.g;
+			int db = c1_unscaled.b - c0_unscaled.b;
+			
+			dr = clamp<int>(dr, cETC1ColorDeltaMin, cETC1ColorDeltaMax); // out-of-range deltas are clamped rather than reported as a failure
+			dg = clamp<int>(dg, cETC1ColorDeltaMin, cETC1ColorDeltaMax);
+			db = clamp<int>(db, cETC1ColorDeltaMin, cETC1ColorDeltaMax);
+						
+			set_delta3_color(pack_delta3(dr, dg, db));
+
+			return true;
+		}
 		color_rgba get_selector_color(uint32_t x, uint32_t y, uint32_t s) const
 		{
 			color_rgba block_colors[4];
@ -720,7 +757,7 @@ namespace basisu
 		}
 	};
 		
-	typedef std::vector<etc_block> etc_block_vec;
+	typedef basisu::vector<etc_block> etc_block_vec;

 	// Returns false if the unpack fails (could be bogus data or ETC2)
 	bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha = false);
@ -844,10 +881,10 @@ namespace basisu
 				bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
 			}
 			const int* pInten_table = g_etc1_inten_tables[m_inten_table];
-			pBlock_colors[0].set((uint8_t)(br + pInten_table[0]), (uint8_t)(bg + pInten_table[0]), (uint8_t)(bb + pInten_table[0]), 255);
-			pBlock_colors[1].set((uint8_t)(br + pInten_table[1]), (uint8_t)(bg + pInten_table[1]), (uint8_t)(bb + pInten_table[1]), 255);
-			pBlock_colors[2].set((uint8_t)(br + pInten_table[2]), (uint8_t)(bg + pInten_table[2]), (uint8_t)(bb + pInten_table[2]), 255);
-			pBlock_colors[3].set((uint8_t)(br + pInten_table[3]), (uint8_t)(bg + pInten_table[3]), (uint8_t)(bb + pInten_table[3]), 255);
+			pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0], 255);
+			pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1], 255);
+			pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2], 255);
+			pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3], 255);
 		}

 		color_rgba m_unscaled_color;
@ -914,9 +951,6 @@ namespace basisu
 				m_refinement = true;

 				m_pForce_selectors = nullptr;
-
-				m_pEval_solution_override = nullptr;
-				m_pEval_solution_override_data = nullptr;
 			}

 			uint32_t m_num_src_pixels;
@ -932,9 +966,6 @@ namespace basisu
 			bool m_refinement;

 			const uint8_t* m_pForce_selectors;
-
-			evaluate_solution_override_func m_pEval_solution_override;
-			void *m_pEval_solution_override_data;
 		};

 		struct results
@ -970,7 +1001,7 @@ namespace basisu
 			}

 			etc1_solution_coordinates  m_coords;
-			std::vector<uint8_t>    m_selectors;
+			basisu::vector<uint8_t>    m_selectors;
 			uint64_t                     m_error;
 			bool                       m_valid;

@ -1001,33 +1032,36 @@ namespace basisu

 		vec3F m_avg_color;
 		int m_br, m_bg, m_bb;
-		std::vector<uint16_t> m_luma;
-		std::vector<uint32_t> m_sorted_luma;
-		std::vector<uint32_t> m_sorted_luma_indices;
+		int m_max_comp_spread;
+		basisu::vector<uint16_t> m_luma;
+		basisu::vector<uint32_t> m_sorted_luma;
+		basisu::vector<uint32_t> m_sorted_luma_indices;
 		const uint32_t* m_pSorted_luma_indices;
 		uint32_t* m_pSorted_luma;

-		std::vector<uint8_t> m_selectors;
-		std::vector<uint8_t> m_best_selectors;
+		basisu::vector<uint8_t> m_selectors;
+		basisu::vector<uint8_t> m_best_selectors;

 		potential_solution m_best_solution;
 		potential_solution m_trial_solution;
-		std::vector<uint8_t> m_temp_selectors;
-
-		std::set<uint32_t> m_solutions_tried;
+		basisu::vector<uint8_t> m_temp_selectors;

+		enum { cSolutionsTriedHashBits = 10, cTotalSolutionsTriedHashSize = 1 << cSolutionsTriedHashBits, cSolutionsTriedHashMask = cTotalSolutionsTriedHashSize - 1 };
+		uint8_t m_solutions_tried[cTotalSolutionsTriedHashSize / 8];
+		
 		void get_nearby_inten_tables(uint32_t idx, int &first_inten_table, int &last_inten_table)
 		{
 			first_inten_table = maximum<int>(idx - 1, 0);
 			last_inten_table = minimum<int>(cETC1IntenModifierValues, idx + 1);
 		}
-
+		
+		bool check_for_redundant_solution(const etc1_solution_coordinates& coords);
 		bool evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
 		bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);

 		inline bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
 		{
-			if (m_pParams->m_quality >= cETCQualitySlow)
+			if (m_pParams->m_quality >= cETCQualityMedium)
 				return evaluate_solution_slow(coords, trial_solution, pBest_solution);
 			else
 				return evaluate_solution_fast(coords, trial_solution, pBest_solution);
@ -1042,5 +1076,77 @@ namespace basisu
 	{
 		etc1_optimizer m_optimizer;
 	};
+	
+	void pack_etc1_solid_color_init();
+	uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor);

+	// ETC EAC
+	extern const int8_t g_etc2_eac_tables[16][8];
+	extern const int8_t g_etc2_eac_tables8[16][8];
+
+	const uint32_t ETC2_EAC_MIN_VALUE_SELECTOR = 3, ETC2_EAC_MAX_VALUE_SELECTOR = 7;
+
+	struct eac_a8_block // One EAC alpha block: an 8-bit base, a 4-bit table index, a 4-bit multiplier and 48 bits of selectors (16 pixels x 3 bits).
+	{
+		uint16_t m_base : 8;
+		uint16_t m_table : 4;
+		uint16_t m_multiplier : 4;
+
+		uint8_t m_selectors[6]; // 48 selector bits, most significant byte first (see get_selector_bits())
+
+		inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const // Extracts the 3-bit selector of pixel (x, y) from a value returned by get_selector_bits().
+		{
+			assert((x < 4) && (y < 4));
+			return static_cast<uint32_t>((selector_bits >> (45 - (y + x * 4) * 3)) & 7); // pixel (0,0) is in bits 45-47, pixel (3,3) in bits 0-2
+		}
+
+		inline uint64_t get_selector_bits() const // Packs m_selectors[0..5] into the low 48 bits of a uint64_t, with m_selectors[0] most significant.
+		{
+			uint64_t pixels = ((uint64_t)m_selectors[0] << 40) | ((uint64_t)m_selectors[1] << 32) | ((uint64_t)m_selectors[2] << 24) | ((uint64_t)m_selectors[3] << 16) | ((uint64_t)m_selectors[4] << 8) | m_selectors[5];
+			return pixels;
+		}
+
+		inline void set_selector_bits(uint64_t pixels) // Inverse of get_selector_bits(): writes the low 48 bits of pixels back into m_selectors.
+		{
+			m_selectors[0] = (uint8_t)(pixels >> 40);
+			m_selectors[1] = (uint8_t)(pixels >> 32);
+			m_selectors[2] = (uint8_t)(pixels >> 24);
+			m_selectors[3] = (uint8_t)(pixels >> 16);
+			m_selectors[4] = (uint8_t)(pixels >> 8);
+			m_selectors[5] = (uint8_t)(pixels);
+		}
+
+		void set_selector(uint32_t x, uint32_t y, uint32_t s) // Replaces the 3-bit selector of pixel (x, y) with s, leaving the other 15 selectors unchanged.
+		{
+			assert((x < 4) && (y < 4) && (s < 8));
+
+			const uint32_t ofs = 45 - (y + x * 4) * 3; // same bit position that get_selector() reads
+
+			uint64_t pixels = get_selector_bits();
+
+			pixels &= ~(7ULL << ofs);
+			pixels |= (static_cast<uint64_t>(s) << ofs);
+
+			set_selector_bits(pixels);
+		}
+	};
+
+	struct etc2_rgba_block // An EAC alpha block followed by an ETC color block.
+	{
+		eac_a8_block m_alpha;
+		etc_block m_rgb;
+	};
+
+	struct pack_eac_a8_results // Result record passed by reference to the pack_eac_a8() overload that returns uint64_t.
+	{
+		uint32_t m_base;
+		uint32_t m_table;
+		uint32_t m_multiplier;
+		uint8_vec m_selectors;
+		uint8_vec m_selectors_temp; // NOTE(review): looks like scratch storage for the packer - confirm in basisu_etc.cpp
+	};
+
+	uint64_t pack_eac_a8(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask = UINT32_MAX);
+	void pack_eac_a8(eac_a8_block* pBlock, const uint8_t* pPixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask = UINT32_MAX);
+		
 } // namespace basisu
--- a/thirdparty/basis_universal/encoder/basisu_frontend.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
--- a/thirdparty/basis_universal/encoder/basisu_frontend.h
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.h
@ -1,5 +1,5 @@
 // basisu_frontend.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -17,7 +17,8 @@
 #include "basisu_etc.h"
 #include "basisu_gpu_texture.h"
 #include "basisu_global_selector_palette_helpers.h"
-#include "transcoder/basisu_file_headers.h"
+#include "../transcoder/basisu_file_headers.h"
+#include "../transcoder/basisu_transcoder.h"

 namespace basisu
 {
@ -34,8 +35,8 @@ namespace basisu
 		uint32_t &operator[] (uint32_t i) { assert(i < 2); return m_comps[i]; }
 	};

-	const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 1;
-	const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 5;
+	const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 2;
+	const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 6;

 	class basisu_frontend
 	{
@ -72,16 +73,19 @@ namespace basisu
 				m_perceptual(true),
 				m_debug_stats(false),
 				m_debug_images(false),
+																
 				m_dump_endpoint_clusterization(true),
+				m_validate(false),
+				m_multithreaded(false),
+				m_disable_hierarchical_endpoint_codebooks(false),
 				m_pGlobal_sel_codebook(NULL),
 				m_num_global_sel_codebook_pal_bits(0),
 				m_num_global_sel_codebook_mod_bits(0),
 				m_use_hybrid_selector_codebooks(false),
 				m_hybrid_codebook_quality_thresh(0.0f),
-				m_validate(false),
 				m_tex_type(basist::cBASISTexType2D),
-				m_multithreaded(false),
-				m_disable_hierarchical_endpoint_codebooks(false),
+				m_pGlobal_codebooks(nullptr),
+				
 				m_pJob_pool(nullptr)
 			{
 			}
@ -108,6 +112,7 @@ namespace basisu
 			bool m_use_hybrid_selector_codebooks;
 			float m_hybrid_codebook_quality_thresh;
 			basist::basis_texture_type m_tex_type;
+			const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks;
 			
 			job_pool *m_pJob_pool;
 		};
@ -142,7 +147,7 @@ namespace basisu
 		bool get_endpoint_cluster_color_is_used(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_used[individual_mode]; }

 		// Selector clusters
-		uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_indices.size()); }
+		uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_block_indices.size()); }
 		uint32_t get_block_selector_cluster_index(uint32_t block_index) const { return m_block_selector_cluster_index[block_index]; }
 		const etc_block &get_selector_cluster_selector_bits(uint32_t cluster_index) const { return m_optimized_cluster_selectors[cluster_index]; }

@ -150,7 +155,7 @@ namespace basisu
 		const bool_vec &get_selector_cluster_uses_global_cb_vec() const { return m_selector_cluster_uses_global_cb; }

 		// Returns block indices using each selector cluster
-		const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_indices[selector_cluster_index]; }
+		const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_block_indices[selector_cluster_index]; }

 		void dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks);
 		
@ -188,16 +193,16 @@ namespace basisu

 		// For each endpoint cluster: An array of which subblock indices (block_index*2+subblock) are located in that cluster.
 		// Array of block indices for each endpoint cluster
-		std::vector<uint_vec> m_endpoint_clusters; 
+		basisu::vector<uint_vec> m_endpoint_clusters;

 		// Array of block indices for each parent endpoint cluster
-		std::vector<uint_vec> m_endpoint_parent_clusters;  
+		basisu::vector<uint_vec> m_endpoint_parent_clusters;
 		
 		// Each block's parent cluster index
 		uint8_vec m_block_parent_endpoint_cluster; 

 		// Array of endpoint cluster indices for each parent endpoint cluster
-		std::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster; 
+		basisu::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster;
 				
 		struct endpoint_cluster_etc_params
 		{
@ -267,35 +272,35 @@ namespace basisu
 			}
 		};

-		typedef std::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec;
+		typedef basisu::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec;
 		
 		// Each endpoint cluster's ETC1S parameters 
 		cluster_subblock_etc_params_vec m_endpoint_cluster_etc_params;

 		// The endpoint cluster index used by each ETC1 subblock.
-		std::vector<vec2U> m_block_endpoint_clusters_indices;
+		basisu::vector<vec2U> m_block_endpoint_clusters_indices;
 				
 		// The block(s) within each selector cluster
 		// Note: If you add anything here that uses selector cluster indicies, be sure to update optimize_selector_codebook()!
-		std::vector<uint_vec> m_selector_cluster_indices;
+		basisu::vector<uint_vec> m_selector_cluster_block_indices;

 		// The selector bits for each selector cluster.
-		std::vector<etc_block> m_optimized_cluster_selectors;
+		basisu::vector<etc_block> m_optimized_cluster_selectors;

 		// The block(s) within each parent selector cluster.
-		std::vector<uint_vec> m_selector_parent_cluster_indices;
+		basisu::vector<uint_vec> m_selector_parent_cluster_block_indices;
 		
 		// Each block's parent selector cluster
 		uint8_vec m_block_parent_selector_cluster;

 		// Array of selector cluster indices for each parent selector cluster
-		std::vector<uint_vec> m_selector_clusters_within_each_parent_cluster; 
+		basisu::vector<uint_vec> m_selector_clusters_within_each_parent_cluster;

 		basist::etc1_global_selector_codebook_entry_id_vec m_optimized_cluster_selector_global_cb_ids;
 		bool_vec m_selector_cluster_uses_global_cb;

 		// Each block's selector cluster index
-		std::vector<uint32_t> m_block_selector_cluster_index;
+		basisu::vector<uint32_t> m_block_selector_cluster_index;

 		struct subblock_endpoint_quant_err
 		{
@ -321,13 +326,14 @@ namespace basisu
 		};

 		// The sorted subblock endpoint quant error for each endpoint cluster
-		std::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;
+		basisu::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;

 		std::mutex m_lock;

 		//-----------------------------------------------------------------------------

 		void init_etc1_images();
+		bool init_global_codebooks();
 		void init_endpoint_training_vectors();
 		void dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors);
 		void generate_endpoint_clusters();
--- a/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp
--- a/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h
+++ b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h
@ -14,9 +14,9 @@
 // limitations under the License.
 #pragma once

-#include "transcoder/basisu.h"
+#include "../transcoder/basisu.h"
 #include "basisu_etc.h"
-#include "transcoder/basisu_global_selector_palette.h"
+#include "../transcoder/basisu_global_selector_palette.h"

 namespace basisu
 {
@ -36,7 +36,7 @@ namespace basisu

 		void clear() { clear_obj(*this); }
 	};
-	typedef std::vector<pixel_block> pixel_block_vec;
+	typedef basisu::vector<pixel_block> pixel_block_vec;

 	uint64_t etc1_global_selector_codebook_find_best_entry(const basist::etc1_global_selector_codebook &codebook,
 		uint32_t num_src_pixel_blocks, const pixel_block *pSrc_pixel_blocks, const etc_block *pBlock_endpoints,
--- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
@ -1,5 +1,5 @@
 // basisu_gpu_texture.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -16,38 +16,10 @@
 #include "basisu_enc.h"
 #include "basisu_pvrtc1_4.h"
 #include "basisu_astc_decomp.h"
+#include "basisu_bc7enc.h"

 namespace basisu
 {
-	const int8_t g_etc2_eac_tables[16][8] = 
-	{
-		{ -3, -6, -9, -15, 2, 5, 8, 14 }, { -3, -7, -10, -13, 2, 6, 9, 12 }, { -2, -5, -8, -13, 1, 4, 7, 12 }, { -2, -4, -6, -13, 1, 3, 5, 12 },
-		{ -3, -6, -8, -12, 2, 5, 7, 11 }, { -3, -7, -9, -11, 2, 6, 8, 10 }, { -4, -7, -8, -11, 3, 6, 7, 10 }, { -3, -5, -8, -11, 2, 4, 7, 10 },
-		{ -2, -6, -8, -10, 1, 5, 7, 9 }, { -2, -5, -8, -10, 1, 4, 7, 9 }, { -2, -4, -8, -10, 1, 3, 7, 9 }, { -2, -5, -7, -10, 1, 4, 6, 9 },
-		{ -3, -4, -7, -10, 2, 3, 6, 9 }, { -1, -2, -3, -10, 0, 1, 2, 9 }, { -4, -6, -8, -9, 3, 5, 7, 8 }, { -3, -5, -7, -9, 2, 4, 6, 8 }
-	};
-
-	struct eac_a8_block
-	{
-		uint16_t m_base : 8;
-		uint16_t m_table : 4;
-		uint16_t m_multiplier : 4;
-
-		uint8_t m_selectors[6];
-
-		inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const
-		{
-			assert((x < 4) && (y < 4));
-			return static_cast<uint32_t>((selector_bits >> (45 - (y + x * 4) * 3)) & 7);
-		}
-				
-		inline uint64_t get_selector_bits() const
-		{
-			uint64_t pixels = ((uint64_t)m_selectors[0] << 40) | ((uint64_t)m_selectors[1] << 32) | ((uint64_t)m_selectors[2] << 24) |	((uint64_t)m_selectors[3] << 16) | ((uint64_t)m_selectors[4] << 8) | m_selectors[5];
-			return pixels;
-		}
-	};
-		
 	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels)
 	{
 		static_assert(sizeof(eac_a8_block) == 8, "sizeof(eac_a8_block) == 8");
@ -123,19 +95,18 @@ namespace basisu
 		bc1_block::unpack_color(l, r0, g0, b0);
 		bc1_block::unpack_color(h, r1, g1, b1);

+		c[0].set_noclamp_rgba(r0, g0, b0, 255);
+		c[1].set_noclamp_rgba(r1, g1, b1, 255);
+
 		bool used_punchthrough = false;

 		if (l > h)
 		{
-			c[0].set_noclamp_rgba(r0, g0, b0, 255);
-			c[1].set_noclamp_rgba(r1, g1, b1, 255);
 			c[2].set_noclamp_rgba((r0 * 2 + r1) / 3, (g0 * 2 + g1) / 3, (b0 * 2 + b1) / 3, 255);
 			c[3].set_noclamp_rgba((r1 * 2 + r0) / 3, (g1 * 2 + g0) / 3, (b1 * 2 + b0) / 3, 255);
 		}
 		else
 		{
-			c[0].set_noclamp_rgba(r0, g0, b0, 255);
-			c[1].set_noclamp_rgba(r1, g1, b1, 255);
 			c[2].set_noclamp_rgba((r0 + r1) / 2, (g0 + g1) / 2, (b0 + b1) / 2, 255);
 			c[3].set_noclamp_rgba(0, 0, 0, 0);
 			used_punchthrough = true;
@ -165,6 +136,142 @@ namespace basisu
 		return used_punchthrough;
 	}

+	bool unpack_bc1_nv(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha) // Decodes one 8-byte BC1 block into 16 pixels with integer formulas that differ from unpack_bc1(); returns true when the 3-color + transparent layout was used. NOTE(review): presumably models NVIDIA hardware decoding (per the _nv suffix) - confirm.
+	{
+		static_assert(sizeof(bc1_block) == 8, "sizeof(bc1_block) == 8");
+
+		const bc1_block *pBlock = static_cast<const bc1_block *>(pBlock_bits);
+
+		const uint32_t l = pBlock->get_low_color();
+		const uint32_t h = pBlock->get_high_color();
+
+		color_rgba c[4];
+
+		int r0 = (l >> 11) & 31; // Raw 5:6:5 fields of both endpoint colors, not yet expanded to 8 bits.
+		int g0 = (l >> 5) & 63;
+		int b0 = l & 31;
+		int r1 = (h >> 11) & 31;
+		int g1 = (h >> 5) & 63;
+		int b1 = h & 31;
+
+		c[0].b = (uint8_t)((3 * b0 * 22) / 8); // Red/blue: 5-bit value scaled by 66/8, so 31 maps to 255.
+		c[0].g = (uint8_t)((g0 << 2) | (g0 >> 4)); // Green: 6-bit value expanded to 8 bits by replicating its top two bits.
+		c[0].r = (uint8_t)((3 * r0 * 22) / 8);
+		c[0].a = 0xFF;
+
+		c[1].r = (uint8_t)((3 * r1 * 22) / 8);
+		c[1].g = (uint8_t)((g1 << 2) | (g1 >> 4));
+		c[1].b = (uint8_t)((3 * b1 * 22) / 8);
+		c[1].a = 0xFF;
+
+		int gdiff = c[1].g - c[0].g; // Signed difference of the expanded greens; drives the green interpolation below.
+
+		bool used_punchthrough = false;
+
+		if (l > h) // Four opaque colors: c[2] and c[3] are the two intermediate colors between the endpoints.
+		{
+			c[2].r = (uint8_t)(((2 * r0 + r1) * 22) / 8); // Red/blue are blended on the raw 5-bit values and scaled to 8 bits in one step.
+			c[2].g = (uint8_t)(((256 * c[0].g + gdiff/4 + 128 + gdiff * 80) / 256)); // Green moves 80/256 of the way from c[0] toward c[1], plus a gdiff/4 term and +128 rounding.
+			c[2].b = (uint8_t)(((2 * b0 + b1) * 22) / 8);
+			c[2].a = 0xFF;
+
+			c[3].r = (uint8_t)(((2 * r1 + r0) * 22) / 8);
+			c[3].g = (uint8_t)((256 * c[1].g - gdiff/4 + 128 - gdiff * 80) / 256); // Mirror of c[2].g, starting from c[1] and moving toward c[0].
+			c[3].b = (uint8_t)(((2 * b1 + b0) * 22) / 8);
+			c[3].a = 0xFF;
+		}
+		else // Three colors plus transparent black: c[2] is the midpoint, c[3] is (0,0,0,0).
+		{
+			c[2].r = (uint8_t)(((r0 + r1) * 33) / 8);
+			c[2].g = (uint8_t)((256 * c[0].g + gdiff/4 + 128 + gdiff * 128) / 256); // Green midpoint: weight 128/256.
+			c[2].b = (uint8_t)(((b0 + b1) * 33) / 8);
+			c[2].a = 0xFF;
+
+			c[3].set_noclamp_rgba(0, 0, 0, 0);
+			used_punchthrough = true;
+		}
+
+		if (set_alpha) // Copy whole palette entries, alpha included.
+		{
+			for (uint32_t y = 0; y < 4; y++, pPixels += 4) // One output row of four pixels per iteration.
+			{
+				pPixels[0] = c[pBlock->get_selector(0, y)]; 
+				pPixels[1] = c[pBlock->get_selector(1, y)]; 
+				pPixels[2] = c[pBlock->get_selector(2, y)]; 
+				pPixels[3] = c[pBlock->get_selector(3, y)];
+			}
+		}
+		else // Write through set_rgb() instead. NOTE(review): presumably leaves the destination alpha untouched - confirm in color_rgba.
+		{
+			for (uint32_t y = 0; y < 4; y++, pPixels += 4)
+			{
+				pPixels[0].set_rgb(c[pBlock->get_selector(0, y)]); 
+				pPixels[1].set_rgb(c[pBlock->get_selector(1, y)]); 
+				pPixels[2].set_rgb(c[pBlock->get_selector(2, y)]); 
+				pPixels[3].set_rgb(c[pBlock->get_selector(3, y)]);
+			}
+		}
+
+		return used_punchthrough;
+	}
+
+	static inline int interp_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 * 43 + c1 * 21 + 32) >> 6; } // Blend weighted 43/64 toward c0 and 21/64 toward c1, with +32 rounding before the shift.
+	static inline int interp_half_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); return (c0 + c1 + 1) >> 1; } // Average of c0 and c1, rounded up on ties.
+
+	bool unpack_bc1_amd(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha) // Decodes one BC1 block into 16 pixels using the interp_*_amd() blends for the intermediate colors; returns true when the 3-color + transparent layout was used. NOTE(review): presumably models AMD hardware decoding (per the _amd suffix) - confirm.
+	{
+		const bc1_block *pBlock = static_cast<const bc1_block *>(pBlock_bits);
+
+		const uint32_t l = pBlock->get_low_color();
+		const uint32_t h = pBlock->get_high_color();
+
+		color_rgba c[4];
+
+		uint32_t r0, g0, b0, r1, g1, b1;
+		bc1_block::unpack_color(l, r0, g0, b0); // NOTE(review): assumes unpack_color() yields 8-bit components, as the <256 asserts in the interp helpers require - confirm.
+		bc1_block::unpack_color(h, r1, g1, b1);
+
+		c[0].set_noclamp_rgba(r0, g0, b0, 255);
+		c[1].set_noclamp_rgba(r1, g1, b1, 255);
+				
+		bool used_punchthrough = false;
+
+		if (l > h) // Four opaque colors.
+		{
+			c[2].set_noclamp_rgba(interp_5_6_amd(r0, r1), interp_5_6_amd(g0, g1), interp_5_6_amd(b0, b1), 255); // Weighted toward endpoint 0.
+			c[3].set_noclamp_rgba(interp_5_6_amd(r1, r0), interp_5_6_amd(g1, g0), interp_5_6_amd(b1, b0), 255); // Same blend with the endpoints swapped, so weighted toward endpoint 1.
+		}
+		else // Three colors plus transparent black.
+		{
+			c[2].set_noclamp_rgba(interp_half_5_6_amd(r0, r1), interp_half_5_6_amd(g0, g1), interp_half_5_6_amd(b0, b1), 255);
+			c[3].set_noclamp_rgba(0, 0, 0, 0);
+			used_punchthrough = true;
+		}
+
+		if (set_alpha) // Copy whole palette entries, alpha included.
+		{
+			for (uint32_t y = 0; y < 4; y++, pPixels += 4) // One output row of four pixels per iteration.
+			{
+				pPixels[0] = c[pBlock->get_selector(0, y)]; 
+				pPixels[1] = c[pBlock->get_selector(1, y)]; 
+				pPixels[2] = c[pBlock->get_selector(2, y)]; 
+				pPixels[3] = c[pBlock->get_selector(3, y)];
+			}
+		}
+		else // Write through set_rgb() instead. NOTE(review): presumably leaves the destination alpha untouched - confirm in color_rgba.
+		{
+			for (uint32_t y = 0; y < 4; y++, pPixels += 4)
+			{
+				pPixels[0].set_rgb(c[pBlock->get_selector(0, y)]); 
+				pPixels[1].set_rgb(c[pBlock->get_selector(1, y)]); 
+				pPixels[2].set_rgb(c[pBlock->get_selector(2, y)]); 
+				pPixels[3].set_rgb(c[pBlock->get_selector(3, y)]);
+			}
+		}
+
+		return used_punchthrough;
+	}
+
 	struct bc4_block
 	{
 		enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 };
@ -292,7 +399,7 @@ namespace basisu

 		if (mode)
 		{
-			c[1].set(std::max(0, c[0].r - (c[3].r >> 2)), std::max(0, c[0].g - (c[3].g >> 2)), std::max(0, c[0].b - (c[3].b >> 2)), 255);
+			c[1].set(basisu::maximum(0, c[0].r - (c[3].r >> 2)), basisu::maximum(0, c[0].g - (c[3].g >> 2)), basisu::maximum(0, c[0].b - (c[3].b >> 2)), 255);
 			c[2] = c[0];
 			c[0].set(0, 0, 0, 255);
 		}
@ -317,6 +424,191 @@ namespace basisu
 		}
 	}

+	// BC7 mode 0-7 decompression.
+	// Instead of one monster routine to unpack all the BC7 modes, we're lumping the 3 subset, 2 subset, 1 subset, and dual plane modes together into simple shared routines.
+
+	static inline uint32_t bc7_dequant(uint32_t val, uint32_t pbit, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(pbit < 2); assert(val_bits >= 4 && val_bits <= 8); const uint32_t total_bits = val_bits + 1; val = (val << 1) | pbit; val <<= (8 - total_bits); val |= (val >> total_bits); assert(val <= 255); return val; } // Expands an endpoint value to 8 bits: appends the p-bit as the new LSB, left-aligns the result in a byte and fills the low bits by replicating the top bits.
+	static inline uint32_t bc7_dequant(uint32_t val, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(val_bits >= 4 && val_bits <= 8); val <<= (8 - val_bits); val |= (val >> val_bits); assert(val <= 255); return val; } // Same expansion for endpoints without a p-bit.
+
+	static inline uint32_t bc7_interp2(uint32_t l, uint32_t h, uint32_t w) { assert(w < 4); return (l * (64 - basist::g_bc7_weights2[w]) + h * basist::g_bc7_weights2[w] + 32) >> 6; } // Blend of l and h using table weight w (out of 64) for 2-bit indices, rounded.
+	static inline uint32_t bc7_interp3(uint32_t l, uint32_t h, uint32_t w) { assert(w < 8); return (l * (64 - basist::g_bc7_weights3[w]) + h * basist::g_bc7_weights3[w] + 32) >> 6; } // Same for 3-bit indices.
+	static inline uint32_t bc7_interp4(uint32_t l, uint32_t h, uint32_t w) { assert(w < 16); return (l * (64 - basist::g_bc7_weights4[w]) + h * basist::g_bc7_weights4[w] + 32) >> 6; } // Same for 4-bit indices.
+	static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t bits) // Dispatches to the interpolator matching the index width 'bits' (2, 3 or 4).
+	{
+		assert(l <= 255 && h <= 255);
+		switch (bits)
+		{
+		case 2: return bc7_interp2(l, h, w);
+		case 3: return bc7_interp3(l, h, w);
+		case 4: return bc7_interp4(l, h, w);
+		default: 
+			break;
+		}
+		return 0; // Any other index width yields 0 rather than asserting.
+	}
+		
+	bool unpack_bc7_mode0_2(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) // Decodes a BC7 mode 0 or mode 2 block (three subsets, RGB only, alpha forced to 255) into 16 pixels; returns false if the block's mode prefix does not match 'mode'.
+	{
+		//const uint32_t SUBSETS = 3;
+		const uint32_t ENDPOINTS = 6;
+		const uint32_t COMPS = 3;
+		const uint32_t WEIGHT_BITS = (mode == 0) ? 3 : 2;
+		const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5;
+		const uint32_t PBITS = (mode == 0) ? 6 : 0; // Mode 0 carries one p-bit per endpoint; mode 2 has none.
+		const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
+		
+		uint32_t bit_offset = 0;
+		const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits);
+
+		if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; // The mode is encoded as 'mode' zero bits followed by a one bit.
+
+		const uint32_t part = read_bits32(pBuf, bit_offset, (mode == 0) ? 4 : 6); // Partition index: 4 bits in mode 0, 6 bits in mode 2.
+
+		color_rgba endpoints[ENDPOINTS];
+		for (uint32_t c = 0; c < COMPS; c++) // Endpoints are stored component by component: every endpoint's R, then every G, then every B.
+			for (uint32_t e = 0; e < ENDPOINTS; e++)
+				endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS);
+
+		uint32_t pbits[6]; // Left unwritten in mode 2, where it is never read (the PBITS test below selects the p-bit-free dequant).
+		for (uint32_t p = 0; p < PBITS; p++)
+			pbits[p] = read_bits32(pBuf, bit_offset, 1);
+
+		uint32_t weights[16];
+		for (uint32_t i = 0; i < 16; i++) // Pixel 0 and the two table-defined anchor pixels are stored with one bit fewer than the other indices.
+			weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == basist::g_bc7_table_anchor_index_third_subset_1[part]) || (i == basist::g_bc7_table_anchor_index_third_subset_2[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS);
+
+		assert(bit_offset == 128); // All 128 bits of the block must have been consumed.
+
+		for (uint32_t e = 0; e < ENDPOINTS; e++)
+			for (uint32_t c = 0; c < 4; c++) // Expand RGB to 8 bits; component 3 (alpha) is set to 255.
+				endpoints[e][c] = (uint8_t)((c == 3) ? 255 : (PBITS ? bc7_dequant(endpoints[e][c], pbits[e], ENDPOINT_BITS) : bc7_dequant(endpoints[e][c], ENDPOINT_BITS)));
+
+		color_rgba block_colors[3][8]; // One interpolated palette per subset; only the first WEIGHT_VALS entries are filled.
+		for (uint32_t s = 0; s < 3; s++)
+			for (uint32_t i = 0; i < WEIGHT_VALS; i++)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+					block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS);
+				block_colors[s][i][3] = 255;
+			}
+
+		for (uint32_t i = 0; i < 16; i++) // The partition table picks the subset for each pixel; the pixel's index picks the palette entry.
+			pPixels[i] = block_colors[basist::g_bc7_partition3[part * 16 + i]][weights[i]];
+
+		return true;
+	}
+
+	bool unpack_bc7_mode1_3_7(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) // Decodes a BC7 mode 1, 3 or 7 block (two subsets; only mode 7 stores alpha) into 16 pixels; returns false if the block's mode prefix does not match 'mode'.
+	{
+		//const uint32_t SUBSETS = 2;
+		const uint32_t ENDPOINTS = 4;
+		const uint32_t COMPS = (mode == 7) ? 4 : 3; // Mode 7 stores RGBA endpoints; modes 1 and 3 store RGB.
+		const uint32_t WEIGHT_BITS = (mode == 1) ? 3 : 2;
+		const uint32_t ENDPOINT_BITS = (mode == 7) ? 5 : ((mode == 1) ? 6 : 7);
+		const uint32_t PBITS = (mode == 1) ? 2 : 4;
+		const uint32_t SHARED_PBITS = (mode == 1) ? true : false; // Mode 1 has one p-bit per subset (shared by both of its endpoints); the other modes have one per endpoint.
+		const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
+		
+		uint32_t bit_offset = 0;
+		const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits);
+
+		if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; // The mode is encoded as 'mode' zero bits followed by a one bit.
+
+		const uint32_t part = read_bits32(pBuf, bit_offset, 6);
+
+		color_rgba endpoints[ENDPOINTS];
+		for (uint32_t c = 0; c < COMPS; c++) // Endpoints are stored component by component, across all four endpoints.
+			for (uint32_t e = 0; e < ENDPOINTS; e++)
+				endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS);
+		
+		uint32_t pbits[4];
+		for (uint32_t p = 0; p < PBITS; p++)
+			pbits[p] = read_bits32(pBuf, bit_offset, 1);
+						
+		uint32_t weights[16];
+		for (uint32_t i = 0; i < 16; i++) // Pixel 0 and the second subset's anchor pixel are stored with one bit fewer.
+			weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == basist::g_bc7_table_anchor_index_second_subset[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS);
+		
+		assert(bit_offset == 128); // All 128 bits of the block must have been consumed.
+
+		for (uint32_t e = 0; e < ENDPOINTS; e++)
+			for (uint32_t c = 0; c < 4; c++) // In mode 7 the comparison value is 4, which c never reaches, so alpha is dequantized too; otherwise component 3 is set to 255.
+				endpoints[e][c] = (uint8_t)((c == ((mode == 7U) ? 4U : 3U)) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS));
+		
+		color_rgba block_colors[2][8]; // One interpolated palette per subset; only the first WEIGHT_VALS entries are filled.
+		for (uint32_t s = 0; s < 2; s++)
+			for (uint32_t i = 0; i < WEIGHT_VALS; i++)
+			{
+				for (uint32_t c = 0; c < COMPS; c++)
+					block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS);
+				block_colors[s][i][3] = (COMPS == 3) ? 255 : block_colors[s][i][3]; // RGB-only modes get opaque alpha; mode 7 keeps the interpolated alpha.
+			}
+
+		for (uint32_t i = 0; i < 16; i++) // The partition table picks the subset for each pixel; the pixel's index picks the palette entry.
+			pPixels[i] = block_colors[basist::g_bc7_partition2[part * 16 + i]][weights[i]];
+
+		return true;
+	}
+
+	bool unpack_bc7_mode4_5(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels) // Decodes a BC7 mode 4 or 5 block (one endpoint pair, separate index sets for color and alpha, optional component rotation) into 16 pixels; returns false if the block's mode prefix does not match 'mode'.
+	{
+		const uint32_t ENDPOINTS = 2;
+		const uint32_t COMPS = 4;
+		const uint32_t WEIGHT_BITS = 2;
+		const uint32_t A_WEIGHT_BITS = (mode == 4) ? 3 : 2;
+		const uint32_t ENDPOINT_BITS = (mode == 4) ? 5 : 7;
+		const uint32_t A_ENDPOINT_BITS = (mode == 4) ? 6 : 8;
+		//const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
+		//const uint32_t A_WEIGHT_VALS = 1 << A_WEIGHT_BITS;
+
+		uint32_t bit_offset = 0;
+		const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits);
+
+		if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false; // The mode is encoded as 'mode' zero bits followed by a one bit.
+
+		const uint32_t comp_rot = read_bits32(pBuf, bit_offset, 2); // 0 = no rotation; 1..3 = swap alpha with component comp_rot - 1 after decoding.
+		const uint32_t index_mode = (mode == 4) ? read_bits32(pBuf, bit_offset, 1) : 0; // Only mode 4 stores this bit; when set, color and alpha exchange index widths.
+
+		color_rgba endpoints[ENDPOINTS];
+		for (uint32_t c = 0; c < COMPS; c++) // Stored component by component; alpha endpoints use their own bit width.
+			for (uint32_t e = 0; e < ENDPOINTS; e++)
+				endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS);
+		
+		const uint32_t weight_bits[2] = { index_mode ? A_WEIGHT_BITS : WEIGHT_BITS,  index_mode ? WEIGHT_BITS : A_WEIGHT_BITS }; // [0] = width of the color indices, [1] = width of the alpha indices.
+		
+		uint32_t weights[16], a_weights[16];
+		
+		for (uint32_t i = 0; i < 16; i++) // First index array in the stream (always WEIGHT_BITS wide): color indices normally, alpha indices when index_mode is set. Pixel 0 is stored with one bit fewer.
+			(index_mode ? a_weights : weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[index_mode] - ((!i) ? 1 : 0));
+
+		for (uint32_t i = 0; i < 16; i++) // Second index array (A_WEIGHT_BITS wide): goes to whichever of weights/a_weights was not filled above.
+			(index_mode ? weights : a_weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[1 - index_mode] - ((!i) ? 1 : 0));
+
+		assert(bit_offset == 128); // All 128 bits of the block must have been consumed.
+
+		for (uint32_t e = 0; e < ENDPOINTS; e++)
+			for (uint32_t c = 0; c < 4; c++) // Expand each component to 8 bits; these modes have no p-bits.
+				endpoints[e][c] = (uint8_t)bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS);
+
+		color_rgba block_colors[8]; // RGB and alpha palettes share this array but may have different entry counts.
+		for (uint32_t i = 0; i < (1U << weight_bits[0]); i++)
+			for (uint32_t c = 0; c < 3; c++)
+				block_colors[i][c] = (uint8_t)bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]);
+
+		for (uint32_t i = 0; i < (1U << weight_bits[1]); i++)
+			block_colors[i][3] = (uint8_t)bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]);
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			pPixels[i] = block_colors[weights[i]]; // RGB comes from the color index...
+			pPixels[i].a = block_colors[a_weights[i]].a; // ...and alpha from the independent alpha index.
+			if (comp_rot >= 1)
+				std::swap(pPixels[i].a, pPixels[i].m_comps[comp_rot - 1]); // Apply the rotation: exchange alpha with R, G or B.
+		}
+
+		return true;
+	}
+
 	struct bc7_mode_6
 	{
 		struct
@ -364,9 +656,6 @@ namespace basisu
 		};
 	};

-	static const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
-	
-	// The transcoder only outputs mode 6 at the moment, so this is easy.
 	bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
 	{
 		static_assert(sizeof(bc7_mode_6) == 16, "sizeof(bc7_mode_6) == 16");
@ -388,7 +677,7 @@ namespace basisu
 		color_rgba vals[16];
 		for (uint32_t i = 0; i < 16; i++)
 		{
-			const uint32_t w = g_bc7_weights4[i];
+			const uint32_t w = basist::g_bc7_weights4[i];
 			const uint32_t iw = 64 - w;
 			vals[i].set_noclamp_rgba( 
 				(r0 * iw + r1 * w + 32) >> 6, 
@ -420,183 +709,37 @@ namespace basisu
 		return true;
 	}

-	static inline uint32_t get_block_bits(const uint8_t* pBytes, uint32_t bit_ofs, uint32_t bits_wanted)
+	bool unpack_bc7(const void *pBlock, color_rgba *pPixels)
 	{
-		assert(bits_wanted < 32);
+		const uint32_t first_byte = static_cast<const uint8_t*>(pBlock)[0];

-		uint32_t v = 0;
-		uint32_t total_bits = 0;
-
-		while (total_bits < bits_wanted)
+		for (uint32_t mode = 0; mode <= 7; mode++)
 		{
-			uint32_t k = pBytes[bit_ofs >> 3];
-			k >>= (bit_ofs & 7);
-			uint32_t num_bits_in_byte = 8 - (bit_ofs & 7);
-
-			v |= (k << total_bits);
-			total_bits += num_bits_in_byte;
-			bit_ofs += num_bits_in_byte;
-		}
-
-		return v & ((1 << bits_wanted) - 1);
-	}
-						
-	struct bc7_mode_5
-	{
-		union
-		{
-			struct
+			if (first_byte & (1U << mode))
 			{
-				uint64_t m_mode : 6;
-				uint64_t m_rot : 2;
-				
-				uint64_t m_r0 : 7;
-				uint64_t m_r1 : 7;
-				uint64_t m_g0 : 7;
-				uint64_t m_g1 : 7;
-				uint64_t m_b0 : 7;
-				uint64_t m_b1 : 7;
-				uint64_t m_a0 : 8;
-				uint64_t m_a1_0 : 6;
-
-			} m_lo;
-
-			uint64_t m_lo_bits;
-		};
-
-		union
-		{
-			struct
-			{
-				uint64_t m_a1_1 : 2;
-
-				// bit 2
-				uint64_t m_c00 : 1;
-				uint64_t m_c10 : 2;
-				uint64_t m_c20 : 2;
-				uint64_t m_c30 : 2;
-
-				uint64_t m_c01 : 2;
-				uint64_t m_c11 : 2;
-				uint64_t m_c21 : 2;
-				uint64_t m_c31 : 2;
-
-				uint64_t m_c02 : 2;
-				uint64_t m_c12 : 2;
-				uint64_t m_c22 : 2;
-				uint64_t m_c32 : 2;
-
-				uint64_t m_c03 : 2;
-				uint64_t m_c13 : 2;
-				uint64_t m_c23 : 2;
-				uint64_t m_c33 : 2;
-
-				// bit 33
-				uint64_t m_a00 : 1;
-				uint64_t m_a10 : 2;
-				uint64_t m_a20 : 2;
-				uint64_t m_a30 : 2;
-
-				uint64_t m_a01 : 2;
-				uint64_t m_a11 : 2;
-				uint64_t m_a21 : 2;
-				uint64_t m_a31 : 2;
-
-				uint64_t m_a02 : 2;
-				uint64_t m_a12 : 2;
-				uint64_t m_a22 : 2;
-				uint64_t m_a32 : 2;
-
-				uint64_t m_a03 : 2;
-				uint64_t m_a13 : 2;
-				uint64_t m_a23 : 2;
-				uint64_t m_a33 : 2;
-
-			} m_hi;
-
-			uint64_t m_hi_bits;
-		};
-
-		color_rgba get_low_color() const
-		{
-			return color_rgba(cNoClamp,
-				(int)((m_lo.m_r0 << 1) | (m_lo.m_r0 >> 6)),
-				(int)((m_lo.m_g0 << 1) | (m_lo.m_g0 >> 6)),
-				(int)((m_lo.m_b0 << 1) | (m_lo.m_b0 >> 6)),
-				m_lo.m_a0);
-		}
-
-		color_rgba get_high_color() const
-		{
-			return color_rgba(cNoClamp,
-				(int)((m_lo.m_r1 << 1) | (m_lo.m_r1 >> 6)),
-				(int)((m_lo.m_g1 << 1) | (m_lo.m_g1 >> 6)),
-				(int)((m_lo.m_b1 << 1) | (m_lo.m_b1 >> 6)),
-				(int)m_lo.m_a1_0 | ((int)m_hi.m_a1_1 << 6));
-		}
-
-		void get_block_colors(color_rgba* pColors) const
-		{
-			const color_rgba low_color(get_low_color());
-			const color_rgba high_color(get_high_color());
-
-			for (uint32_t i = 0; i < 4; i++)
-			{
-				static const uint32_t s_bc7_weights2[4] = { 0, 21, 43, 64 };
-
-				pColors[i].set_noclamp_rgba(
-					(low_color.r * (64 - s_bc7_weights2[i]) + high_color.r * s_bc7_weights2[i] + 32) >> 6,
-					(low_color.g * (64 - s_bc7_weights2[i]) + high_color.g * s_bc7_weights2[i] + 32) >> 6,
-					(low_color.b * (64 - s_bc7_weights2[i]) + high_color.b * s_bc7_weights2[i] + 32) >> 6,
-					(low_color.a * (64 - s_bc7_weights2[i]) + high_color.a * s_bc7_weights2[i] + 32) >> 6);
+				switch (mode)
+				{
+				case 0:
+				case 2:
+					return unpack_bc7_mode0_2(mode, pBlock, pPixels);
+				case 1:
+				case 3:
+				case 7:
+					return unpack_bc7_mode1_3_7(mode, pBlock, pPixels);
+				case 4:
+				case 5:
+					return unpack_bc7_mode4_5(mode, pBlock, pPixels);
+				case 6:
+					return unpack_bc7_mode6(pBlock, pPixels);
+				default:
+					break;
+				}
 			}
-		} 
-
-		uint32_t get_selector(uint32_t idx, bool alpha) const
-		{
-			const uint32_t size = (idx == 0) ? 1 : 2;
-
-			uint32_t ofs = alpha ? 97 : 66;
-			
-			if (idx)
-				ofs += 1 + 2 * (idx - 1);
-
-			return get_block_bits(reinterpret_cast<const uint8_t*>(this), ofs, size);
-		}
-	};
-
-	bool unpack_bc7_mode5(const void* pBlock_bits, color_rgba* pPixels)
-	{
-		static_assert(sizeof(bc7_mode_5) == 16, "sizeof(bc7_mode_5) == 16");
-
-		const bc7_mode_5& block = *static_cast<const bc7_mode_5*>(pBlock_bits);
-
-		if (block.m_lo.m_mode != (1 << 5))
-			return false;
-				
-		color_rgba block_colors[4];
-		block.get_block_colors(block_colors);
-
-		const uint32_t rot = block.m_lo.m_rot;
-
-		for (uint32_t i = 0; i < 16; i++)
-		{
-			const uint32_t cs = block.get_selector(i, false);
-
-			color_rgba c(block_colors[cs]);
-
-			const uint32_t as = block.get_selector(i, true);
-			c.a = block_colors[as].a;
-
-			if (rot > 0)
-				std::swap(c[3], c[rot - 1]);
-
-			pPixels[i] = c;
 		}

-		return true;
+		return false;
 	}
-
+	
 	struct fxt1_block
 	{
 		union
@ -903,13 +1046,14 @@ namespace basisu
 		etc2_eac_r11 m_c[2];
 	};

-	static void unpack_etc2_eac_r(const etc2_eac_r11* p, color_rgba* pPixels, uint32_t c)
+	void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c)
 	{
-		const uint64_t sels = p->get_sels();
+		const etc2_eac_r11* pBlock = static_cast<const etc2_eac_r11*>(p);
+		const uint64_t sels = pBlock->get_sels();

-		const int base = (int)p->m_base * 8 + 4;
-		const int mul = p->m_mul ? ((int)p->m_mul * 8) : 1;
-		const int table = (int)p->m_table;
+		const int base = (int)pBlock->m_base * 8 + 4;
+		const int mul = pBlock->m_mul ? ((int)pBlock->m_mul * 8) : 1;
+		const int table = (int)pBlock->m_table;

 		for (uint32_t y = 0; y < 4; y++)
 		{
@ -923,7 +1067,8 @@ namespace basisu
 				val = clamp<int>(val, 0, 2047);

 				// Convert to 8-bits with rounding
-				pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val * 255 + 1024) / 2047);
+				//pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val * 255 + 1024) / 2047);
+				pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val * 255 + 1023) / 2047);

 			} // x
 		} // y
@ -939,6 +1084,11 @@ namespace basisu
 		}
 	}
 	
+	void unpack_uastc(const void* p, color_rgba* pPixels) // Thin wrapper that forwards an untyped block pointer to basist::unpack_uastc(), writing the decoded pixels to pPixels.
+	{
+		basist::unpack_uastc(*static_cast<const basist::uastc_block*>(p), (basist::color32 *)pPixels, false); // NOTE(review): the cast assumes color_rgba and basist::color32 share a layout - confirm. The meaning of the trailing 'false' is defined by the transcoder and is not visible here.
+	}
+	
 	// Unpacks to RGBA, R, RG, or A
 	bool unpack_block(texture_format fmt, const void* pBlock, color_rgba* pPixels)
 	{
@ -949,6 +1099,16 @@ namespace basisu
 			unpack_bc1(pBlock, pPixels, true);
 			break;
 		}
+		case texture_format::cBC1_NV:
+		{
+			unpack_bc1_nv(pBlock, pPixels, true);
+			break;
+		}
+		case texture_format::cBC1_AMD:
+		{
+			unpack_bc1_amd(pBlock, pPixels, true);
+			break;
+		}
 		case texture_format::cBC3:
 		{
 			return unpack_bc3(pBlock, pPixels);
@ -966,14 +1126,7 @@ namespace basisu
 		}
 		case texture_format::cBC7:
 		{
-			// We only support modes 5 and 6.
-			if (!unpack_bc7_mode5(pBlock, pPixels))
-			{
-				if (!unpack_bc7_mode6(pBlock, pPixels))
-					return false;
-			}
-
-			break;
+			return unpack_bc7(pBlock, pPixels);
 		}
 		// Full ETC2 color blocks (planar/T/H modes) is currently unsupported in basisu, but we do support ETC2 with alpha (using ETC1 for color)
 		case texture_format::cETC2_RGB:
@ -1032,6 +1185,11 @@ namespace basisu
 			unpack_etc2_eac_rg(pBlock, pPixels);
 			break;
 		}
+		case texture_format::cUASTC4x4:
+		{
+			unpack_uastc(pBlock, pPixels);
+			break;
+		}
 		default:
 		{
 			assert(0);
@ -1113,6 +1271,7 @@ namespace basisu
 		KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02,
 		KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0,
 		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR = 0x93D0,
+		KTX_COMPRESSED_RGBA_UASTC_4x4_KHR = 0x94CC, // TODO - Use proper value!
 		KTX_ATC_RGB_AMD = 0x8C92,
 		KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD = 0x87EE,
 		KTX_COMPRESSED_RGB_FXT1_3DFX = 0x86B0,
@ -1143,7 +1302,7 @@ namespace basisu
 	};

 	// Input is a texture array of mipmapped gpu_image's: gpu_images[array_index][level_index]
-	bool create_ktx_texture_file(uint8_vec &ktx_data, const std::vector<gpu_image_vec>& gpu_images, bool cubemap_flag)
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag)
 	{
 		if (!gpu_images.size())
 		{
@ -1220,6 +1379,8 @@ namespace basisu
 		switch (fmt)
 		{
 		case texture_format::cBC1:
+		case texture_format::cBC1_NV:
+		case texture_format::cBC1_AMD:
 		{
 			internal_fmt = KTX_COMPRESSED_RGB_S3TC_DXT1_EXT;
 			break;
@ -1305,6 +1466,12 @@ namespace basisu
 			base_internal_fmt = KTX_RG;
 			break;
 		}
+		case texture_format::cUASTC4x4:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_UASTC_4x4_KHR;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
 		case texture_format::cFXT1_RGB:
 		{
 			internal_fmt = KTX_COMPRESSED_RGB_FXT1_3DFX;
@ -1378,7 +1545,7 @@ namespace basisu
 		return true;
 	}

-	bool write_compressed_texture_file(const char* pFilename, const std::vector<gpu_image_vec>& g, bool cubemap_flag)
+	bool write_compressed_texture_file(const char* pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag)
 	{
 		std::string extension(string_tolower(string_get_extension(pFilename)));

@ -1410,12 +1577,12 @@ namespace basisu

 	bool write_compressed_texture_file(const char* pFilename, const gpu_image& g)
 	{
-		std::vector<gpu_image_vec> v;
+		basisu::vector<gpu_image_vec> v; // nested container that will hold only g, for the overload called below (cubemap_flag = false)
 		enlarge_vector(v, 1)->push_back(g);
 		return write_compressed_texture_file(pFilename, v, false);
 	}

-	const uint32_t OUT_FILE_MAGIC = 'TEXC';
+	//const uint32_t OUT_FILE_MAGIC = 'TEXC';
 	struct out_file_header 
 	{
 		packed_uint<4> m_magic;
@ -1428,7 +1595,11 @@ namespace basisu
 	bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi)
 	{
 		out_file_header hdr;
-		hdr.m_magic = OUT_FILE_MAGIC;
+		//hdr.m_magic = OUT_FILE_MAGIC;
+		hdr.m_magic.m_bytes[0] = 67;
+		hdr.m_magic.m_bytes[1] = 88;
+		hdr.m_magic.m_bytes[2] = 69;
+		hdr.m_magic.m_bytes[3] = 84;
 		hdr.m_pad = 0;
 		hdr.m_width = gi.get_blocks_x() * 8;
 		hdr.m_height = gi.get_blocks_y() * 4;
--- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
+++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
@ -1,5 +1,5 @@
 // basisu_gpu_texture.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -13,13 +13,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include "transcoder/basisu.h"
+#include "../transcoder/basisu.h"
 #include "basisu_etc.h"

 namespace basisu
 {
-	// GPU texture image
-
+	// GPU texture "image"
 	class gpu_image
 	{
 	public:
@ -115,17 +114,17 @@ namespace basisu
 		uint64_vec m_blocks;
 	};

-	typedef std::vector<gpu_image> gpu_image_vec;
+	typedef basisu::vector<gpu_image> gpu_image_vec;

 	// KTX file writing

-	bool create_ktx_texture_file(uint8_vec &ktx_data, const std::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
 		
-	bool write_compressed_texture_file(const char *pFilename, const std::vector<gpu_image_vec>& g, bool cubemap_flag);
+	bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag);
 	
 	inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g)
 	{
-		std::vector<gpu_image_vec> a;
+		basisu::vector<gpu_image_vec> a; // g becomes the single entry of the outer vector taken by the overload called below (cubemap_flag = false)
 		a.push_back(g);
 		return write_compressed_texture_file(pFilename, a, false);
 	}
@ -133,22 +132,23 @@ namespace basisu
 	bool write_compressed_texture_file(const char *pFilename, const gpu_image &g);
 	
 	bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi);
-	// GPU texture block unpacking

+	// GPU texture block unpacking
 	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels);
 	bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha);
 	void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride);
 	bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels);
 	void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels);
 	bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels);
-	bool unpack_bc7_mode5(const void* pBlock_bits, color_rgba* pPixels);
+	bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels);
 	void unpack_atc(const void* pBlock_bits, color_rgba* pPixels);
 	bool unpack_fxt1(const void* p, color_rgba* pPixels);
 	bool unpack_pvrtc2(const void* p, color_rgba* pPixels);
+	void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c);
 	void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels);

-	// unpack_block() is only capable of unpacking texture data created by the transcoder. 
-	// For some texture formats (like BC7, or ETC2) it's not a complete implementation.
+	// unpack_block() is primarily intended to unpack texture data created by the transcoder.
+	// For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not a complete implementation.
 	bool unpack_block(texture_format fmt, const void *pBlock, color_rgba *pPixels);
 			
 } // namespace basisu
--- a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
@ -0,0 +1,25 @@
+// basisu_kernels_declares.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if BASISU_SUPPORT_SSE // Prototypes of the CPPSPMD_NAME()-decorated kernel entry points. The first pair takes a read-only pSelectors.
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+// This pair takes a writable pSelectors (selector output) in addition to pDistance.
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+// This pair takes no selector array at all; pDistance is the only output parameter.
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+#endif // BASISU_SUPPORT_SSE
--- a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
@ -0,0 +1,584 @@
+// basisu_kernels_imp.h - Do not directly include
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using namespace CPPSPMD;
+
+namespace CPPSPMD_NAME(basisu_kernels_namespace)
+{
+   struct perceptual_distance_rgb_4_N : spmd_kernel // Kernel: totals a weighted RGB error over n pixels, each compared with the block color picked by its selector.
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance is the running total: reset here, and left holding a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Keep the 4 block colors twice: packed (load_rgba32) for the transpose path, per channel (store_all) for the uniform-selector path.
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+         // Vector loop: 4 pixels per iteration.
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+            // base_a is only written on the transpose path and is never read afterwards.
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3)) // all 4 pixels share one selector: reuse the per-channel copies
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+            // delta_l is a 27/92/9-weighted sum of the channel deltas (weights total 128); delta_cr/delta_cb are the 128-scaled red/blue deltas minus it.
+            vint delta_l = dr * 27 + dg * 92 + db * 9;
+            vint delta_cr = dr * 128 - delta_l;
+            vint delta_cb = db * 128 - delta_l;
+            // Each squared term is scaled down by >> 7; the cr and cb terms get extra weights of 26/128 and 3/128.
+            vint id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+            // Add this group's errors (reduce_add presumably sums the lanes) and stop once the total reaches early_out_err.
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+         // Scalar tail: the last n % 4 pixels, using the same expressions.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int delta_l = dr * 27 + dg * 92 + db * 9;
+            int delta_cr = dr * 128 - delta_l;
+            int delta_cb = db * 128 - delta_l;
+            // NOTE(review): if channels are 8-bit, |delta_cr| can reach 51510, so delta_cr * delta_cr can exceed INT_MAX - confirm this is acceptable.
+            int id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct linear_distance_rgb_4_N : spmd_kernel // Kernel: totals the squared RGB difference over n pixels, each compared with the block color picked by its selector.
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance is the running total: reset here, and left holding a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Keep the 4 block colors twice: packed (load_rgba32) for the transpose path, per channel (store_all) for the uniform-selector path.
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+         // Vector loop: 4 pixels per iteration.
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+            // base_a is only written on the transpose path and is never read afterwards.
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3)) // all 4 pixels share one selector: reuse the per-channel copies
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+            // Unweighted sum of squared channel differences; the alpha values are not used.
+            vint id = dr * dr + dg * dg + db * db;
+            // Add this group's errors (reduce_add presumably sums the lanes) and stop once the total reaches early_out_err.
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+         // Scalar tail: the last n % 4 pixels, using the same expression.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int id = dr * dr + dg * dg + db * db;
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_selectors_perceptual_rgb_4_N : spmd_kernel // Kernel: for each of n pixels, picks the block color (0..3) with the lowest weighted error, writes it to pSelectors and totals the errors.
+   {
+      inline vint compute_dist( // weighted error between one block color (base_*) and the pixel channels (r, g, b)
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+         // delta_l is a 27/92/9-weighted sum of the channel deltas (weights total 128); delta_cr/delta_cb are the 128-scaled red/blue deltas minus it.
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+         // Each squared term is shifted right by 7; the cr and cb terms get extra weights of 26/128 and 3/128.
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance is the running total: reset here, and left holding a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Per-channel copies of the 4 block colors (via store_all), reused for every pixel group.
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+         // Shuffle control: result bytes 0..3 take source bytes 0, 4, 8, 12; the -128 entries have the high bit set (zeroed bytes, assuming shuffle_epi8 follows _mm_shuffle_epi8).
+         const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t i;
+         // Vector loop: 4 pixels per iteration, each scored against all 4 block colors.
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+            // dist0 is tested first, so ties go to the lowest selector index (assuming spmd_ternaryi(c, a, b) yields a where c holds) - the same outcome as the strict '<' in the scalar tail.
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
+            // Narrow the 4 results to 4 bytes and store them at pSelectors + i (storeu_si32 presumably writes 32 bits, unaligned).
+            __m128i vsels = shuffle_epi8(sels.m_value, shuf);
+            storeu_si32((void *)(pSelectors + i), vsels);
+            // On early-out, selectors past this group are not written.
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+         // Scalar tail: the last n % 4 pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX, best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+               // NOTE(review): if channels are 8-bit, |delta_cr| can reach 51510, so delta_cr * delta_cr can exceed INT_MAX - confirm this is acceptable.
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               if (id < best_err)
+               {
+                  best_err = id;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[i] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_selectors_linear_rgb_4_N : spmd_kernel // Kernel: for each of n pixels, picks the block color (0..3) with the lowest squared RGB difference, writes it to pSelectors and totals the errors.
+   {
+      inline vint compute_dist( // sum of squared channel differences between one block color (base_*) and the pixel channels (r, g, b)
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint id = dr * dr + dg * dg + db * db;
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance is the running total: reset here, and left holding a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Per-channel copies of the 4 block colors (via store_all), reused for every pixel group.
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+         // Shuffle control: result bytes 0..3 take source bytes 0, 4, 8, 12; the -128 entries have the high bit set (zeroed bytes, assuming shuffle_epi8 follows _mm_shuffle_epi8).
+         const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t i;
+         // Vector loop: 4 pixels per iteration, each scored against all 4 block colors.
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+            // dist0 is tested first, so ties go to the lowest selector index (assuming spmd_ternaryi(c, a, b) yields a where c holds) - the same outcome as the strict '<' in the scalar tail.
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
+            // Narrow the 4 results to 4 bytes and store them at pSelectors + i (storeu_si32 presumably writes 32 bits, unaligned).
+            __m128i vsels = shuffle_epi8(sels.m_value, shuf);
+            storeu_si32((void *)(pSelectors + i), vsels);
+            // On early-out, selectors past this group are not written.
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err)
+               return;
+         }
+         // Scalar tail: the last n % 4 pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX, best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int id = dr * dr + dg * dg + db * db;
+               if (id < best_err)
+               {
+                  best_err = id;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[i] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel // Kernel: totals, over n pixels, the lowest weighted error reachable with any of the 4 block colors; no selectors are produced.
+   {
+      inline vint compute_dist( // weighted error between one block color (base_*) and the pixel channels (r, g, b)
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+         // delta_l is a 27/92/9-weighted sum of the channel deltas (weights total 128); delta_cr/delta_cb are the 128-scaled red/blue deltas minus it.
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+         // Each squared term is shifted right by 7; the cr and cb terms get extra weights of 26/128 and 3/128.
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+         // *pDistance is the running total: reset here, and left holding a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Per-channel copies of the 4 block colors (via store_all), reused for every pixel group.
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         // Vector loop: 4 pixels per iteration, each scored against all 4 block colors.
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+            // The early-out here is strict: it fires only when the total is greater than early_out_error.
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error)
+               return;
+         }
+         // Scalar tail: the last n % 4 pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+               // NOTE(review): if channels are 8-bit, |delta_cr| can reach 51510, so delta_cr * delta_cr can exceed INT_MAX - confirm this is acceptable.
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+   struct find_lowest_error_linear_rgb_4_N : spmd_kernel // Kernel: totals, over n pixels, the lowest squared RGB difference reachable with any of the 4 block colors; no selectors are produced.
+   {
+      inline vint compute_dist( // sum of squared channel differences between one block color (base_*) and the pixel channels (r, g, b)
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint id = dr * dr + dg * dg + db * db;
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+         // *pDistance is the running total: reset here, and left holding a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Per-channel copies of the 4 block colors (via store_all), reused for every pixel group.
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         // Vector loop: 4 pixels per iteration, each scored against all 4 block colors.
+         for (i = 0; (i + 4) <= n; i += 4)
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+            // The early-out here is strict: it fires only when the total is greater than early_out_error.
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error)
+               return;
+         }
+         // Scalar tail: the last n % 4 pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int id = dr * dr + dg * dg + db * db;
+
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+} // namespace
+
+using namespace CPPSPMD_NAME(basisu_kernels_namespace);
+
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{ // Free-function entry point: passes every argument through unchanged to the perceptual_distance_rgb_4_N kernel via spmd_call<>.
+   spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{ // Free-function entry point: passes every argument through unchanged to the linear_distance_rgb_4_N kernel via spmd_call<>.
+   spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{ // Free-function entry point: passes every argument through unchanged to the find_selectors_perceptual_rgb_4_N kernel via spmd_call<>.
+   spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{ // Free-function entry point: passes every argument through unchanged to the find_selectors_linear_rgb_4_N kernel via spmd_call<>.
+   spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
+{ // Free-function entry point: passes every argument through unchanged to the find_lowest_error_perceptual_rgb_4_N kernel via spmd_call<>.
+   spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
+{ // Free-function entry point: passes every argument through unchanged to the find_lowest_error_linear_rgb_4_N kernel via spmd_call<>.
+   spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+
--- a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
@ -0,0 +1,161 @@
+// basisu_kernels_sse.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_enc.h"
+
+#if BASISU_SUPPORT_SSE
+
+#define CPPSPMD_SSE2 0 // 0 selects the #else branch of the check below; non-zero selects the SSE2-only branch
+
+#ifdef _MSC_VER
+#include <intrin.h> // MSVC intrinsics header (declares __cpuid / __cpuidex)
+#endif
+
+#if !defined(_MSC_VER) // the compiler-flag checks below are skipped under MSVC
+	#if __AVX__ || __AVX2__ || __AVX512F__ // any AVX level enabled at compile time is rejected
+		#error Please check your compiler options
+	#endif
+	
+	#if CPPSPMD_SSE2
+		#if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__ // SSE2-only build: none of the later SSE levels may be enabled
+			#error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file
+		#endif
+	#else
+		#if !__SSE4_1__ || !__SSE3__ || __SSE4_2__ || !__SSSE3__ // requires SSE3, SSSE3 and SSE4.1 enabled, and SSE4.2 disabled
+			#error Please check your compiler options
+		#endif
+	#endif
+#endif
+
+#include "cppspmd_sse.h"
+
+#include "cppspmd_type_aliases.h"
+
+using namespace basisu;
+
+#include "basisu_kernels_declares.h"
+#include "basisu_kernels_imp.h"
+
+namespace basisu
+{
+
+struct cpu_info // x86 feature flags; get_cpuinfo() fills them in through the extract_x86_*() helpers.
+{
+	cpu_info() { memset(this, 0, sizeof(*this)); } // zero-fills the object, so every flag starts out false
+
+	bool m_has_fpu;
+	bool m_has_mmx;
+	bool m_has_sse;
+	bool m_has_sse2;
+	bool m_has_sse3;
+	bool m_has_ssse3;
+	bool m_has_sse41;
+	bool m_has_sse42;
+	bool m_has_avx;
+	bool m_has_avx2; // the only flag set by extract_x86_extended_flags()
+	bool m_has_pclmulqdq;
+};
+
+static void extract_x86_flags(cpu_info &info, uint32_t ecx, uint32_t edx) // Decodes feature bits; get_cpuinfo() passes the ECX/EDX results of CPUID leaf 1. m_has_avx2 is not touched here.
+{
+	info.m_has_fpu = (edx & (1 << 0)) != 0;
+	info.m_has_mmx = (edx & (1 << 23)) != 0;
+	info.m_has_sse = (edx & (1 << 25)) != 0;
+	info.m_has_sse2 = (edx & (1 << 26)) != 0;
+	info.m_has_sse3 = (ecx & (1 << 0)) != 0;
+	info.m_has_ssse3 = (ecx & (1 << 9)) != 0;
+	info.m_has_sse41 = (ecx & (1 << 19)) != 0;
+	info.m_has_sse42 = (ecx & (1 << 20)) != 0;
+	info.m_has_pclmulqdq = (ecx & (1 << 1)) != 0;
+	info.m_has_avx = (ecx & (1 << 28)) != 0;
+}
+
+static void extract_x86_extended_flags(cpu_info &info, uint32_t ebx) // Decodes AVX2 from bit 5 of ebx; get_cpuinfo() passes the EBX result of CPUID leaf 7, subleaf 0.
+{
+	info.m_has_avx2 = (ebx & (1 << 5)) != 0;
+}
+
+#ifndef _MSC_VER
+static void do_cpuid(uint32_t eax, uint32_t ecx, uint32_t* regs) // Runs CPUID for leaf eax / subleaf ecx; regs[0..3] receive EAX, EBX, ECX, EDX.
+{
+	uint32_t ebx = 0, edx = 0;
+	// i386 PIC builds: EBX is copied to EDI before the instruction and swapped back afterwards, so CPUID's EBX result is read from EDI ("=D") - presumably because EBX is reserved in that configuration.
+#if defined(__PIC__) && defined(__i386__)
+	__asm__("movl %%ebx, %%edi;"
+		"cpuid;"
+		"xchgl %%ebx, %%edi;"
+		: "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
+#else
+	__asm__("cpuid;" : "+b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
+#endif
+
+	regs[0] = eax; regs[1] = ebx; regs[2] = ecx; regs[3] = edx;
+}
+#endif
+
+static void get_cpuinfo(cpu_info &info) // Queries CPUID and stores the reported feature flags in info; flags for leaves that are not queried are left as passed in.
+{
+	int regs[4];
+	// Leaf 0: regs[0] (EAX) is taken as the highest leaf that may be queried below.
+#ifdef _MSC_VER
+	__cpuid(regs, 0);
+#else
+	do_cpuid(0, 0, (uint32_t *)regs);
+#endif
+
+	const uint32_t max_eax = regs[0];
+	// Leaf 1: feature bits in ECX (regs[2]) and EDX (regs[3]).
+	if (max_eax >= 1U)
+	{
+#ifdef _MSC_VER
+		__cpuid(regs, 1);
+#else
+		do_cpuid(1, 0, (uint32_t*)regs);
+#endif
+		extract_x86_flags(info, regs[2], regs[3]);
+	}
+	// Leaf 7, subleaf 0: only EBX (regs[1]) is consumed.
+	if (max_eax >= 7U)
+	{
+#ifdef _MSC_VER
+		__cpuidex(regs, 7, 0);
+#else
+		do_cpuid(7, 0, (uint32_t*)regs);
+#endif
+
+		extract_x86_extended_flags(info, regs[1]);
+	}
+}
+
+void detect_sse41() // Sets g_cpu_supports_sse41 from a runtime CPUID query.
+{
+	cpu_info info;
+	get_cpuinfo(info);
+
+	// Check for everything from SSE to SSE 4.1
+	g_cpu_supports_sse41 = info.m_has_sse && info.m_has_sse2 && info.m_has_sse3 && info.m_has_ssse3 && info.m_has_sse41;
+}
+
+} // namespace basisu
+#else // #if BASISU_SUPPORT_SSE
+namespace basisu
+{
+
+void detect_sse41() // Variant compiled when BASISU_SUPPORT_SSE is off: intentionally empty, nothing is detected or assigned.
+{
+}
+
+} // namespace basisu
+#endif // #if BASISU_SUPPORT_SSE
+
--- a/thirdparty/basis_universal/encoder/basisu_miniz.h
+++ b/thirdparty/basis_universal/encoder/basisu_miniz.h
--- a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
@ -0,0 +1,564 @@
+// basisu_pvrtc1_4.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_pvrtc1_4.h"
+
+namespace basisu
+{
+#if 0
+	static const uint8_t g_pvrtc_5[32] = { 0,8,16,24,33,41,49,57,66,74,82,90,99,107,115,123,132,140,148,156,165,173,181,189,198,206,214,222,231,239,247,255 };
+	static const uint8_t g_pvrtc_4[16] = { 0,16,33,49,66,82,99,115,140,156,173,189,206,222,239,255 };
+	static const uint8_t g_pvrtc_3[8] = { 0,33,74,107,148,181,222,255 };
+	static const uint8_t g_pvrtc_alpha[9] = { 0,34,68,102,136,170,204,238,255 };
+#endif
+
+	static const uint8_t g_pvrtc_5_nearest[256] = { 0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31 };
+	static const uint8_t g_pvrtc_4_nearest[256] = { 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15 };
+#if 0
+	static const uint8_t g_pvrtc_3_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 };
+	static const uint8_t g_pvrtc_alpha_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8 };
+#endif
+
+#if 0
+	static const uint8_t g_pvrtc_5_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,
+		3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,
+		7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,
+		11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,
+		15,15,15,15,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,
+		19,19,19,19,19,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,
+		23,23,23,23,23,23,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,
+		27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31
+	};
+
+	static const uint8_t g_pvrtc_5_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,
+		4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,
+		8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,
+		12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,
+		16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,
+		20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,
+		24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,
+		28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31,31,31,31,31
+	};
+
+	static const uint8_t g_pvrtc_4_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,
+		9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,
+		11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,
+		13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15
+	};
+
+	static const uint8_t g_pvrtc_4_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,
+		8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,
+		10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,
+		12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,
+		14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+	};
+
+	static const uint8_t g_pvrtc_3_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7
+	};
+
+	static const uint8_t g_pvrtc_3_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+	};
+
+	static const uint8_t g_pvrtc_alpha_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8
+	};
+
+	static const uint8_t g_pvrtc_alpha_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+	};
+#endif
+
+	// Computes the swizzled index of (x, y) inside a width x height grid (both powers of two).
+	// The low bits of y and x are interleaved (y in the even bit positions, x in the odd ones)
+	// for as many bits as the smaller dimension has; the remaining high bits of the coordinate
+	// along the larger dimension are appended above the interleaved part.
+	uint32_t pvrtc4_swizzle_uv(uint32_t width, uint32_t height, uint32_t x, uint32_t y)
+	{
+		assert((x < width) && (y < height) && basisu::is_pow2(height) && basisu::is_pow2(width));
+
+		// Pick the smaller dimension and the coordinate that runs along the larger one.
+		const bool height_is_smaller = (height < width);
+		const uint32_t smaller_dim = height_is_smaller ? height : width;
+		uint32_t long_axis_coord = height_is_smaller ? x : y;
+
+		uint32_t result = 0;
+		uint32_t num_interleaved_bits = 0;
+
+		uint32_t src_mask = 1;
+		uint32_t dst_mask = 1;
+		while (src_mask < smaller_dim)
+		{
+			if ((y & src_mask) != 0)
+				result |= dst_mask;
+			if ((x & src_mask) != 0)
+				result |= (dst_mask * 2);
+
+			src_mask <<= 1;
+			dst_mask <<= 2;
+			++num_interleaved_bits;
+		}
+
+		// Drop the bits that were already interleaved and place the rest above them.
+		long_axis_coord >>= num_interleaved_bits;
+		result |= (long_axis_coord << (2 * num_interleaved_bits));
+
+		return result;
+	}
+
+	// Decodes endpoint 0 or 1 from the packed 32-bit endpoint word.
+	// Each endpoint occupies 16 bits; bit 15 of those selects the layout:
+	//   set   - opaque:      R5 G5 B4 (endpoint 0) or R5 G5 B5 (endpoint 1)
+	//   clear - translucent: A3 R4 G4 B3 (endpoint 0) or A3 R4 G4 B4 (endpoint 1)
+	// With unpack == false the raw stored fields are returned (alpha is 7 for opaque endpoints).
+	// With unpack == true every channel is bit-replicated up to 8 bits (alpha is 255 for opaque endpoints).
+	color_rgba pvrtc4_block::get_endpoint(uint32_t endpoint_index, bool unpack) const
+	{
+		assert(endpoint_index < 2);
+
+		const uint32_t bits = m_endpoints >> (endpoint_index * 16);
+		const bool is_endpoint0 = (endpoint_index == 0);
+
+		uint32_t r, g, b, a;
+
+		if ((bits & 0x8000) != 0)
+		{
+			// Opaque layout: red and green are always 5 bits wide.
+			r = (bits >> 10) & 31;
+			g = (bits >> 5) & 31;
+
+			if (is_endpoint0)
+			{
+				// Endpoint 0 stores only 4 bits of blue.
+				b = (bits >> 1) & 15;
+				if (unpack)
+					b = (b << 1) | (b >> 3);
+			}
+			else
+			{
+				b = bits & 31;
+			}
+
+			a = unpack ? 255 : 7;
+		}
+		else
+		{
+			// Translucent layout: 3 bits of alpha, 4 bits each of red and green.
+			a = (bits >> 12) & 7;
+			r = (bits >> 8) & 15;
+			g = (bits >> 4) & 15;
+
+			// Endpoint 0 stores 3 bits of blue, endpoint 1 stores 4.
+			b = is_endpoint0 ? ((bits >> 1) & 7) : (bits & 15);
+
+			if (unpack)
+			{
+				// Alpha: 3 -> 4 bits by appending a zero, then 4 -> 8 bits by replication.
+				a = (a << 1);
+				a = (a << 4) | a;
+
+				// Colors: widen to 5 bits here; the 5 -> 8 bit step below is shared with the opaque path.
+				r = (r << 1) | (r >> 3);
+				g = (g << 1) | (g >> 3);
+
+				if (is_endpoint0)
+					b = (b << 2) | (b >> 1);
+				else
+					b = (b << 1) | (b >> 3);
+			}
+		}
+
+		if (unpack)
+		{
+			// 5 -> 8 bit replication.
+			r = (r << 3) | (r >> 2);
+			g = (g << 3) | (g >> 2);
+			b = (b << 3) | (b >> 2);
+		}
+
+		assert((r < 256) && (g < 256) && (b < 256) && (a < 256));
+
+		return color_rgba(r, g, b, a);
+	}
+
+	// Decodes endpoint 0 or 1 into a common R5 G5 B5 A4 representation, whatever layout is stored.
+	// Narrower stored color fields are widened to 5 bits by bit replication; the 3-bit
+	// translucent alpha is widened to 4 bits by appending a zero bit, and opaque endpoints
+	// report alpha 15.
+	color_rgba pvrtc4_block::get_endpoint_5554(uint32_t endpoint_index) const
+	{
+		assert(endpoint_index < 2);
+
+		const uint32_t bits = m_endpoints >> (endpoint_index * 16);
+		const bool is_endpoint0 = (endpoint_index == 0);
+
+		uint32_t r, g, b, a;
+
+		if ((bits & 0x8000) != 0)
+		{
+			// Opaque: R5 G5 B4 (endpoint 0) or R5 G5 B5 (endpoint 1).
+			r = (bits >> 10) & 31;
+			g = (bits >> 5) & 31;
+
+			if (is_endpoint0)
+			{
+				const uint32_t blue4 = (bits >> 1) & 15;
+				b = (blue4 << 1) | (blue4 >> 3);
+			}
+			else
+			{
+				b = bits & 31;
+			}
+
+			a = 15;
+		}
+		else
+		{
+			// Translucent: A3 R4 G4 B3 (endpoint 0) or A3 R4 G4 B4 (endpoint 1).
+			const uint32_t alpha3 = (bits >> 12) & 7;
+			const uint32_t red4 = (bits >> 8) & 15;
+			const uint32_t green4 = (bits >> 4) & 15;
+
+			a = alpha3 << 1;
+			r = (red4 << 1) | (red4 >> 3);
+			g = (green4 << 1) | (green4 >> 3);
+
+			if (is_endpoint0)
+			{
+				const uint32_t blue3 = (bits >> 1) & 7;
+				b = (blue3 << 2) | (blue3 >> 1);
+			}
+			else
+			{
+				const uint32_t blue4 = bits & 15;
+				b = (blue4 << 1) | (blue4 >> 3);
+			}
+		}
+
+		assert((r < 32) && (g < 32) && (b < 32) && (a < 16));
+
+		return color_rgba(r, g, b, a);
+	}
+
+	// Writes the four colors selectable by the modulation value at pixel (x, y) into pColors[0..3].
+	// pColors[0] and pColors[3] are the interpolated endpoint-0 and endpoint-1 colors of the four
+	// surrounding blocks (block coordinates wrap around the image).
+	// If the pixel's block uses transparent modulation, pColors[1] and pColors[2] are both the
+	// average of the two, pColors[2] gets alpha 0, and true is returned.
+	// Otherwise pColors[1] and pColors[2] are the 5:3 and 3:5 blends and false is returned.
+	bool pvrtc4_image::get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const
+	{
+		assert((x < m_width) && (y < m_height));
+
+		// The 2x2 block neighborhood is offset by half a block relative to the pixel grid.
+		const int first_bx = (static_cast<int>(x) - 2) >> 2;
+		const int first_by = (static_cast<int>(y) - 2) >> 2;
+
+		const int bx0 = posmod(first_bx, m_block_width);
+		const int bx1 = posmod(first_bx + 1, m_block_width);
+		const int by0 = posmod(first_by, m_block_height);
+		const int by1 = posmod(first_by + 1, m_block_height);
+
+		const pvrtc4_block& blk00 = m_blocks(bx0, by0);
+		const pvrtc4_block& blk10 = m_blocks(bx1, by0);
+		const pvrtc4_block& blk01 = m_blocks(bx0, by1);
+		const pvrtc4_block& blk11 = m_blocks(bx1, by1);
+
+		pColors[0] = interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0));
+		pColors[3] = interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1));
+
+		const bool transparent_mod = get_block_uses_transparent_modulation(x >> 2, y >> 2);
+
+		for (uint32_t comp = 0; comp < 4; comp++)
+		{
+			const uint32_t lo = pColors[0][comp];
+			const uint32_t hi = pColors[3][comp];
+
+			if (transparent_mod)
+			{
+				const uint8_t mid = static_cast<uint8_t>((lo + hi) / 2);
+				pColors[1][comp] = mid;
+				pColors[2][comp] = mid;
+			}
+			else
+			{
+				pColors[1][comp] = static_cast<uint8_t>((lo * 5 + hi * 3) / 8);
+				pColors[2][comp] = static_cast<uint8_t>((lo * 3 + hi * 5) / 8);
+			}
+		}
+
+		if (transparent_mod)
+		{
+			// Selector 2 has zero alpha in this mode.
+			pColors[2][3] = 0;
+		}
+
+		return transparent_mod;
+	}
+		
+	// Returns the color pixel (x, y) would have if its modulation value were m (0..3).
+	//   m == 0: interpolated endpoint-0 color
+	//   m == 3: interpolated endpoint-1 color
+	//   m == 1 or 2, regular blocks: 5:3 and 3:5 blends of the two
+	//   m == 1 or 2, blocks using transparent modulation: the average of the two, with alpha
+	//                forced to 0 for m == 2
+	color_rgba pvrtc4_image::get_pixel(uint32_t x, uint32_t y, uint32_t m) const
+	{
+		assert((x < m_width) && (y < m_height));
+
+		// The 2x2 block neighborhood is offset by half a block relative to the pixel grid.
+		const int first_bx = (static_cast<int>(x) - 2) >> 2;
+		const int first_by = (static_cast<int>(y) - 2) >> 2;
+
+		const int bx0 = posmod(first_bx, m_block_width);
+		const int bx1 = posmod(first_bx + 1, m_block_width);
+		const int by0 = posmod(first_by, m_block_height);
+		const int by1 = posmod(first_by + 1, m_block_height);
+
+		const bool transparent_mod = get_block_uses_transparent_modulation(x >> 2, y >> 2);
+
+		const pvrtc4_block& blk00 = m_blocks(bx0, by0);
+		const pvrtc4_block& blk10 = m_blocks(bx1, by0);
+		const pvrtc4_block& blk01 = m_blocks(bx0, by1);
+		const pvrtc4_block& blk11 = m_blocks(bx1, by1);
+
+		// The two extreme selectors map straight to an interpolated endpoint in either mode.
+		if (m == 0)
+			return interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0));
+		if (m == 3)
+			return interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1));
+
+		const color_rgba lo(interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0)));
+		const color_rgba hi(interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1)));
+
+		if (transparent_mod)
+		{
+			const int alpha = (m == 2) ? 0 : (lo[3] + hi[3]) / 2;
+			return color_rgba((lo[0] + hi[0]) / 2, (lo[1] + hi[1]) / 2, (lo[2] + hi[2]) / 2, alpha);
+		}
+
+		if (m == 2)
+			return color_rgba((lo[0] * 3 + hi[0] * 5) / 8, (lo[1] * 3 + hi[1] * 5) / 8, (lo[2] * 3 + hi[2] * 5) / 8, (lo[3] * 3 + hi[3] * 5) / 8);
+
+		return color_rgba((lo[0] * 5 + hi[0] * 3) / 8, (lo[1] * 5 + hi[1] * 3) / 8, (lo[2] * 5 + hi[2] * 3) / 8, (lo[3] * 5 + hi[3] * 3) / 8);
+	}
+
+	uint64_t pvrtc4_image::local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual)
+	{
+		uint64_t initial_error = evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false);
+		if (!initial_error)
+			return initial_error;
+
+		vec3F c_avg_orig(0);
+
+		for (int y = 0; y < 7; y++)
+		{
+			const uint32_t py = wrap_y(by * 4 + y - 1);
+			for (uint32_t x = 0; x < 7; x++)
+			{
+				const uint32_t px = wrap_x(bx * 4 + x - 1);
+
+				const color_rgba& c = orig_img(px, py);
+
+				c_avg_orig[0] += c[0];
+				c_avg_orig[1] += c[1];
+				c_avg_orig[2] += c[2];
+			}
+		}
+
+		c_avg_orig *= 1.0f / 49.0f;
+
+		vec3F quant_colors[2];
+		quant_colors[0].set(c_avg_orig);
+		quant_colors[0] -= vec3F(.0125f);
+
+		quant_colors[1].set(c_avg_orig);
+		quant_colors[1] += vec3F(.0125f);
+
+		float total_weight[2];
+
+		bool success = true;
+
+		for (uint32_t pass = 0; pass < 4; pass++)
+		{
+			vec3F new_colors[2] = { vec3F(0), vec3F(0) };
+			memset(total_weight, 0, sizeof(total_weight));
+
+			static const float s_weights[7][7] =
+			{
+				{ 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f },
+				{ 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f },
+				{ 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f },
+				{ 2.242640f, 3.242640f, 4.242640f, 5.000000f, 4.242640f, 3.242640f, 2.242640f },
+				{ 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f },
+				{ 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f },
+				{ 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f }
+			};
+
+			for (int y = 0; y < 7; y++)
+			{
+				const uint32_t py = wrap_y(by * 4 + y - 1);
+				for (uint32_t x = 0; x < 7; x++)
+				{
+					const uint32_t px = wrap_x(bx * 4 + x - 1);
+
+					const color_rgba& orig_c = orig_img(px, py);
+
+					vec3F color(orig_c[0], orig_c[1], orig_c[2]);
+
+					uint32_t c = quant_colors[0].squared_distance(color) > quant_colors[1].squared_distance(color);
+
+					const float weight = s_weights[y][x];
+					new_colors[c] += color * weight;
+
+					total_weight[c] += weight;
+				}
+			}
+
+			if (!total_weight[0] || !total_weight[1])
+				success = false;
+
+			quant_colors[0] = new_colors[0] / (float)total_weight[0];
+			quant_colors[1] = new_colors[1] / (float)total_weight[1];
+		}
+
+		if (!success)
+		{
+			quant_colors[0] = c_avg_orig;
+			quant_colors[1] = c_avg_orig;
+		}
+
+		vec4F colors[2] = { quant_colors[0], quant_colors[1] };
+
+		colors[0] += vec3F(.5f);
+		colors[1] += vec3F(.5f);
+		color_rgba color_0((int)colors[0][0], (int)colors[0][1], (int)colors[0][2], 0);
+		color_rgba color_1((int)colors[1][0], (int)colors[1][1], (int)colors[1][2], 0);
+
+		pvrtc4_block cur_blocks[3][3];
+		
+		for (int y = -1; y <= 1; y++)
+		{
+			for (int x = -1; x <= 1; x++)
+			{
+				const uint32_t block_x = wrap_block_x(bx + x);
+				const uint32_t block_y = wrap_block_y(by + y);
+				cur_blocks[x + 1][y + 1] = m_blocks(block_x, block_y);
+			}
+		}
+
+		color_rgba l1(0), h1(0);
+
+		l1[0] = g_pvrtc_5_nearest[color_0[0]];
+		h1[0] = g_pvrtc_5_nearest[color_1[0]];
+
+		l1[1] = g_pvrtc_5_nearest[color_0[1]];
+		h1[1] = g_pvrtc_5_nearest[color_1[1]];
+
+		l1[2] = g_pvrtc_4_nearest[color_0[2]];
+		h1[2] = g_pvrtc_5_nearest[color_0[2]];
+
+		l1[3] = 0;
+		h1[3] = 0;
+
+		m_blocks(bx, by).set_endpoint_raw(0, l1, true);
+		m_blocks(bx, by).set_endpoint_raw(1, h1, true);
+
+		uint64_t e03_err_0 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false);
+
+		pvrtc4_block blocks0[3][3];
+		for (int y = -1; y <= 1; y++)
+		{
+			for (int x = -1; x <= 1; x++)
+			{
+				const uint32_t block_x = wrap_block_x(bx + x);
+				const uint32_t block_y = wrap_block_y(by + y);
+				blocks0[x + 1][y + 1] = m_blocks(block_x, block_y);
+			}
+		}
+
+		l1[0] = g_pvrtc_5_nearest[color_1[0]];
+		h1[0] = g_pvrtc_5_nearest[color_0[0]];
+
+		l1[1] = g_pvrtc_5_nearest[color_1[1]];
+		h1[1] = g_pvrtc_5_nearest[color_0[1]];
+
+		l1[2] = g_pvrtc_4_nearest[color_1[2]];
+		h1[2] = g_pvrtc_5_nearest[color_0[2]];
+
+		l1[3] = 0;
+		h1[3] = 0;
+
+		m_blocks(bx, by).set_endpoint_raw(0, l1, true);
+		m_blocks(bx, by).set_endpoint_raw(1, h1, true);
+
+		uint64_t e03_err_1 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false);
+
+		if (initial_error < basisu::minimum(e03_err_0, e03_err_1))
+		{
+			for (int y = -1; y <= 1; y++)
+			{
+				for (int x = -1; x <= 1; x++)
+				{
+					const uint32_t block_x = wrap_block_x(bx + x);
+					const uint32_t block_y = wrap_block_y(by + y);
+					m_blocks(block_x, block_y) = cur_blocks[x + 1][y + 1];
+				}
+			}
+			return initial_error;
+		}
+		else if (e03_err_0 < e03_err_1)
+		{
+			for (int y = -1; y <= 1; y++)
+			{
+				for (int x = -1; x <= 1; x++)
+				{
+					const uint32_t block_x = wrap_block_x(bx + x);
+					const uint32_t block_y = wrap_block_y(by + y);
+					m_blocks(block_x, block_y) = blocks0[x + 1][y + 1];
+				}
+			}
+			assert(e03_err_0 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false));
+			return e03_err_0;
+		}
+
+		assert(e03_err_1 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false));
+		return e03_err_1;
+	}
+
+} // basisu
--- a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
+++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
@ -1,5 +1,5 @@
 // basisu_pvrtc1_4.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -87,6 +87,14 @@ namespace basisu
 			return (m_modulation >> ((y * 4 + x) * 2)) & 3;
 		}

+		// Stores the 2-bit modulation value s for texel (x, y) of this 4x4 block.
+		inline void set_modulation(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < 4) && (y < 4) && (s < 4));
+
+			// Every texel owns two consecutive bits, laid out row by row.
+			const uint32_t bit_ofs = 2 * (x + 4 * y);
+			const uint32_t field_mask = 3U << bit_ofs;
+
+			m_modulation = (m_modulation & ~field_mask) | (s << bit_ofs);
+
+			assert(get_modulation(x, y) == s);
+		}
+
 		// Scaled by 8
 		inline const uint32_t* get_scaled_modulation_values(bool block_uses_transparent_modulation) const
 		{
@ -107,7 +115,7 @@ namespace basisu
 		}

 		// opaque endpoints:	554, 555
-		// transparent endpoints: 3443 or 3444
+		// transparent endpoints: 3443, 3444
 		inline void set_endpoint_raw(uint32_t endpoint_index, const color_rgba& c, bool opaque_endpoint)
 		{
 			assert(endpoint_index < 2);
@ -352,7 +360,93 @@ namespace basisu

 			return result;
 		}
-						
+
+		// Sets the modulation value of pixel (x, y), given in image coordinates, by forwarding
+		// to the 4x4 block that contains it.
+		inline void set_modulation(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < m_width) && (y < m_height));
+
+			const uint32_t block_x = x >> 2, block_y = y >> 2;
+			const uint32_t texel_x = x & 3, texel_y = y & 3;
+
+			m_blocks(block_x, block_y).set_modulation(texel_x, texel_y, s);
+		}
+
+		// Finds which of the four colors available at pixel (x, y) is closest to c.
+		// When record is true the winning index is stored as the pixel's modulation value.
+		// Returns the distance of the winning color; the lowest index wins ties.
+		inline uint64_t map_pixel(uint32_t x, uint32_t y, const color_rgba& c, bool perceptual, bool alpha_is_significant, bool record = true)
+		{
+			color_rgba candidates[4];
+			get_interpolated_colors(x, y, candidates);
+
+			uint32_t winner = 0;
+			uint64_t lowest_dist = color_distance(perceptual, c, candidates[0], alpha_is_significant);
+
+			uint32_t sel = 1;
+			while (sel < 4)
+			{
+				const uint64_t d = color_distance(perceptual, c, candidates[sel], alpha_is_significant);
+				if (d < lowest_dist)
+				{
+					winner = sel;
+					lowest_dist = d;
+				}
+				++sel;
+			}
+
+			if (record)
+				set_modulation(x, y, winner);
+
+			return lowest_dist;
+		}
+
+		// Re-selects the modulation value of every pixel in the 7x7 window centered on pixel
+		// (bx * 4 + 2, by * 4 + 2), wrapping at the image borders, so each matches orig_img as
+		// closely as possible. Returns the summed distance of the chosen colors.
+		inline uint64_t remap_pixels_influenced_by_endpoint(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant)
+		{
+			const int center_x = (int)bx * 4 + 2;
+			const int center_y = (int)by * 4 + 2;
+
+			uint64_t error_sum = 0;
+
+			for (int dy = -3; dy <= 3; dy++)
+			{
+				const int y = wrap_y(center_y + dy);
+
+				for (int dx = -3; dx <= 3; dx++)
+				{
+					const int x = wrap_x(center_x + dx);
+
+					error_sum += map_pixel(x, y, orig_img(x, y), perceptual, alpha_is_significant);
+				}
+			}
+
+			return error_sum;
+		}
+
+		// Sums the distance between the currently decoded pixels and orig_img over the 7x7
+		// window centered on pixel (bx * 4 + 2, by * 4 + 2), wrapping at the image borders.
+		// A non-zero threshold_error enables an early out: the partial sum is returned as soon
+		// as it reaches the threshold.
+		inline uint64_t evaluate_1x1_endpoint_error(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant, uint64_t threshold_error = 0) const
+		{
+			const int center_x = (int)bx * 4 + 2;
+			const int center_y = (int)by * 4 + 2;
+			const bool use_threshold = (threshold_error != 0);
+
+			uint64_t error_sum = 0;
+
+			for (int dy = -3; dy <= 3; dy++)
+			{
+				const int y = wrap_y(center_y + dy);
+
+				for (int dx = -3; dx <= 3; dx++)
+				{
+					const int x = wrap_x(center_x + dx);
+
+					error_sum += color_distance(perceptual, get_pixel(x, y), orig_img(x, y), alpha_is_significant);
+
+					if (use_threshold && (error_sum >= threshold_error))
+						return error_sum;
+				}
+			}
+
+			return error_sum;
+		}
+
+		uint64_t local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual);
+
+		// Chooses and records the best modulation value for every pixel of the image against
+		// img, which must have the same dimensions. Returns the summed distance of all pixels.
+		inline uint64_t map_all_pixels(const image& img, bool perceptual, bool alpha_is_significant)
+		{
+			assert(m_width == img.get_width());
+			assert(m_height == img.get_height());
+
+			const uint32_t num_rows = img.get_height();
+			const uint32_t num_cols = img.get_width();
+
+			uint64_t error_sum = 0;
+			for (uint32_t row = 0; row < num_rows; row++)
+			{
+				for (uint32_t col = 0; col < num_cols; col++)
+				{
+					error_sum += map_pixel(col, row, img(col, row), perceptual, alpha_is_significant);
+				}
+			}
+
+			return error_sum;
+		}
+	
+	public:						
 		uint32_t m_width, m_height;
 		pvrtc4_block_vector2D m_blocks;
 		uint32_t m_block_width, m_block_height;
--- a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
@ -1,5 +1,5 @@
 // basisu_resampler_filters.cpp
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -283,7 +283,7 @@ namespace basisu
 		return sum;
 	}

-	static const float KAISER_ALPHA = 4.0;
+	//static const float KAISER_ALPHA = 4.0;
 	static double kaiser(double alpha, double half_width, double x)
 	{
 		const double ratio = (x / half_width);
@ -310,10 +310,22 @@ namespace basisu

 	const resample_filter g_resample_filters[] =
 	{
-		 { "box", box_filter, BOX_FILTER_SUPPORT }, { "tent", tent_filter, TENT_FILTER_SUPPORT }, { "bell", bell_filter, BELL_SUPPORT }, { "b-spline", B_spline_filter, B_SPLINE_SUPPORT },
-		 { "mitchell", mitchell_filter, MITCHELL_SUPPORT }, { "lanczos3", lanczos3_filter, LANCZOS3_SUPPORT }, { "blackman", blackman_filter, BLACKMAN_SUPPORT }, { "lanczos4", lanczos4_filter, LANCZOS4_SUPPORT },
-		 { "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, { "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, { "kaiser", kaiser_filter, KAISER_SUPPORT }, { "gaussian", gaussian_filter, GAUSSIAN_SUPPORT },
-		 { "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, { "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, { "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, { "quadratic_mix", quadratic_mix_filter, QUADRATIC_SUPPORT },
+		{ "box", box_filter, BOX_FILTER_SUPPORT }, 
+		{ "tent", tent_filter, TENT_FILTER_SUPPORT }, 
+		{ "bell", bell_filter, BELL_SUPPORT }, 
+		{ "b-spline", B_spline_filter, B_SPLINE_SUPPORT },
+		{ "mitchell", mitchell_filter, MITCHELL_SUPPORT }, 
+		{ "blackman", blackman_filter, BLACKMAN_SUPPORT }, 
+		{ "lanczos3", lanczos3_filter, LANCZOS3_SUPPORT },
+		{ "lanczos4", lanczos4_filter, LANCZOS4_SUPPORT },
+		{ "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, 
+		{ "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, 
+		{ "kaiser", kaiser_filter, KAISER_SUPPORT }, 
+		{ "gaussian", gaussian_filter, GAUSSIAN_SUPPORT },
+		{ "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, 
+		{ "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, 
+		{ "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, 
+		{ "quadratic_mix", quadratic_mix_filter, QUADRATIC_SUPPORT },
 	};

 	const int g_num_resample_filters = BASISU_ARRAY_SIZE(g_resample_filters);
--- a/thirdparty/basis_universal/encoder/basisu_resampler.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_resampler.cpp
--- a/thirdparty/basis_universal/encoder/basisu_resampler.h
+++ b/thirdparty/basis_universal/encoder/basisu_resampler.h
@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include "transcoder/basisu.h"
+#include "../transcoder/basisu.h"

 #define BASISU_RESAMPLER_DEBUG_OPS (0)
 #define BASISU_RESAMPLER_DEFAULT_FILTER "lanczos4"
--- a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
+++ b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
@ -14,7 +14,7 @@
 // limitations under the License.
 #pragma once

-#include "transcoder/basisu.h"
+#include "../transcoder/basisu.h"

 namespace basisu
 {
--- a/thirdparty/basis_universal/encoder/basisu_ssim.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_ssim.cpp
--- a/thirdparty/basis_universal/encoder/basisu_ssim.h
+++ b/thirdparty/basis_universal/encoder/basisu_ssim.h
--- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
--- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
@ -0,0 +1,140 @@
+// basisu_uastc_enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_etc.h"
+
+#include "../transcoder/basisu_transcoder_uastc.h"
+
+namespace basisu
+{
+	// Number of cPackUASTCLevel* effort levels declared in the enum below (values 0 through 4).
+	const uint32_t TOTAL_PACK_UASTC_LEVELS = 5;
+
+	// Values for the `flags` argument of encode_uastc() and uastc_rdo().
+	// The cPackUASTCLevel* entries select an effort level; the remaining entries are option bits.
+	enum
+	{
+		// Fastest is the lowest quality, although it's still substantially higher quality vs. BC1/ETC1. It supports 5 modes.
+		// The output may be somewhat blocky because this setting doesn't support 2/3-subset UASTC modes, but it should be less blocky vs. BC1/ETC1.
+		// This setting doesn't write BC1 hints, so BC1 transcoding will be slower. 
+		// Transcoded ETC1 quality will be lower because it only considers 2 hints out of 32.
+		// Avg. 43.45 dB
+		cPackUASTCLevelFastest = 0,
+		
+		// Faster is ~3x slower than fastest. It supports 9 modes.
+		// Avg. 46.49 dB
+		cPackUASTCLevelFaster = 1,
+		
+		// Default is ~5.5x slower than fastest. It supports 14 modes.
+		// Avg. 47.47 dB
+		cPackUASTCLevelDefault = 2,
+
+		// Slower is ~14.5x slower than fastest. It supports all 18 modes.
+		// Avg. 48.01 dB
+		cPackUASTCLevelSlower = 3,
+
+		// VerySlow is ~200x slower than fastest. 
+		// The best quality the codec is capable of, but you'll need to be patient or have a lot of cores.
+		// Avg. 48.24 dB
+		cPackUASTCLevelVerySlow = 4,
+
+		// Mask covering the level bits.
+		// NOTE(review): 0xF also covers bit 3, which is the value of cPackUASTCFavorUASTCError (8) below - confirm how callers extract the level.
+		cPackUASTCLevelMask = 0xF,
+
+		// By default the encoder tries to strike a balance between UASTC and transcoded BC7 quality.
+		// These flags allow you to favor only optimizing for lowest UASTC error, or lowest BC7 error.
+		cPackUASTCFavorUASTCError = 8,
+		cPackUASTCFavorBC7Error = 16,
+						
+		// ETC1 hint options (presumably trade ETC1 transcode quality for encoding speed - confirm against the encoder implementation).
+		cPackUASTCETC1FasterHints = 64,
+		cPackUASTCETC1FastestHints = 128,
+		cPackUASTCETC1DisableFlipAndIndividual = 256,
+		
+		// Favor UASTC modes 0 and 10 more than the others (this is experimental, it's useful for RDO compression)
+		cPackUASTCFavorSimplerModes = 512, 
+	};
+
+	// Encodes one 4x4 block of pixels to UASTC.
+	// pRGBAPixels: Pointer to source 4x4 block of RGBA pixels (R first in memory).
+	// output_block: Reference to destination UASTC block.
+	// flags: cPackUASTC* value(s) from the enum above (defaults to cPackUASTCLevelDefault). Controls the compression speed vs. quality tradeoff.
+	void encode_uastc(const uint8_t* pRGBAPixels, basist::uastc_block& output_block, uint32_t flags = cPackUASTCLevelDefault);
+
+	// Plain aggregate describing one encoded block; consumed by pack_uastc() below.
+	// NOTE(review): the field meanings below are inferred from their names and types - confirm against the encoder implementation.
+	struct uastc_encode_results
+	{
+		uint32_t m_uastc_mode;				// presumably the selected UASTC mode index
+		uint32_t m_common_pattern;			// presumably the partition pattern index for multi-subset modes
+		basist::astc_block_desc m_astc;		// ASTC block description for the result
+		color_rgba m_solid_color;			// presumably only meaningful for solid-color blocks
+		uint64_t m_astc_err;				// presumably the error metric of this candidate
+	};
+			  
+	// Writes `blk` from an encode result plus the ETC1 block/bias, EAC A8 block and the two BC1 hint flags.
+	// NOTE(review): presumably these extra inputs are the transcode hints stored in the UASTC block - confirm in basisu_uastc_enc.cpp.
+	void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1);
+
+	// Default values used by uastc_rdo_params::clear().
+	// NOTE: this identifier is spelled "UASCT" (sic); it is left as-is because it is part of the public header.
+	const uint32_t UASCT_RDO_DEFAULT_LZ_DICT_SIZE = 4096;
+
+	const float UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO = 10.0f;
+	const float UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH = 8.0f;
+	
+	// The RDO encoder computes a smoothness factor, from [0,1], for each block. To do this it computes each block's maximum component variance, then it divides this by this factor and clamps the result.
+	// Larger values will result in more blocks being protected from too much distortion.
+	const float UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV = 18.0f;
+	
+	// The RDO encoder can artificially boost the error of smooth blocks, in order to suppress distortions on smooth areas of the texture.
+	// The encoder will use this value as the maximum error scale to use on smooth blocks. The larger this value, the better smooth blocks will look. Set to 1.0 to disable this completely.
+	const float UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE = 10.0f;
+
+	// Tuning knobs for the UASTC RDO post-processor (passed to uastc_rdo()).
+	struct uastc_rdo_params
+	{
+		// A freshly constructed object holds the defaults.
+		uastc_rdo_params() { clear(); }
+
+		// Resets every field to its default value.
+		void clear()
+		{
+			// Rate side of the rate/distortion tradeoff.
+			m_lambda = 0.5f;
+			m_lz_dict_size = UASCT_RDO_DEFAULT_LZ_DICT_SIZE;
+			m_lz_literal_cost = 100;
+
+			// Per-block distortion limits.
+			m_skip_block_rms_thresh = UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH;
+			m_max_allowed_rms_increase_ratio = UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO;
+			m_endpoint_refinement = true;
+
+			// Smooth-block protection.
+			m_smooth_block_max_error_scale = UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE;
+			m_max_smooth_block_std_dev = UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV;
+		}
+
+		// Size in bytes of the LZ dictionary to simulate. Bigger is slower, but gives higher quality per LZ compressed bit.
+		uint32_t m_lz_dict_size;
+
+		// The post-processor minimizes distortion + rate * m_lambda (rate is approximate LZ bits, distortion is scaled MS error).
+		// Larger values favor lower rate, smaller values favor lower distortion. 0 = minimal distortion.
+		float m_lambda;
+
+		// How far a block's RMS error may grow before a trial is rejected: 1.0 = no increase allowed, 1.05 = 5% increase allowed, etc.
+		float m_max_allowed_rms_increase_ratio;
+
+		// Blocks with at least this much RMS error are skipped entirely by the RDO encoder.
+		float m_skip_block_rms_thresh;
+
+		// When true, the post-process tries to refine the endpoints of blocks whose selectors were modified.
+		bool m_endpoint_refinement;
+
+		// Smooth-block controls; the matching UASTC_RDO_DEFAULT_* constants document their meaning.
+		float m_max_smooth_block_std_dev;
+		float m_smooth_block_max_error_scale;
+
+		// NOTE(review): presumably the cost charged for an LZ literal in the rate estimate - confirm against uastc_rdo().
+		uint32_t m_lz_literal_cost;
+	};
+
+	// Runs the RDO post-processor over an array of already encoded UASTC blocks.
+	// num_blocks, pBlocks: Number of blocks and pointer to UASTC blocks to process.
+	// pBlock_pixels: Pointer to an array of 4x4 blocks containing the original texture pixels. This is NOT a raster image, but a pointer to individual 4x4 blocks.
+	// params: RDO tuning parameters (see uastc_rdo_params).
+	// flags: Pass in the same flags used to encode the UASTC blocks. The flags are used to reencode the transcode hints in the same way.
+	// pJob_pool, total_jobs: Optional job pool and job count. NOTE(review): presumably nullptr/0 means single-threaded - confirm in the implementation.
+	// Returns a bool. NOTE(review): presumably false on failure - confirm in the implementation.
+	bool uastc_rdo(uint32_t num_blocks, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params &params, uint32_t flags = cPackUASTCLevelDefault, job_pool* pJob_pool = nullptr, uint32_t total_jobs = 0);
+} // namespace basisu
--- a/thirdparty/basis_universal/encoder/cppspmd_flow.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_flow.h
@ -0,0 +1,590 @@
+// Do not include this header directly.
+// Control flow functionality in common between all the headers.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _DEBUG
+// Debug-only invariant: no lane may be set in m_exec while it is clear in m_kernel_exec.
+CPPSPMD_FORCE_INLINE void spmd_kernel::check_masks()
+{
+	const exec_mask stray_lanes = andnot(m_kernel_exec, m_exec);
+	assert(!any(stray_lanes));
+}
+#endif
+
+// Turns off every lane in the current execution mask. Must be called from inside an SPMD loop (asserted in debug builds).
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_break()
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	const exec_mask no_lanes = exec_mask::all_off();
+	m_exec = no_lanes;
+}
+
+// Parks the active lanes until the end of the current loop iteration. Must be called from inside an SPMD loop (asserted in debug builds).
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_continue()
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	// Record the lanes that are active right now; the loop construct ORs m_continue_mask back into m_exec after the body.
+	const exec_mask parked_lanes = m_continue_mask | m_exec;
+	m_continue_mask = parked_lanes;
+
+	// Nothing runs for the remainder of this iteration.
+	m_exec = exec_mask::all_off();
+}
+
+// Permanently retires the active lanes: they are removed from the kernel-wide mask and from the current execution mask.
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_return()
+{
+	const exec_mask surviving_lanes = andnot(m_exec, m_kernel_exec);
+	m_kernel_exec = surviving_lanes;
+
+	m_exec = exec_mask::all_off();
+}
+			
+// Runs unmaskedBody with every lane enabled, then intersects both masks with their previous values,
+// so lanes that were off before stay off and lanes the body disabled stay disabled.
+template<typename UnmaskedBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_unmasked(const UnmaskedBody& unmaskedBody)
+{
+	const exec_mask saved_exec = m_exec;
+	const exec_mask saved_kernel_exec = m_kernel_exec;
+
+	m_exec = exec_mask::all_on();
+	m_kernel_exec = exec_mask::all_on();
+
+	unmaskedBody();
+
+	m_exec = m_exec & saved_exec;
+	m_kernel_exec = m_kernel_exec & saved_kernel_exec;
+
+	check_masks();
+}
+
+// Scope guard used by SPMD_UNMASKED_BEGIN: enables all lanes on construction and, on destruction,
+// intersects the kernel's masks with the values they had when the guard was created.
+struct scoped_unmasked_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_kernel_exec;
+
+	CPPSPMD_FORCE_INLINE scoped_unmasked_restorer(spmd_kernel *pKernel) :
+		m_pKernel(pKernel),
+		m_orig_exec(pKernel->m_exec),
+		m_orig_kernel_exec(pKernel->m_kernel_exec)
+	{
+		m_pKernel->m_exec = exec_mask::all_on();
+		m_pKernel->m_kernel_exec = exec_mask::all_on();
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_unmasked_restorer()
+	{
+		spmd_kernel &kernel = *m_pKernel;
+
+		kernel.m_exec = kernel.m_exec & m_orig_exec;
+		kernel.m_kernel_exec = kernel.m_kernel_exec & m_orig_kernel_exec;
+		kernel.check_masks();
+	}
+};
+
+// Block form of spmd_unmasked(): code between SPMD_UNMASKED_BEGIN and SPMD_UNMASKED_END runs with all lanes enabled.
+// A scoped_unmasked_restorer constructed on `this` does the mask bookkeeping, so these can only be used inside spmd_kernel member functions.
+#define SPMD_UNMASKED_BEGIN { scoped_unmasked_restorer _unmasked_restorer(this); 
+#define SPMD_UNMASKED_END }
+
+// Calls another SPMD kernel type from inside this kernel: a SPMDKernel is default-constructed,
+// initialized with the caller's current execution mask, and its _call() is invoked with the forwarded arguments.
+// The disabled variant returns _call()'s result (decltype(auto)); the active variant discards it.
+#if 0
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE decltype(auto) spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	return kernel._call(std::forward<Args>(args)...);
+}
+#else
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	kernel._call(std::forward<Args>(args)...);
+}
+#endif
+
+// Conditional break: turns off the active lanes for which cond is true. Must be called from inside an SPMD loop (asserted in debug builds).
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if_break(const vbool& cond)
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	exec_mask cond_lanes(cond);
+
+	// Active lanes that satisfy the condition are the ones that leave.
+	const exec_mask breaking_lanes = m_exec & cond_lanes;
+	m_exec = andnot(breaking_lanes, m_exec);
+
+	check_masks();
+}
+
+// "Simple" if: runs ifBody on the active lanes where cond is true, then puts m_exec back exactly as it was.
+// Because the mask is restored unconditionally, SPMD breaks, continues, etc. are not allowed inside the body.
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sif(const vbool& cond, const IfBody& ifBody)
+{
+	const exec_mask taken_lanes = m_exec & exec_mask(cond);
+
+	// Nothing to do when no active lane satisfies the condition.
+	if (!any(taken_lanes))
+		return;
+
+	const exec_mask saved_exec = m_exec;
+
+	m_exec = taken_lanes;
+	ifBody();
+	m_exec = saved_exec;
+}
+
+// "Simple" if/else: runs ifBody on the active lanes where cond is true and elseBody on the active lanes where it is false,
+// then puts m_exec back exactly as it was. SPMD breaks, continues, etc. are not allowed inside either body.
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody)
+{
+	const exec_mask saved_exec = m_exec;
+
+	// "then" half
+	const exec_mask then_lanes = saved_exec & exec_mask(cond);
+	if (any(then_lanes))
+	{
+		m_exec = then_lanes;
+		ifBody();
+	}
+
+	// "else" half, always derived from the mask seen on entry
+	const exec_mask else_lanes = saved_exec & exec_mask(!cond);
+	if (any(else_lanes))
+	{
+		m_exec = else_lanes;
+		elseBody();
+	}
+
+	m_exec = saved_exec;
+}
+
+// Full SPMD if: runs ifBody on the active lanes where cond is true. Unlike spmd_sif(), lanes the body
+// switches off remain off afterwards; lanes that skipped the body are switched back on.
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if(const vbool& cond, const IfBody& ifBody)
+{
+	exec_mask cond_lanes(cond);
+
+	const exec_mask taken_lanes = cond_lanes & m_exec;
+	if (!any(taken_lanes))
+		return;
+
+	// Active lanes that do not enter the body.
+	const exec_mask skipped_lanes = andnot(cond_lanes, m_exec);
+
+	m_exec = taken_lanes;
+	ifBody();
+
+	// Keep whatever the body left enabled and re-enable the lanes that never ran it.
+	m_exec = m_exec | skipped_lanes;
+
+	check_masks();
+}
+
+// Full SPMD if/else: runs ifBody on the active lanes where cond is true and elseBody on the active lanes
+// where cond is false. Lanes switched off inside either body remain off afterwards.
+// The else half is skipped entirely when every active lane took the if half.
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody)
+{
+	bool all_flag = false;
+
+	exec_mask cond_exec(cond);
+		
+	{
+		exec_mask pre_if_exec = cond_exec & m_exec;
+
+		int mask = pre_if_exec.get_movemask();
+		if (mask != 0)
+		{
+			// True when every active lane satisfies the condition, i.e. there is nothing left for the else half.
+			all_flag = ((uint32_t)mask == m_exec.get_movemask());
+
+			exec_mask unexecuted_lanes = andnot(cond_exec, m_exec);
+			m_exec = pre_if_exec;
+
+			ifBody();
+
+			// Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+
+	if (!all_flag)
+	{
+		exec_mask pre_if_exec = andnot(cond_exec, m_exec);
+
+		if (any(pre_if_exec))
+		{
+			exec_mask unexecuted_lanes = cond_exec & m_exec;
+			m_exec = pre_if_exec;
+
+			// This is the else half, so it must run elseBody (previously ifBody was invoked a second time and elseBody was never called).
+			elseBody();
+
+			// Propagate any lanes that got disabled inside the else body into the exec mask outside the else body, but turn on any lanes that didn't execute inside the else body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+}
+
+// Scope guard that snapshots an exec_mask on construction and writes the snapshot back on destruction.
+// The SPMD_S* macros read m_prev_mask directly, so the member names are part of its interface.
+struct scoped_exec_restorer
+{
+	exec_mask *m_pMask;
+	exec_mask m_prev_mask;
+
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer(exec_mask *pExec_mask) :
+		m_pMask(pExec_mask),
+		m_prev_mask(*pExec_mask)
+	{
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer()
+	{
+		*m_pMask = m_prev_mask;
+	}
+};
+
+// Cannot use SPMD break, continue, or return inside "simple" if/else
+// Usage: SPMD_SIF(cond) ... [SPMD_SELSE(cond) ...] SPMD_SENDIF
+// SPMD_SIF computes the active lanes where cond is true and enters the block only if there are any;
+// inside the block a scoped_exec_restorer puts m_exec back when the block closes.
+// The CPPSPMD_GLUER2(..., __LINE__) names presumably make each expansion's temporaries unique per source line.
+#define SPMD_SIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+// Closes the SPMD_SIF block and opens the block for the active lanes where cond is false (the macro negates cond itself).
+#define SPMD_SELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+// Closes the last open SPMD_SIF/SPMD_SELSE block.
+#define SPMD_SENDIF }
+
+// Same as SPMD_SIF, except doesn't use a scoped object
+// m_exec is saved in a local named _orig_exec and restored explicitly by SPMD_SELSE2 / SPMD_SEND_IF2.
+#define SPMD_SIF2(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE2(cond) m_exec = _orig_exec; } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+// Note the spelling: this closer is SPMD_SEND_IF2, while the scoped variant's closer is SPMD_SENDIF.
+#define SPMD_SEND_IF2 m_exec = _orig_exec; }
+
+// Same as SPMD_SIF(), except the if/else blocks are always executed
+// (there is no any() test, so the block runs even when its lane mask is empty).
+#define SPMD_SAIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAENDIF }
+
+// Cannot use SPMD break, continue, or return inside sselect
+// Usage: SPMD_SSELECT(var) { SPMD_SCASE(v) ... SPMD_SCASE_END }* [SPMD_SDEFAULT ... SPMD_SDEFAULT_END] SPMD_SSELECT_END
+// SPMD_SSELECT opens a do { ... } while(0) scope that copies var, saves m_exec in a scoped_exec_restorer named _orig_exec,
+// and tracks the lanes already handled in _select_executed.
+#define SPMD_SSELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+// Runs the case body on the originally active lanes whose value equals `value`, if there are any.
+#define SPMD_SCASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); _select_executed = _select_executed | m_exec;
+
+//#define SPMD_SCASE_END			if (_select_executed.get_movemask() == _orig_exec.m_prev_mask.get_movemask()) break; }
+// Leaves the whole select early (break out of the do/while) once the handled lanes equal the originally active lanes.
+#define SPMD_SCASE_END			if (!any(_select_executed ^ _orig_exec.m_prev_mask)) break; }
+// Runs the default body on the originally active lanes that no case has handled, if there are any.
+#define SPMD_SDEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); if (any(_all_other_lanes)) { m_exec = _all_other_lanes;
+#define SPMD_SDEFAULT_END		}
+#define SPMD_SSELECT_END		} while(0);
+
+// Same as SPMD_SSELECT, except all cases are executed.
+// Cannot use SPMD break, continue, or return inside sselect
+// (the case and default blocks have no any() test and there is no early exit, so each body runs even with an empty lane mask).
+#define SPMD_SASELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+
+#define SPMD_SACASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); { m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); \
+	_select_executed = _select_executed | m_exec;
+
+#define SPMD_SACASE_END			}
+#define SPMD_SADEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); { m_exec = _all_other_lanes;
+#define SPMD_SADEFAULT_END		}
+#define SPMD_SASELECT_END		} while(0);
+
+// Scope guard behind SPMD_IF/SPMD_ELSE: on construction it narrows the kernel's m_exec to the lanes where cond holds
+// and remembers the active lanes that were excluded; on destruction it ORs those excluded lanes back in,
+// so lanes disabled inside the guarded block stay disabled.
+struct scoped_exec_restorer2
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_unexecuted_lanes;
+
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer2(spmd_kernel *pKernel, const vbool &cond) :
+		m_pKernel(pKernel)
+	{
+		exec_mask cond_lanes(cond);
+
+		m_unexecuted_lanes = andnot(cond_lanes, pKernel->m_exec);
+
+		const exec_mask taken_lanes = cond_lanes & pKernel->m_exec;
+		pKernel->m_exec = taken_lanes;
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer2()
+	{
+		const exec_mask rejoined = m_pKernel->m_exec | m_unexecuted_lanes;
+		m_pKernel->m_exec = rejoined;
+
+		m_pKernel->check_masks();
+	}
+};
+
+// Block form of the full SPMD if/else. Usage: SPMD_IF(cond) ... [SPMD_ELSE(cond) ...] SPMD_END_IF
+// Each half constructs a scoped_exec_restorer2 on `this` and runs only when at least one lane is enabled.
+// SPMD_ELSE negates cond itself.
+#define SPMD_IF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); if (any(m_exec)) {
+#define SPMD_ELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); if (any(m_exec)) {
+#define SPMD_END_IF } }
+
+// Same as SPMD_IF, except the conditional block is always executed.
+#define SPMD_AIF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); {
+#define SPMD_AELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); {
+#define SPMD_AEND_IF } }
+
+// Scope guard that snapshots a kernel's execution mask, kernel mask and continue mask (plus the debug
+// in-loop flag) on construction and writes all of them back on destruction. Used by SPMD_BEGIN_CALL*.
+class scoped_exec_saver
+{
+	exec_mask m_exec, m_kernel_exec, m_continue_mask;
+	spmd_kernel *m_pKernel;
+#ifdef _DEBUG
+	bool m_in_loop;
+#endif
+
+public:
+	inline scoped_exec_saver(spmd_kernel *pKernel) :
+		m_exec(pKernel->m_exec),
+		m_kernel_exec(pKernel->m_kernel_exec),
+		m_continue_mask(pKernel->m_continue_mask),
+		m_pKernel(pKernel)
+	{
+#ifdef _DEBUG
+		m_in_loop = pKernel->m_in_loop;
+#endif
+	}
+
+	inline ~scoped_exec_saver()
+	{
+		spmd_kernel &kernel = *m_pKernel;
+
+		kernel.m_kernel_exec = m_kernel_exec;
+		kernel.m_continue_mask = m_continue_mask;
+		kernel.m_exec = m_exec;
+#ifdef _DEBUG
+		kernel.m_in_loop = m_in_loop;
+		kernel.check_masks();
+#endif
+	}
+};
+
+// Place at the top of a kernel member function: a scoped_exec_saver restores the caller's masks when the function's scope ends,
+// and the continue mask is cleared for the callee. The _ALL_LANES variant additionally turns on every lane in m_exec.
+#define SPMD_BEGIN_CALL scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_continue_mask = exec_mask::all_off();
+#define SPMD_BEGIN_CALL_ALL_LANES scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_exec = exec_mask::all_on(); m_continue_mask = exec_mask::all_off();
+
+template<typename ForeachBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_foreach(int begin, int end, const ForeachBody& foreachBody)
+{
+	if (begin == end)
+		return;
+	
+	if (!any(m_exec))
+		return;
+
+	// We don't support iterating backwards.
+	if (begin > end)
+		std::swap(begin, end);
+
+	exec_mask prev_continue_mask = m_continue_mask, prev_exec = m_exec;
+	
+	int total_full = (end - begin) / PROGRAM_COUNT;
+	int total_partial = (end - begin) % PROGRAM_COUNT;
+
+	lint_t loop_index = begin + program_index;
+	
+	const int total_loops = total_full + (total_partial ? 1 : 0);
+
+	m_continue_mask = exec_mask::all_off();
+
+	for (int i = 0; i < total_loops; i++)
+	{
+		int n = PROGRAM_COUNT;
+		if ((i == (total_loops - 1)) && (total_partial))
+		{
+			exec_mask partial_mask = exec_mask(vint_t(total_partial) > vint_t(program_index));
+			m_exec = m_exec & partial_mask;
+			n = total_partial;
+		}
+
+		foreachBody(loop_index, n);
+
+		m_exec = m_exec | m_continue_mask;
+		if (!any(m_exec))
+			break;
+
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+				
+		store_all(loop_index, loop_index + PROGRAM_COUNT);
+	}
+
+	m_exec = prev_exec & m_kernel_exec;
+	m_continue_mask = prev_continue_mask;
+	check_masks();
+}
+
+// Lambda-based SPMD while loop. Each pass narrows m_exec to the lanes for which whileCondBody() holds
+// and runs whileBody until no lane remains. On exit m_exec is the entry mask minus any lanes that were
+// removed from m_kernel_exec inside the loop, and the caller's continue mask is restored.
+template<typename WhileCondBody, typename WhileBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody)
+{
+	const exec_mask entry_exec = m_exec;
+	const exec_mask outer_continue_mask = m_continue_mask;
+
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	const bool was_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	for (;;)
+	{
+		// Lanes whose condition fails drop out for the rest of the loop.
+		const exec_mask still_looping = exec_mask(whileCondBody());
+		m_exec = m_exec & still_looping;
+
+		if (!any(m_exec))
+			break;
+
+		whileBody();
+
+		// Lanes parked by spmd_continue() rejoin before the next condition test.
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+	}
+
+#ifdef _DEBUG
+	m_in_loop = was_in_loop;
+#endif
+
+	m_continue_mask = outer_continue_mask;
+	m_exec = entry_exec & m_kernel_exec;
+	check_masks();
+}
+
+// Scope guard used by the SPMD_WHILE / SPMD_FOR / SPMD_FOREACH* macros.
+// On construction it saves the kernel's execution and continue masks, clears the continue mask for the new loop
+// and (debug builds) marks the kernel as being inside a loop. On destruction it restores the continue mask and
+// sets m_exec to the saved mask minus any lanes removed from m_kernel_exec inside the loop.
+struct scoped_while_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_continue_mask;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+				
+	CPPSPMD_FORCE_INLINE scoped_while_restorer(spmd_kernel *pKernel) : 
+		m_pKernel(pKernel), 
+		m_orig_exec(pKernel->m_exec),
+		m_orig_continue_mask(pKernel->m_continue_mask)
+	{
+		// exec_mask::all_off() is a factory that returns a mask; the result has to be assigned.
+		// (Calling it as m_continue_mask.all_off() discarded the result and left stale continue lanes set.)
+		pKernel->m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_while_restorer() 
+	{ 
+		m_pKernel->m_exec = m_orig_exec & m_pKernel->m_kernel_exec;
+		m_pKernel->m_continue_mask = m_orig_continue_mask;
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// Block form of spmd_while(). Usage: SPMD_WHILE(cond) ... SPMD_WEND
+// A scoped_while_restorer handles the save/restore of the masks; each pass narrows m_exec by cond and
+// leaves the loop once no lane is active. SPMD_WEND re-enables lanes parked by spmd_continue().
+#undef SPMD_WHILE
+#undef SPMD_WEND
+#define SPMD_WHILE(cond) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+
+#define SPMD_WEND m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); } }
+
+// Nesting is not supported (although it will compile, but the results won't make much sense).
+// Block form of spmd_foreach(). Usage: SPMD_FOREACH(loop_var, begin, end) ... SPMD_FOREACH_END(loop_var)
+// Declares the fixed-name locals b, e, total_full, total_partial and total_loops, plus the lint_t loop_var.
+// The bounds are converted to uint32_t and swapped if begin > end; a final partial iteration masks off the excess lanes.
+#define SPMD_FOREACH(loop_var, bi, ei) if (((bi) != (ei)) && (any(m_exec))) { \
+	scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	uint32_t b = (uint32_t)(bi), e = (uint32_t)(ei); if ((b) > (e)) { std::swap(b, e); } const uint32_t total_full = ((e) - (b)) >> PROGRAM_COUNT_SHIFT, total_partial = ((e) - (b)) & (PROGRAM_COUNT - 1); \
+	lint_t loop_var = program_index + (int)b; const uint32_t total_loops = total_full + (total_partial ? 1U : 0U); \
+	for (uint32_t CPPSPMD_GLUER2(_foreach_counter, __LINE__) = 0; CPPSPMD_GLUER2(_foreach_counter, __LINE__) < total_loops; ++CPPSPMD_GLUER2(_foreach_counter, __LINE__)) { \
+		if ((CPPSPMD_GLUER2(_foreach_counter, __LINE__) == (total_loops - 1)) && (total_partial)) { exec_mask partial_mask = exec_mask(vint_t((int)total_partial) > vint_t(program_index)); m_exec = m_exec & partial_mask; }
+
+#define SPMD_FOREACH_END(loop_var) m_exec = m_exec | m_continue_mask; if (!any(m_exec)) break; m_continue_mask = exec_mask::all_off(); check_masks(); store_all(loop_var, loop_var + PROGRAM_COUNT); } }
+
+// Okay to use spmd_continue or spmd_return, but not spmd_break
+// Visits each lane that was active on entry, one at a time; index_var (an int64_t declared by the macro) receives the lane number.
+// NOTE(review): relies on exec_mask::enable_lane(), which is declared elsewhere - presumably it leaves exactly that lane enabled; confirm.
+#define SPMD_FOREACH_ACTIVE(index_var) int64_t index_var; { uint64_t _movemask = m_exec.get_movemask(); if (_movemask) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	for (uint32_t _i = 0; _i < PROGRAM_COUNT; ++_i) { \
+		if (_movemask & (1U << _i)) { \
+			m_exec.enable_lane(_i); m_exec = m_exec & m_kernel_exec; \
+			(index_var) = _i; \
+
+#define SPMD_FOREACH_ACTIVE_END } } } }
+
+// Okay to use spmd_continue, but not spmd_break/spmd_return
+// NOTE(review): the original comment read "but not spmd_break/spmd_continue", which contradicts itself; spmd_return is presumably what was meant - confirm.
+// Runs the body once per distinct value of var: the lane values are stored to a temporary array, sorted and de-duplicated,
+// and for each unique value m_exec is set to the lanes whose var equals it (index_var holds that value).
+#define SPMD_FOREACH_UNIQUE_INT(index_var, var) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	CPPSPMD_DECL(int_t, _vals[PROGRAM_COUNT]); store_linear_all(_vals, var); std::sort(_vals, _vals + PROGRAM_COUNT); \
+	const int _n = (int)(std::unique(_vals, _vals + PROGRAM_COUNT) - _vals); \
+	for (int _i = 0; _i < _n; ++_i) { int index_var = _vals[_i]; vbool cond = (vint_t(var) == vint_t(index_var)); m_exec = exec_mask(cond);
+
+#define SPMD_FOREACH_UNIQUE_INT_END } }
+
+// Scope guard for the "simple" loop macros (SPMD_SWHILE / SPMD_SDO): it saves m_exec on construction and
+// writes it back unchanged on destruction. It does not touch the continue mask or apply m_kernel_exec.
+struct scoped_simple_while_restorer
+{
+	spmd_kernel* m_pKernel;
+	exec_mask m_orig_exec;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+
+	CPPSPMD_FORCE_INLINE scoped_simple_while_restorer(spmd_kernel* pKernel) :
+		m_pKernel(pKernel),
+		m_orig_exec(pKernel->m_exec)
+	{
+#ifdef _DEBUG
+		// Remember the outer in-loop state and flag that a loop is now running.
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_simple_while_restorer()
+	{
+		spmd_kernel& kernel = *m_pKernel;
+
+		kernel.m_exec = m_orig_exec;
+#ifdef _DEBUG
+		kernel.m_in_loop = m_prev_in_loop;
+		kernel.check_masks();
+#endif
+	}
+};
+
+// Cannot use SPMD break, continue, or return inside simple while
+// Usage: SPMD_SWHILE(cond) ... SPMD_SWEND
+// Each pass narrows m_exec by cond and leaves the loop once no lane is active; a scoped_simple_while_restorer puts m_exec back at the end.
+
+#define SPMD_SWHILE(cond) { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	while(true) { \
+		exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+#define SPMD_SWEND } }	
+
+// Cannot use SPMD break, continue, or return inside simple do
+// Usage: SPMD_SDO ... SPMD_SEND_DO(cond). The condition is tested after the body, so the body runs at least once.
+#define SPMD_SDO { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) {
+#define SPMD_SEND_DO(cond) exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; } }	
+
+// Block form of spmd_for(). Usage: SPMD_FOR(init, cond) ... SPMD_END_FOR(increment)
+// for_init runs once before the scoped_while_restorer is created; for_inc runs at the end of every pass,
+// after lanes parked by spmd_continue() have been re-enabled.
+#undef SPMD_FOR
+#undef SPMD_END_FOR
+#define SPMD_FOR(for_init, for_cond) { for_init; scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(for_cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+#define SPMD_END_FOR(for_inc) m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); for_inc; } }
+		
+// Lambda-based SPMD for loop: forInitBody runs once, then each pass narrows m_exec to the lanes for which
+// forCondBody() holds, runs forBody, re-enables lanes parked by spmd_continue(), and finally runs forIncrBody.
+// The loop ends when no lane is active. On exit m_exec is the entry mask minus any lanes removed from
+// m_kernel_exec inside the loop, and the caller's continue mask is restored.
+template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody)
+{
+	const exec_mask entry_exec = m_exec;
+
+	forInitBody();
+
+	const exec_mask outer_continue_mask = m_continue_mask;
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	const bool was_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	for (;;)
+	{
+		// Lanes whose condition fails drop out for the rest of the loop.
+		const exec_mask still_looping = exec_mask(forCondBody());
+		m_exec = m_exec & still_looping;
+
+		if (!any(m_exec))
+			break;
+
+		forBody();
+
+		// Continued lanes rejoin before the increment step.
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+
+		forIncrBody();
+	}
+
+	m_exec = entry_exec & m_kernel_exec;
+	m_continue_mask = outer_continue_mask;
+
+#ifdef _DEBUG
+	m_in_loop = was_in_loop;
+	check_masks();
+#endif
+}
--- a/thirdparty/basis_universal/encoder/cppspmd_math.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_math.h
@ -0,0 +1,725 @@
+// Do not include this header directly.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The general goal of these vectorized estimated math functions is scalability/performance.
+// There are explicitly no checks for NaN's/Inf's on the input arguments. There are no assertions either.
+// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper 
+// engineering analysis before relying on them.
+// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
+// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.
+
+// fmod-style remainder of a by b using a caller-supplied b_inv (expected to be 1/b).
+// The magnitude is frac(|a * b_inv|) * |b|; it is negated for lanes where a < 0.
+CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv)
+{
+	vfloat quotient_frac = frac(abs(a * b_inv));
+	vfloat magnitude = quotient_frac * abs(b);
+	return spmd_ternaryf(a < 0, -magnitude, magnitude);
+}
+
+// Variant of fmod_inv() without the abs()/sign handling: returns frac(a * b_inv) * b.
+CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv)
+{
+	vfloat quotient_frac = frac(a * b_inv);
+	return quotient_frac * b;
+}
+
+// Division that never divides by zero or by a tiny value: when |b| <= fDivThresh the divisor
+// is replaced by -fDivThresh (for b < 0) or +fDivThresh (otherwise).
+CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
+{
+	vfloat fallback_divisor = spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh);
+	vfloat divisor = spmd_ternaryf(abs(b) > fDivThresh, b, fallback_divisor);
+	return a / divisor;
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824
+
+	log2_est():
+	max abs err: 0.0000023076808731
+	max rel err: 0.0000000756678881
+	avg abs err: 0.0000007535452724
+	avg rel err: 0.0000000235117843
+
+	XMVectorLog2():
+	max abs err: 0.0000023329709933
+	max rel err: 0.0000000826961046
+	avg abs err: 0.0000007564889684
+	avg rel err: 0.0000000236051899
+
+	std::log2f():
+	max abs err: 0.0000020265979401
+	max rel err: 0.0000000626647654
+	avg abs err: 0.0000007494445227
+	avg rel err: 0.0000000233800985
+*/
+
+// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
+// Estimates log2(v) per lane. The input is split into exponent and significand by bit manipulation and
+// the significand part is approximated with a rational polynomial. Inputs are clamped to >= 2.2e-38.
+inline vfloat spmd_kernel::log2_est(vfloat v)
+{
+	vfloat mantissa_m1, exponent_f;
+
+	// Clamp to a very small positive value instead of testing for invalid inputs.
+	vfloat clamped = max(v, 2.2e-38f);
+
+	/*
+	 * Assumes the IEEE single layout sgn(1):exp(8):frac(23), i.e. the value
+	 * (1+frac)*2^(exp-127). "Significand" below means 1+frac.
+	 */
+
+	// Reinterpret the float so its fields can be handled as integer bits.
+	vint raw_bits = cast_vfloat_to_vint(clamped);
+
+	// Biased exponent field; the 127 (or 126) bias is removed in the branches below.
+	vint biased_exp = VUINT_SHIFT_RIGHT(raw_bits & 0x7F800000, 23);
+
+	vint rebased_bits;
+	vfloat rebased;
+
+	// Top fraction bit set <=> significand >= 1.5
+	vint upper_half = raw_bits & 0x00400000;
+	SPMD_SIF(upper_half != 0)
+	{
+		// Significand >= 1.5: halve it by forcing the exponent field to 126 (i.e. 2^-1).
+		store_all(rebased_bits, (raw_bits & 0x007FFFFF) | 0x3f000000);
+
+		store_all(rebased, cast_vint_to_vfloat(rebased_bits));
+
+		// Subtracting 126 rather than 127 makes up for the halving.
+		store_all(exponent_f, vfloat(biased_exp - 126));
+	}
+	SPMD_SELSE(upper_half != 0)
+	{
+		// Otherwise keep the significand as is by forcing the exponent field to 127 (i.e. 2^0).
+		store(rebased_bits, (raw_bits & 0x007FFFFF) | 0x3f800000);
+
+		store(rebased, cast_vint_to_vfloat(rebased_bits));
+
+		store(exponent_f, vfloat(biased_exp - 127));
+	}
+	SPMD_SENDIF
+
+	store_all(mantissa_m1, rebased);
+	store_all(mantissa_m1, mantissa_m1 - 1.0f);
+
+	// Coefficients of the rational approximation.
+	const float kA = 0.1501692f, kB = 3.4226132f, kC = 5.0225057f, kD = 4.1130283f, kE = 3.4813372f;
+
+	vfloat xm1 = mantissa_m1;
+	vfloat xm1sqr = xm1 * xm1;
+
+	vfloat numer = kA * (xm1sqr * xm1) + kB * xm1sqr + kC * xm1;
+	vfloat denom = xm1sqr + kD * xm1 + kE;
+
+	return exponent_f + (numer / denom);
+
+	// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
+	//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
+}
+
+// Natural log estimate: log2_est(v) scaled by ln(2). Its precision is bounded by that of log2_est().
+inline vfloat spmd_kernel::log_est(vfloat v)
+{
+	const float kLn2 = 0.693147181f;
+	vfloat log2_v = log2_est(v);
+	return log2_v * kLn2;
+}
+
+// Range reduction helper for exp2_est().
+// On return: arg holds the reduced fractional part, two_int_a holds 2**(int)arg built directly
+// from exponent bits, and adjustment is 1 for lanes where 0.5 was subtracted (0 otherwise).
+CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
+{
+	// Default: no half-step correction needed.
+	store_all(adjustment, 0);
+
+	// Integer part of the input.
+	vint whole_part = (vint)arg;
+
+	// Fractional part above 0.5: move it down by 0.5 and remember that we did.
+	SPMD_SIF((arg - whole_part) > 0.5f)
+	{
+		store(adjustment, 1);
+
+		store(arg, arg - 0.5f);
+	}
+	SPMD_SENDIF
+
+	// Strip the integer part so only the (reduced) fraction remains.
+	store_all(arg, arg - (vfloat)whole_part);
+
+	// Build 2**whole_part: bias the exponent (capped at 254) and shift it into the exponent field.
+	store_all(whole_part, min(whole_part + 127, 254));
+
+	store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(whole_part, 23)));
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range : -50.0000000000000000 49.9999940395355225, vals : 16777216
+	
+	exp2_est():
+	Total passed near - zero check : 16777216
+	Total sign diffs : 0
+	max abs err: 1668910609.7500000000000000
+	max rel err: 0.0000015642030031
+	avg abs err: 10793794.4007573910057545
+	avg rel err: 0.0000003890893282
+	 
+	XMVectorExp2():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1665552836.8750000000000000
+	max rel err: 0.0000114674862370
+	avg abs err: 10771868.2627860084176064
+	avg rel err: 0.0000011218880770
+
+	std::exp2f():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1591636585.6250000000000000
+	max rel err: 0.0000014849731018
+	avg abs err: 10775800.3204844966530800
+	avg rel err: 0.0000003851496422
+*/
+
+// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
+// Estimates 2**arg per lane. Lanes with |arg| > 126 return 0.
+inline vfloat spmd_kernel::exp2_est(vfloat arg)
+{
+	SPMD_BEGIN_CALL
+
+	// Coefficients of the rational approximation.
+	const vfloat kP0 = +7.2152891521493f;
+	const vfloat kP1 = +0.0576900723731f;
+	const vfloat kQ0 = +20.8189237930062f;
+	const vfloat kQ1 = +1.0f;
+	const vfloat kSqrt2 = 1.4142135623730950488f; // sqrt(2), used to undo the 0.5 reduction
+
+	vfloat out = 0.0f;
+
+	// Lanes whose magnitude is too large return early and keep the initial 0.
+	// This avoids introducing inf/nan's, or returning huge default values.
+	SPMD_IF(abs(arg) > 126.0f)
+	{
+		spmd_return();
+	}
+	SPMD_END_IF
+
+	// 2**(int(arg)), filled in by reduce_expb()
+	vfloat pow2_whole;
+
+	// Set to 1 by reduce_expb() for lanes that had 0.5 removed from their fraction
+	vint half_step;
+
+	// 0 for non-negative inputs, 1 for negative ones
+	vint was_negative = 0;
+
+	// Negative inputs are negated here; the reciprocal is taken at the end, since n**(-x) = 1/(n**x).
+	SPMD_SIF(arg < 0.0f)
+	{
+		store(arg, -arg);
+		store(was_negative, 1);
+	}
+	SPMD_SENDIF
+
+	store_all(arg, min(arg, 126.0f));
+
+	// Reduce the argument to [0.0, 0.5]
+	reduce_expb(arg, pow2_whole, half_step);
+
+	// The approximation has the form
+	//  answer = (Q(x**2) + x*P(x**2)) / (Q(x**2) - x*P(x**2))
+	// and is evaluated in pieces:
+	vfloat arg_sq = arg * arg;
+
+	// Q(x**2)
+	vfloat q_term = vfma(kQ1, arg_sq, kQ0);
+
+	// x*P(x**2)
+	vfloat xp_term = arg * (vfma(kP1, arg_sq, kP0));
+
+	vfloat approx = (q_term + xp_term) / (q_term - xp_term);
+
+	// Scale by 2**(int(arg))
+	store_all(approx, approx * pow2_whole);
+
+	// Undo the 0.5 reduction for the lanes that had it applied
+	store_all(approx, spmd_ternaryf(half_step != 0, approx * kSqrt2, approx));
+
+	// Undo the negation of negative inputs
+	SPMD_SIF(was_negative != 0)
+	{
+		store(approx, 1.0f / approx);
+	}
+	SPMD_SENDIF
+
+	store(out, approx);
+
+	return out;
+}
+
+// Estimates e**arg via exp2_est(): e^x = 2^(x / ln(2)).
+inline vfloat spmd_kernel::exp_est(vfloat arg)
+{
+	// 1 / ln(2)
+	const float kInvLn2 = 1.44269504f;
+	vfloat base2_exponent = arg * kInvLn2;
+	return exp2_est(base2_exponent);
+}
+
+// Estimates arg1**arg2 as exp(log(arg1) * arg2), built from log_est() and exp_est().
+inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
+{
+	vfloat ln_base = log_est(arg1);
+	vfloat scaled = ln_base * arg2;
+	return exp_est(scaled);
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	Total near-zero: 144, output above near-zero tresh: 30
+	Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16777072
+	Total sign diffs: 5
+	max abs err: 0.0000031375306036
+	max rel err: 0.1140846017075028
+	avg abs err: 0.0000003026226621
+	avg rel err: 0.0000033564977623
+*/
+
+// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
+// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
+// Estimates sin(a) when sin_flag is true and cos(a) when it is false (see the sin_est_a()/cos_est_a() wrappers).
+inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
+{
+	const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
+	// c1_w is 1/(2*pi): it converts the angle from radians to turns.
+	const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
+	const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
+	const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
+	const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;
+
+	vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;
+
+	// Angle in turns; the sine path additionally involves c1_x (0.25) - presumably vfms(a, b, c) is a*b - c, TODO confirm.
+	store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);
+
+	// Keep only the fractional part of the turn count.
+	store_all(r1_y, frac(r1_x));                   
+	
+	// Range tests converted to vfloat; they are used as weights when the three polynomials are combined below.
+	store_all(r2_x, (vfloat)(r1_y < c1_x));        
+
+	store_all(r2_y, (vfloat)(r1_y >= c1_y));    
+	store_all(r2_z, (vfloat)(r1_y >= c1_z));    
+
+	store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));
+
+	// Distances of the fraction from 0, 0.5 and 1 ...
+	store_all(r0_x, c0_x - r1_y);                
+	store_all(r0_y, c0_y - r1_y);                
+	store_all(r0_z, c0_z - r1_y);                
+	
+	// ... squared.
+	store_all(r0_x, r0_x * r0_x);
+	store_all(r0_y, r0_y * r0_y);
+	store_all(r0_z, r0_z * r0_z);
+
+	// Three polynomials in those squared distances, evaluated step by step in parallel.
+	store_all(r1_x, vfma(c2_x, r0_x, c2_z));           
+	store_all(r1_y, vfma(c2_y, r0_y, c2_w));           
+	store_all(r1_z, vfma(c2_x, r0_z, c2_z));           
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c3_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_x));
+		
+	store_all(r1_x, vfma(r1_x, r0_x, c3_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_z));
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c4_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_x));
+
+	store_all(r1_x, vfma(r1_x, r0_x, c4_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_z));
+
+	// Combine the three polynomial results using the range weights computed above.
+	store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));
+
+	return r0_x;
+}
+
+// Reciprocal estimate for positive values only: a magic-constant bit trick supplies the initial
+// guess, followed by one refinement step. Inputs below fMinThresh take the fallback path, whose
+// initial guess is the bit pattern 0 (the magic constant minus itself).
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
+{
+	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
+	const int kMagic = 0x7EF311C3;
+	const float fMinThresh = .0000125f;
+
+	// Too-small inputs are swapped for the float whose bits equal the magic constant.
+	vfloat guarded = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(kMagic)));
+
+	// Initial guess: magic constant minus the input's bit pattern.
+	vint guess_bits = vint(kMagic) - cast_vfloat_to_vint(guarded);
+	vfloat guess = cast_vint_to_vfloat(guess_bits);
+
+	// One refinement step, using the original (unguarded) input.
+	vfloat correction = vfnma(guess, q, 2.0f);
+	return guess * correction;
+}
+
+// Reciprocal estimate for positive or negative values: recip_est1() applied to |t|,
+// with the result multiplied by sign(t).
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
+{
+	vfloat s = sign(t);
+	vfloat magnitude = abs(t);
+
+	// recip_est1() uses the same magic constant, threshold and refinement step as the
+	// original inlined code, so the arithmetic is unchanged.
+	return recip_est1(magnitude) * s;
+}
+
+// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
+// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
+// Reciprocal square root estimate: magic-constant initial guess plus one refinement step.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
+{
+	vfloat half_x0 = 0.5f * x0;
+
+	// Initial guess from the input's bit pattern.
+	vint halved_bits = (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1));
+	vfloat guess = cast_vint_to_vfloat(vint(0x5F375A82) - halved_bits);
+
+	// Single refinement step with a tuned constant.
+	vfloat correction = vfnma(half_x0 * guess, guess, 1.5008909f);
+	return guess * correction;
+}
+
+// Reciprocal square root estimate: magic-constant initial guess plus two refinement steps.
+// The 1.5 constants are written as float literals (same value) so no implicit double->float
+// conversion is needed, matching rsqrt_est1().
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
+{
+	vfloat xhalf = 0.5f * x0;
+
+	// Initial guess from the input's bit pattern.
+	vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
+
+	// Two refinement steps.
+	vfloat x1 = x * vfnma(xhalf * x, x, 1.5f);
+	vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5f);
+	return x2;
+}
+
+// Math from: http://developer.download.nvidia.com/cg/atan2.html
+// TODO: Needs more validation, parameter checking.
+// Estimates atan2(y, x): a polynomial in min(|x|,|y|) / max(|x|,|y|), followed by fix-ups
+// that depend on which of |x|, |y| is larger and on the signs of x and y.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
+{
+	vfloat abs_y = abs(y);
+	vfloat abs_x = abs(x);
+
+	vfloat larger = max(abs_x, abs_y);
+	vfloat smaller;
+	store_all(smaller, min(abs_x, abs_y));
+
+	// Ratio of the smaller magnitude to the larger one.
+	vfloat ratio;
+	store_all(ratio, smaller / larger);
+
+	// Polynomial in ratio^2, evaluated step by step.
+	vfloat ratio_sq = ratio * ratio;
+	vfloat poly;
+	store_all(poly, vfma(-0.013480470f, ratio_sq, 0.057477314f));
+	store_all(poly, vfms(poly, ratio_sq, 0.121239071f));
+	store_all(poly, vfma(poly, ratio_sq, 0.195635925f));
+	store_all(poly, vfms(poly, ratio_sq, 0.332994597f));
+	store_all(poly, vfma(poly, ratio_sq, 0.999995630f));
+
+	vfloat angle;
+	store_all(angle, poly * ratio);
+
+	// |y| > |x|: use pi/2 - angle
+	store_all(angle, spmd_ternaryf(abs_y > abs_x, vfloat(1.570796327f) - angle, angle));
+
+	// x < 0: use pi - angle
+	store_all(angle, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - angle, angle));
+
+	// y < 0: negate
+	store_all(angle, spmd_ternaryf(y < 0.0f, -angle, angle));
+
+	return angle;
+}
+
+/*
+    clang 9.0.0 for win /fp:precise release
+	Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
+	Skipped angles near 90/270 within +- .001 radians.
+	Near-zero threshold: .0000125f
+	Near-zero output above check threshold: 1e-6f
+
+	Total near-zero: 144, output above near-zero tresh: 20
+	Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16766400
+	Total sign diffs: 5
+	max abs err: 1.4982600811139264
+	max rel err: 0.1459155900188041
+	avg rel err: 0.0000054659502568
+
+	XMVectorTan() precise:
+	Total near-zero: 144, output above near-zero tresh: 18
+	Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
+	Total near-zero sign diffs: 0
+	Total passed near-zero check: 16766400
+	Total sign diffs: 0
+	max abs err: 1.9883573246424930
+	max rel err: 0.1459724171926864
+	avg rel err: 0.0000054965766843
+
+	std::tanf():
+	Total near-zero: 144, output above near-zero tresh: 0
+	Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
+	Total near-zero sign diffs: 11
+	Total passed near-zero check: 16766400
+	Total sign diffs: 11
+	max abs err: 0.8989131818294709
+	max rel err: 0.0573181403173166
+	avg rel err: 0.0000030791301203
+	
+	Originally from:
+	http://www.ganssle.com/approx.htm
+*/
+
+// Rational approximation used by tan_est() on its range-reduced argument:
+//   x * (c2*x^2 + c1) / (x^2 * (c4 + x^2) + c3)
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
+{
+	// Original double version was 8.2 digits
+	//double c1 = 211.849369664121f, c2 = -12.5288887278448f, c3 = 269.7350131214121f, c4 = -71.4145309347748f;
+	// Tuned float constants for lower avg rel error (without using FMA3):
+	const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
+
+	vfloat x_sq = x * x;
+
+	vfloat numer = x * (vfma(c2, x_sq, c1));
+	vfloat denom = (vfma(x_sq, (c4 + x_sq), c3));
+
+	return numer / denom;
+}
+
+// Estimates tan(x) per lane: the sign is stripped, the angle is reduced to a fraction of pi, tan82() is evaluated on an
+// offset argument chosen per quarter of that range, and the sign is XORed back in at the end.
+// Don't call this for angles close to 90/270!.
+inline vfloat spmd_kernel::tan_est(vfloat x)
+{
+	const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
+	// Per-"octant" bias values 0, 2, -2, 4, stored with +128 added so they fit in unsigned bytes (the 128 is subtracted again below).
+	CPPSPMD_DECL(const uint8_t, s_table0[16]) =	{ 128 + 0, 128 + 2, 128 + -2, 128 + 4,    128 + 0, 128 + 2, 128 + -2, 128 + 4,	  128 + 0, 128 + 2, 128 + -2, 128 + 4,   128 + 0, 128 + 2, 128 + -2, 128 + 4 };
+
+	vint table = init_lookup4(s_table0); // a load
+	// Remember the input's sign bit; it is re-applied to the result on return.
+	vint sgn = cast_vfloat_to_vint(x) & 0x80000000;
+
+	store_all(x, abs(x));
+	vfloat orig_x = x;
+
+	// x becomes the fractional part of |x| / pi.
+	vfloat q = x * fOneOverPi;
+	store_all(x, q - floor(q));
+
+	// Integer part of 4 * fraction selects the table entry (called "octant" here).
+	vfloat x4 = x * 4.0f;
+	vint octant = (vint)(x4);
+
+	// Odd octants use the negated argument.
+	vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);
+
+	vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle
+
+	vfloat bias = (vfloat)k + -128.0f;
+	vfloat y = x0 + bias;
+
+	vfloat z = tan82(y);
+
+	vfloat r;
+	
+	vbool octant_one_or_two = (octant == 1) || (octant == 2);
+
+	// SPMD optimization - skip costly divide if we can
+	if (spmd_any(octant_one_or_two))
+	{
+		// Octants 1 and 2 use 1/z; z is kept away from zero first (same scheme as safe_div()).
+		const float fDivThresh = .4371e-7f;
+		vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));
+				
+		vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
+		store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
+	}
+	else
+	{
+		// No lane is in octant 1 or 2, so only z or -z is needed.
+		store_all(r, spmd_ternaryf(octant == 0, z, -z));
+	}
+		
+	// Small angle approximation, to decrease the max rel error near Pi.
+	// NOTE(review): presumably vfnma(a, b, c) is c - a*b, i.e. r = orig_x - (floor(q) + 1) * pi - confirm.
+	SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
+	{
+		store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
+	}
+	SPMD_SENDIF
+
+	// Flip the result's sign bit for inputs that were negative.
+	return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
+}
+
+// Seeds the random number state x from seed, then discards the first 20 outputs of get_randu().
+inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
+{
+	store(x.a, 0xf1ea5eed);
+	store(x.b, seed ^ 0xd8487b1f);
+	store(x.c, seed ^ 0xdbadef9a);
+	store(x.d, seed);
+
+	// Warm-up: run the generator 20 times and ignore the results.
+	int warmup_rounds_left = 20;
+	while (warmup_rounds_left-- > 0)
+	{
+		(void)get_randu(x);
+	}
+}
+
+// https://burtleburtle.net/bob/rand/smallprng.html
+// Advances the generator state x by one step and returns the new x.d
+// (32-bit unsigned random numbers, carried in a vint).
+inline vint spmd_kernel::get_randu(rand_context& x)
+{
+	vint mixed = x.a - VINT_ROT(x.b, 27);
+
+	// Each store below reads state written by the previous ones, so the order matters.
+	store(x.a, x.b ^ VINT_ROT(x.c, 17));
+	store(x.b, x.c + x.d);
+	store(x.c, x.d + mixed);
+	store(x.d, mixed + x.a);
+
+	return x.d;
+}
+
+// Returns random numbers between [low, high), or low if low >= high.
+// The random value is scaled into the range with mulhiu(random, high - low).
+inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
+{
+	vint random_bits = get_randu(x);
+	vint span = high - low;
+
+	vint offset = mulhiu(random_bits, span);
+	vint candidate = low + offset;
+
+	return spmd_ternaryi(low < high, candidate, low);
+}
+
+// Returns random numbers between [low, high), or low if low >= high.
+inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
+{
+	// Keep the low 23 random bits ...
+	vint random_23 = get_randu(x) & 0x7fffff;
+
+	// ... and scale them by 2^-23, giving a value in [0, 1).
+	vfloat unit = (vfloat)(random_23) * (1.0f / 8388608.0f);
+
+	// low + (high - low) * unit
+	vfloat candidate = vfma(high - low, unit, low);
+
+	return spmd_ternaryf(low < high, candidate, low);
+}
+
+// Builds the two 16-entry lookup tables used by reverse_bits():
+// tab1 maps a nibble to its bit-reversed value, tab2 holds the same values shifted into the high nibble.
+CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
+{
+	const uint8_t reversed_nibble[16] = { 0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE, 0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF };
+	const uint8_t reversed_nibble_hi[16] = { 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0 };
+
+	store_all(tab1, init_lookup4(reversed_nibble));
+	store_all(tab2, init_lookup4(reversed_nibble_hi));
+}
+
+// Reverses the bit order of each lane of k, using the tables from init_reverse_bits().
+// Each byte has its two nibbles reversed and swapped via table lookups; byteswap() then reverses the byte order.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
+{
+	// Low nibble of every byte -> reversed and placed in the high nibble (tab2).
+	vint from_low_nibbles = table_lookup4_8(k & 0x7F7F7F7F, tab2);
+
+	// High nibble of every byte -> reversed and placed in the low nibble (tab1).
+	vint from_high_nibbles = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
+
+	vint reversed_within_bytes = from_low_nibbles | from_high_nibbles;
+
+	return byteswap(reversed_within_bytes);
+}
+
+// Counts the leading zero bits of each lane: three conditional shift steps (16, 8, 4 bits) followed by
+// a table lookup on the remaining top nibble.
+// NOTE(review): for x == 0 all three steps fire and the lookup appears to add s_tab[0] == 0, giving 28 rather than 32 - confirm callers never pass 0.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
+{
+	// Leading-zero count of a 4-bit value, indexed by that value.
+	CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+	vint nibble_clz = init_lookup4(s_tab);
+
+	// Top 16 bits clear (x <= 0x0000ffff): count 16 and move the low half up.
+	vbool top16_clear = (x & 0xFFFF0000) == 0;
+	vint count_a = spmd_ternaryi(top16_clear, 16, 0);
+	vint value_a = spmd_ternaryi(top16_clear, VINT_SHIFT_LEFT(x, 16), x);
+
+	// Top 8 bits clear (<= 0x00ffffff): count 8 more and shift up.
+	vbool top8_clear = (value_a & 0xFF000000) == 0;
+	vint count_b = spmd_ternaryi(top8_clear, count_a + 8, count_a);
+	vint value_b = spmd_ternaryi(top8_clear, VINT_SHIFT_LEFT(value_a, 8), value_a);
+
+	// Top 4 bits clear (<= 0x0fffffff): count 4 more and shift up.
+	vbool top4_clear = (value_b & 0xF0000000) == 0;
+	vint count_c = spmd_ternaryi(top4_clear, count_b + 4, count_b);
+	vint value_c = spmd_ternaryi(top4_clear, VINT_SHIFT_LEFT(value_b, 4), value_b);
+
+	// The remaining top nibble is resolved through the table.
+	return table_lookup4_8(VUINT_SHIFT_RIGHT(value_c, 28), nibble_clz) + count_c;
+}
+
+// Table-free variant of count_leading_zeros(): five conditional steps (16, 8, 4, 2, 1 bits).
+// Note: for x == 0 every step fires, so the result is 16 + 8 + 4 + 2 + 1 = 31.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
+{
+	// Top 16 bits clear (x <= 0x0000ffff)
+	vbool top16_clear = (x & 0xFFFF0000) == 0;
+	vint count_a = spmd_ternaryi(top16_clear, 16, 0);
+	vint value_a = spmd_ternaryi(top16_clear, VINT_SHIFT_LEFT(x, 16), x);
+
+	// Top 8 bits clear (<= 0x00ffffff)
+	vbool top8_clear = (value_a & 0xFF000000) == 0;
+	vint count_b = spmd_ternaryi(top8_clear, count_a + 8, count_a);
+	vint value_b = spmd_ternaryi(top8_clear, VINT_SHIFT_LEFT(value_a, 8), value_a);
+
+	// Top 4 bits clear (<= 0x0fffffff)
+	vbool top4_clear = (value_b & 0xF0000000) == 0;
+	vint count_c = spmd_ternaryi(top4_clear, count_b + 4, count_b);
+	vint value_c = spmd_ternaryi(top4_clear, VINT_SHIFT_LEFT(value_b, 4), value_b);
+
+	// Top 2 bits clear (<= 0x3fffffff)
+	vbool top2_clear = (value_c & 0xC0000000) == 0;
+	vint count_d = spmd_ternaryi(top2_clear, count_c + 2, count_c);
+	vint value_d = spmd_ternaryi(top2_clear, VINT_SHIFT_LEFT(value_c, 2), value_c);
+
+	// Top bit clear (<= 0x7fffffff)
+	vbool top1_clear = (value_d & 0x80000000) == 0;
+	return spmd_ternaryi(top1_clear, count_d + 1, count_d);
+}
+
+// Counts the trailing zero bits of each lane of x by isolating the lowest set bit, converting it to
+// a float and reading the bit index back out of the float's exponent field.
+// For x == 0 the converted value is 0.0f (exponent field 0), so the result is -127.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
+{
+	// Isolate the least significant set bit of x and convert that power of two to a float.
+	vfloat f = (vfloat)(x & -x);
+
+	// The float's biased exponent is (bit index + 127). Mask the shifted value down to the 8 exponent
+	// bits before removing the bias: when only bit 31 is set the isolated value is INT_MIN, which
+	// converts to a negative float, and without the mask its sign bit would end up in the result
+	// (287 instead of 31).
+	return (VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) & 0xFF) - 0x7F;
+}
+
+// Counts the set bits of each lane with the usual parallel bit-summing steps.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
+{
+	// Sums of adjacent bit pairs (2-bit fields).
+	vint pair_sums = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);
+
+	// Sums of adjacent pairs of those (4-bit fields).
+	vint nibble_sums = (pair_sums & 0x33333333) + (VUINT_SHIFT_RIGHT(pair_sums, 2) & 0x33333333);
+
+	// Per-byte sums. '+' binds tighter than '&', so the mask applies to the whole sum; the
+	// parentheses only make that explicit.
+	vint byte_sums = (nibble_sums + VUINT_SHIFT_RIGHT(nibble_sums, 4)) & 0xF0F0F0F;
+
+	// The multiply accumulates all four byte sums into the top byte, which is then shifted down.
+	return VUINT_SHIFT_RIGHT((byte_sums * 0x1010101), 24);
+}
+
+// Unsigned 16-bit element comparisons, built from subs_epu16(), cmpeq_epi16() and andnot().
+
+// a <= b: true where subs_epu16(a, b) compares equal to zero.
+CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b)
+{
+	vint difference = subs_epu16(a, b);
+	return cmpeq_epi16(difference, vint(0));
+}
+
+// a >= b, expressed as b <= a.
+CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b)
+{
+	vint b_le_a = cmple_epu16(b, a);
+	return b_le_a;
+}
+
+// a > b: (b <= a) with the equal elements removed.
+CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
+{
+	vint is_equal = cmpeq_epi16(a, b);
+	vint b_le_a = cmple_epu16(b, a);
+	return andnot(is_equal, b_le_a);
+}
+
+// a < b, expressed as b > a.
+CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
+{
+	vint b_gt_a = cmpgt_epu16(b, a);
+	return b_gt_a;
+}
+
+// Signed 16-bit element comparisons.
+
+// a >= b: equal OR greater.
+CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
+{
+	vint is_equal = cmpeq_epi16(a, b);
+	vint is_greater = cmpgt_epi16(a, b);
+	return is_equal | is_greater;
+}
+
+// a <= b, expressed as b >= a.
+CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
+{
+	vint b_ge_a = cmpge_epi16(b, a);
+	return b_ge_a;
+}
+
+// Debug helper: prints all PROGRAM_COUNT lanes of v as signed decimal integers, then a newline.
+// Declared inline because this definition lives in a header; without it, including the header
+// from more than one translation unit produces multiple-definition link errors.
+inline void spmd_kernel::print_vint(vint v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("%i ", extract(v, i)); 
+	printf("\n"); 
+}
+
+// Debug helper: prints all PROGRAM_COUNT lanes of v as 1 or 0, then a newline.
+// Declared inline because this definition lives in a header; without it, including the header
+// from more than one translation unit produces multiple-definition link errors.
+inline void spmd_kernel::print_vbool(vbool v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("%i ", extract(v, i) ? 1 : 0); 
+	printf("\n"); 
+}
+	
+// Debug helper: prints all PROGRAM_COUNT lanes of v in upper-case hexadecimal, then a newline.
+// Declared inline because this definition lives in a header; without it, including the header
+// from more than one translation unit produces multiple-definition link errors.
+inline void spmd_kernel::print_vint_hex(vint v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+	{
+		// %X expects an unsigned int, so convert the extracted lane value explicitly.
+		printf("0x%X ", static_cast<unsigned int>(extract(v, i))); 
+	}
+	printf("\n"); 
+}
+
+// Debug helper: prints the optional prefix, then the index of every lane for which
+// storeu_linear() wrote a 1 into a zeroed flag array, then a newline.
+// pPrefix may be null, in which case no prefix is printed.
+// Declared inline because this definition lives in a header; without it, including the header
+// from more than one translation unit produces multiple-definition link errors.
+inline void spmd_kernel::print_active_lanes(const char *pPrefix) 
+{ 
+	// Start from all-zero flags; lanes the store does not touch stay 0.
+	CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
+	memset(flags, 0, sizeof(flags));
+	storeu_linear(flags, vint(1));
+
+	if (pPrefix)
+		printf("%s", pPrefix);
+
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+	{
+		if (flags[i])
+			printf("%u ", i);
+	}
+	printf("\n");
+}
+	
+// Debug helper: prints all PROGRAM_COUNT lanes of v with %f, then a newline.
+// Declared inline because this definition lives in a header; without it, including the header
+// from more than one translation unit produces multiple-definition link errors.
+inline void spmd_kernel::print_vfloat(vfloat v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("%f ", extract(v, i)); 
+	printf("\n"); 
+}
--- a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
@ -0,0 +1,89 @@
+// Do not include this header directly.
+// This header defines shared struct spmd_kernel helpers.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// See cppspmd_math.h for detailed error statistics.
+
+// Range-reduction helper used by exp2_est().
+CPPSPMD_FORCE_INLINE void reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment);
+// NOTE(review): cppspmd_math.h contains no definition of tan56() - confirm it is defined elsewhere or unused.
+CPPSPMD_FORCE_INLINE vfloat tan56(vfloat x);
+// Rational approximation used by tan_est().
+CPPSPMD_FORCE_INLINE vfloat tan82(vfloat x);
+
+// Estimated base-2 logarithm.
+inline vfloat log2_est(vfloat v);
+
+// Estimated natural logarithm (log2_est() scaled by a constant).
+inline vfloat log_est(vfloat v);
+
+// Estimated 2**arg.
+inline vfloat exp2_est(vfloat arg);
+
+// Estimated e**arg (implemented with exp2_est()).
+inline vfloat exp_est(vfloat arg);
+
+// Estimated arg1**arg2 (implemented with log_est() and exp_est()).
+inline vfloat pow_est(vfloat arg1, vfloat arg2);
+
+// Reciprocal estimates: recip_est1() is for positive values only, recip_est1_pn() also handles negative values.
+CPPSPMD_FORCE_INLINE vfloat recip_est1(const vfloat& q);
+CPPSPMD_FORCE_INLINE vfloat recip_est1_pn(const vfloat& q);
+
+// NOTE(review): cppspmd_math.h contains no definition of mod_angles() - confirm it is defined elsewhere or unused.
+inline vfloat mod_angles(vfloat a);
+
+// Reference sine/cosine estimate; sin_flag selects sine (true) or cosine (false).
+inline vfloat sincos_est_a(vfloat a, bool sin_flag);
+CPPSPMD_FORCE_INLINE vfloat sin_est_a(vfloat a) { return sincos_est_a(a, true); }
+CPPSPMD_FORCE_INLINE vfloat cos_est_a(vfloat a) { return sincos_est_a(a, false); }
+
+// NOTE(review): cppspmd_math.h contains no definition of sin_est() or cos_est() - confirm they are defined elsewhere or unused.
+inline vfloat sin_est(vfloat a);
+
+inline vfloat cos_est(vfloat a);
+
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est1(vfloat x0);
+
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est2(vfloat x0);
+
+// Estimated atan2(y, x).
+CPPSPMD_FORCE_INLINE vfloat atan2_est(vfloat y, vfloat x);
+
+// Estimated atan(x), computed as atan2_est(x, 1).
+CPPSPMD_FORCE_INLINE vfloat atan_est(vfloat x) { return atan2_est(x, vfloat(1.0f)); }
+
+// Don't call this for angles close to 90/270! 
+inline vfloat tan_est(vfloat x);
+
+// https://burtleburtle.net/bob/rand/smallprng.html
+// State of the random number generator: four vints, advanced by get_randu().
+struct rand_context { vint a, b, c, d; };
+
+// Initializes x from seed and discards the generator's first outputs.
+inline void seed_rand(rand_context& x, vint seed);
+
+// Returns 32-bit unsigned random numbers.
+inline vint get_randu(rand_context& x);
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vint get_randi(rand_context& x, vint low, vint high);
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vfloat get_randf(rand_context& x, vfloat low, vfloat high);
+
+// init_reverse_bits() fills the two lookup tables that reverse_bits() expects as tab1/tab2.
+CPPSPMD_FORCE_INLINE void init_reverse_bits(vint& tab1, vint& tab2);
+CPPSPMD_FORCE_INLINE vint reverse_bits(vint k, vint tab1, vint tab2);
+
+// Leading-zero counts: table-assisted version and table-free alternative.
+CPPSPMD_FORCE_INLINE vint count_leading_zeros(vint x);
+CPPSPMD_FORCE_INLINE vint count_leading_zeros_alt(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_trailing_zeros(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_set_bits(vint x);
+
+// Debug printing helpers; each writes to stdout with printf and ends the line with a newline.
+void print_vint(vint v);
+void print_vbool(vbool v);
+void print_vint_hex(vint v);
+void print_active_lanes(const char *pPrefix);
+void print_vfloat(vfloat v);
+
--- a/thirdparty/basis_universal/encoder/cppspmd_sse.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_sse.h
--- a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
+++ b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
@ -0,0 +1,47 @@
+// cppspmd_type_aliases.h
+// Do not include this file directly
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Introduces short aliases for the CPPSPMD types into the including scope.
+// The CPPSPMD_TYPES guard ensures the aliases are only declared once.
+#ifndef CPPSPMD_TYPES
+#define CPPSPMD_TYPES
+
+using exec_mask = CPPSPMD::exec_mask;
+
+// Integer types: the vint16 family when CPPSPMD_INT16 is non-zero, the vint family otherwise.
+#if CPPSPMD_INT16
+using vint16 = CPPSPMD::vint16;
+using int16_lref = CPPSPMD::int16_lref;
+using cint16_vref = CPPSPMD::cint16_vref;
+using int16_vref = CPPSPMD::int16_vref;
+using lint16 = CPPSPMD::lint16;
+using vint16_vref = CPPSPMD::vint16_vref;
+#else
+using vint = CPPSPMD::vint;
+using int_lref = CPPSPMD::int_lref;
+using cint_vref = CPPSPMD::cint_vref;
+using int_vref = CPPSPMD::int_vref;
+using lint = CPPSPMD::lint;
+using vint_vref = CPPSPMD::vint_vref;
+#endif
+
+// Boolean and float types are aliased in both configurations.
+using vbool = CPPSPMD::vbool;
+using vfloat = CPPSPMD::vfloat;
+using float_lref = CPPSPMD::float_lref;
+using float_vref = CPPSPMD::float_vref;
+using vfloat_vref = CPPSPMD::vfloat_vref;
+
+#endif // CPPSPMD_TYPES
--- a/thirdparty/basis_universal/encoder/jpgd.cpp
+++ b/thirdparty/basis_universal/encoder/jpgd.cpp
--- a/thirdparty/basis_universal/encoder/jpgd.h
+++ b/thirdparty/basis_universal/encoder/jpgd.h
@ -0,0 +1,347 @@
+// jpgd.h - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+#ifndef JPEG_DECODER_H
+#define JPEG_DECODER_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <setjmp.h>
+#include <assert.h>
+#include <stdint.h>
+
+#ifdef _MSC_VER
+#define JPGD_NORETURN __declspec(noreturn) 
+#elif defined(__GNUC__)
+#define JPGD_NORETURN __attribute__ ((noreturn))
+#else
+#define JPGD_NORETURN
+#endif
+
+#define JPGD_HUFF_TREE_MAX_LENGTH 512
+#define JPGD_HUFF_CODE_SIZE_MAX_LENGTH 256
+
+namespace jpgd
+{
+	typedef unsigned char  uint8;
+	typedef   signed short int16;
+	typedef unsigned short uint16;
+	typedef unsigned int   uint;
+	typedef   signed int   int32;
+
+	// Loads a JPEG image from a memory buffer or a file.
+	// req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
+	// On return, width/height will be set to the image's dimensions, and actual_comps will be set to either 1 (grayscale) or 3 (RGB).
+	// Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
+	// Requesting an 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
+	unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+	unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+
+	// Success/failure error codes.
+	enum jpgd_status
+	{
+		// Generic results.
+		JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
+		// Specific failure codes. Only the first has an explicit value (-256); each later enumerator is one greater
+		// than the previous, so reordering or inserting entries renumbers every code after it.
+		JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
+		JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
+		JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
+		JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
+		JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
+		JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
+		JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER,
+		JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS
+	};
+
+	// Input stream interface.
+	// Derive from this class to read input data from sources other than files or memory. Set *pEOF_flag to true in read() when no more data is available.
+	// The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
+	// If the input stream contains data after the JPEG stream's EOI (end of image) marker, it will probably be pulled into the internal buffer.
+	// Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
+	class jpeg_decoder_stream
+	{
+	public:
+		jpeg_decoder_stream() { }
+		// Virtual so that a derived stream can be destroyed through a jpeg_decoder_stream pointer.
+		virtual ~jpeg_decoder_stream() { }
+
+		// The read() method is called when the internal input buffer is empty.
+		// Parameters:
+		// pBuf - input buffer
+		// max_bytes_to_read - maximum bytes that can be written to pBuf
+		// pEOF_flag - set this to true if at end of stream (no more bytes remaining)
+		// Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
+		// Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
+		// Pure virtual: every concrete stream class must implement it.
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag) = 0;
+	};
+
+	// stdio FILE stream class.
+	class jpeg_decoder_file_stream : public jpeg_decoder_stream
+	{
+		// Copy construction and assignment are declared private, so the stream cannot be copied by outside code.
+		jpeg_decoder_file_stream(const jpeg_decoder_file_stream&);
+		jpeg_decoder_file_stream& operator =(const jpeg_decoder_file_stream&);
+
+		// stdio handle of the file being read.
+		FILE* m_pFile;
+		bool m_eof_flag, m_error_flag;
+
+	public:
+		jpeg_decoder_file_stream();
+		virtual ~jpeg_decoder_file_stream();
+
+		// Opens the file named by Pfilename for reading.
+		// NOTE(review): presumably returns true on success; the definition lives in jpgd.cpp - confirm there.
+		bool open(const char* Pfilename);
+		void close();
+
+		// Implements jpeg_decoder_stream::read(); see the base class for the parameter and return contract.
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
+	};
+
+	// Memory stream class.
+	class jpeg_decoder_mem_stream : public jpeg_decoder_stream
+	{
+		// Source buffer. Only the pointer is stored (the data is not copied by the constructors below),
+		// so the caller's buffer must stay valid while the stream is in use.
+		const uint8* m_pSrc_data;
+		// m_ofs starts at 0; m_size is the byte count supplied by the caller.
+		uint m_ofs, m_size;
+
+	public:
+		// Creates an empty stream (null data pointer, zero size).
+		jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
+		// Creates a stream over the size bytes starting at pSrc_data.
+		jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
+
+		virtual ~jpeg_decoder_mem_stream() { }
+
+		// NOTE(review): presumably re-targets the stream at a new buffer; defined in jpgd.cpp - confirm there.
+		bool open(const uint8* pSrc_data, uint size);
+		// Returns the stream to the empty state; the buffer itself is not freed.
+		void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
+
+		// Implements jpeg_decoder_stream::read(); see the base class for the parameter and return contract.
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
+	};
+
+	// Loads JPEG file from a jpeg_decoder_stream.
+	unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+
+	// Compile-time limits. JPGD_IN_BUF_SIZE sizes jpeg_decoder's input buffer, and the JPGD_MAX_* component,
+	// table and block counts are used as array bounds for jpeg_decoder's members.
+	enum
+	{
+		JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
+		JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768
+	};
+
+	typedef int16 jpgd_quant_t;
+	typedef int16 jpgd_block_t;
+
+	// JPEG decoder. Typical use, per the member comments below: construct over a jpeg_decoder_stream, check
+	// get_error_code(), call begin_decoding(), then call decode() once per scan line.
+	class jpeg_decoder
+	{
+	public:
+		// Bit flags accepted by the constructor's flags argument.
+		// NOTE(review): presumably selects the *ConvertFiltered() chroma paths declared below - confirm in jpgd.cpp.
+		enum
+		{
+			cFlagLinearChromaFiltering = 1
+		};
+
+		// Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
+		// methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
+		jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags = cFlagLinearChromaFiltering);
+
+		~jpeg_decoder();
+
+		// Call this method after constructing the object to begin decompression.
+		// If JPGD_SUCCESS is returned you may then call decode() on each scanline.
+
+		int begin_decoding();
+
+		// Returns the next scan line.
+		// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1).
+		// Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
+		// Returns JPGD_SUCCESS if a scan line has been returned.
+		// Returns JPGD_DONE if all scan lines have been returned.
+		// Returns JPGD_FAILED if an error occurred. Call get_error_code() for more info.
+		int decode(const void** pScan_line, uint* pScan_line_len);
+
+		inline jpgd_status get_error_code() const { return m_error_code; }
+
+		inline int get_width() const { return m_image_x_size; }
+		inline int get_height() const { return m_image_y_size; }
+
+		inline int get_num_components() const { return m_comps_in_frame; }
+
+		inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; }
+		// Width in pixels times bytes per pixel (the unpadded scan line size).
+		inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); }
+
+		// Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
+		inline int get_total_bytes_read() const { return m_total_bytes_read; }
+
+	private:
+		// Copy construction and assignment are declared private, so the decoder cannot be copied by outside code.
+		jpeg_decoder(const jpeg_decoder&);
+		jpeg_decoder& operator =(const jpeg_decoder&);
+
+		// Signature of the static decode_block_* handlers declared at the end of the class; taken by decode_scan().
+		typedef void (*pDecode_block_func)(jpeg_decoder*, int, int, int);
+
+		struct huff_tables
+		{
+			bool ac_table;
+			uint  look_up[256];
+			uint  look_up2[256];
+			uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH];
+			uint  tree[JPGD_HUFF_TREE_MAX_LENGTH];
+		};
+
+		struct coeff_buf
+		{
+			uint8* pData;
+			int block_num_x, block_num_y;
+			int block_len_x, block_len_y;
+			int block_size;
+		};
+
+		// Node of the singly linked list headed by m_pMem_blocks.
+		// NOTE(review): m_data[1] looks like the first byte of a variable-length payload - confirm in alloc().
+		struct mem_block
+		{
+			mem_block* m_pNext;
+			size_t m_used_count;
+			size_t m_size;
+			char m_data[1];
+		};
+
+		// NOTE(review): presumably the setjmp() target that stop_decoding() jumps back to - confirm in jpgd.cpp.
+		jmp_buf m_jmp_state;
+		uint32_t m_flags;
+		mem_block* m_pMem_blocks;
+		int m_image_x_size;
+		int m_image_y_size;
+		jpeg_decoder_stream* m_pStream;
+
+		int m_progressive_flag;
+
+		uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
+		uint8* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
+		uint8* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
+		jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
+		int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
+		int m_comps_in_frame;                         // # of components in frame
+		int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
+		int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
+		int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
+		int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
+		int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
+		int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
+		int m_comps_in_scan;                          // # of components in scan
+		int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
+		int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
+		int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
+		int m_spectral_start;                         // spectral selection start
+		int m_spectral_end;                           // spectral selection end
+		int m_successive_low;                         // successive approximation low
+		int m_successive_high;                        // successive approximation high
+		int m_max_mcu_x_size;                         // MCU's max. X size in pixels
+		int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
+		int m_blocks_per_mcu;
+		int m_max_blocks_per_row;
+		int m_mcus_per_row, m_mcus_per_col;
+		int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
+		int m_total_lines_left;                       // total # lines left in image
+		int m_mcu_lines_left;                         // total # lines left in this MCU
+		int m_num_buffered_scanlines;
+		int m_real_dest_bytes_per_scan_line;
+		int m_dest_bytes_per_scan_line;               // rounded up
+		int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
+		huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
+		coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
+		coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
+		int m_eob_run;
+		int m_block_y_mcu[JPGD_MAX_COMPONENTS];
+		uint8* m_pIn_buf_ofs;
+		int m_in_buf_left;
+		int m_tem_flag;
+
+		// Input buffer with 64 bytes of padding declared directly before and after it.
+		uint8 m_in_buf_pad_start[64];
+		uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
+		uint8 m_in_buf_pad_end[64];
+
+		int m_bits_left;
+		uint m_bit_buf;
+		int m_restart_interval;
+		int m_restarts_left;
+		int m_next_restart_num;
+		int m_max_mcus_per_row;
+		int m_max_blocks_per_mcu;
+
+		int m_max_mcus_per_col;
+		uint m_last_dc_val[JPGD_MAX_COMPONENTS];
+		jpgd_block_t* m_pMCU_coefficients;
+		int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
+		uint8* m_pSample_buf;
+		uint8* m_pSample_buf_prev;
+		int m_crr[256];
+		int m_cbb[256];
+		int m_crg[256];
+		int m_cbg[256];
+		uint8* m_pScan_line_0;
+		uint8* m_pScan_line_1;
+		jpgd_status m_error_code;
+		int m_total_bytes_read;
+
+		bool m_ready_flag;
+		bool m_eof_flag;
+		bool m_sample_buf_prev_valid;
+
+		// Debug-only range check: asserts 0 <= ofs < m_max_blocks_per_row * 64 and returns ofs unchanged.
+		inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; }
+		void free_all_blocks();
+		// Declared JPGD_NORETURN: control never returns to the caller.
+		JPGD_NORETURN void stop_decoding(jpgd_status status);
+		void* alloc(size_t n, bool zero = false);
+		void word_clear(void* p, uint16 c, uint n);
+		void prep_in_buffer();
+		void read_dht_marker();
+		void read_dqt_marker();
+		void read_sof_marker();
+		void skip_variable_marker();
+		void read_dri_marker();
+		void read_sos_marker();
+		int next_marker();
+		int process_markers();
+		void locate_soi_marker();
+		void locate_sof_marker();
+		int locate_sos_marker();
+		void init(jpeg_decoder_stream* pStream, uint32_t flags);
+		void create_look_ups();
+		void fix_in_buffer();
+		void transform_mcu(int mcu_row);
+		coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
+		inline jpgd_block_t* coeff_buf_getp(coeff_buf* cb, int block_x, int block_y);
+		void load_next_row();
+		void decode_next_row();
+		void make_huff_table(int index, huff_tables* pH);
+		void check_quant_tables();
+		void check_huff_tables();
+		bool calc_mcu_block_order();
+		int init_scan();
+		void init_frame();
+		void process_restart();
+		void decode_scan(pDecode_block_func decode_block_func);
+		void init_progressive();
+		void init_sequential();
+		void decode_start();
+		void decode_init(jpeg_decoder_stream* pStream, uint32_t flags);
+		void H2V2Convert();
+		uint32_t H2V2ConvertFiltered();
+		void H2V1Convert();
+		void H2V1ConvertFiltered();
+		void H1V2Convert();
+		void H1V2ConvertFiltered();
+		void H1V1Convert();
+		void gray_convert();
+		void find_eoi();
+		inline uint get_char();
+		inline uint get_char(bool* pPadding_flag);
+		inline void stuff_char(uint8 q);
+		inline uint8 get_octet();
+		inline uint get_bits(int num_bits);
+		inline uint get_bits_no_markers(int numbits);
+		inline int huff_decode(huff_tables* pH);
+		inline int huff_decode(huff_tables* pH, int& extrabits);
+
+		// Clamps a value between 0-255.
+		// The unsigned comparison is true both for negative i and for i > 255. In that case (~i) >> 31 is 0 for
+		// negative i (because ~i is then non-negative), giving 0; for i > 255, ~i is negative and the result is 255
+		// provided the right shift of a negative int is arithmetic (assumed here; implementation-defined before C++20).
+		static inline uint8 clamp(int i)
+		{
+			if (static_cast<uint>(i) > 255)
+				i = (((~i) >> 31) & 0xFF);
+			return static_cast<uint8>(i);
+		}
+		int decode_next_mcu_row();
+
+		// Block handlers matching pDecode_block_func; each receives the decoder explicitly as pD.
+		static void decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+	};
+
+} // namespace jpgd
+
+#endif // JPEG_DECODER_H
--- a/thirdparty/basis_universal/encoder/lodepng.cpp
+++ b/thirdparty/basis_universal/encoder/lodepng.cpp
@ -29,6 +29,7 @@ Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for
 */

 #ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
 #pragma warning (disable : 4201)

 #ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL
@ -200,6 +201,7 @@ static void uivector_init(uivector* p) {
 /*returns 1 if success, 0 if failure ==> nothing done*/
 static unsigned uivector_push_back(uivector* p, unsigned c) {
  if(!uivector_resize(p, p->size + 1)) return 0;
+  if (!p->data) return 0;
  p->data[p->size - 1] = c;
  return 1;
 }
--- a/thirdparty/basis_universal/encoder/lodepng.h
+++ b/thirdparty/basis_universal/encoder/lodepng.h
--- a/thirdparty/basis_universal/transcoder/basisu.h
+++ b/thirdparty/basis_universal/transcoder/basisu.h
@ -1,5 +1,5 @@
 // basisu.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -41,10 +41,6 @@
 			#endif
 		#endif // defined(_DEBUG) || defined(DEBUG)

-		#ifndef NOMINMAX
-			#define NOMINMAX
-		#endif
-
 	#endif // BASISU_NO_ITERATOR_DEBUG_LEVEL

 #endif // _MSC_VER
@ -63,10 +59,11 @@
 #include <functional>
 #include <iterator>
 #include <type_traits>
-#include <vector>
 #include <assert.h>
 #include <random>

+#include "basisu_containers.h"
+
 #ifdef max
 #undef max
 #endif
@ -79,20 +76,20 @@
 #define strcasecmp _stricmp
 #endif

-// Set to one to enable debug printf()'s when any errors occur, for development/debugging.
-#ifndef BASISU_DEVEL_MESSAGES
-#define BASISU_DEVEL_MESSAGES 0
+// Set to one to enable debug printf()'s when any errors occur, for development/debugging. Especially useful for WebGL development.
+#ifndef BASISU_FORCE_DEVEL_MESSAGES
+#define BASISU_FORCE_DEVEL_MESSAGES 0
 #endif

 #define BASISU_NOTE_UNUSED(x) (void)(x)
 #define BASISU_ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(x) x(const x &) = delete; x& operator= (const x &) = delete;
 #define BASISU_ASSUME(x) static_assert(x, #x);
-#define BASISU_OFFSETOF(s, m) (uint32_t)(intptr_t)(&((s *)(0))->m)
+#define BASISU_OFFSETOF(s, m) offsetof(s, m)
 #define BASISU_STRINGIZE(x) #x
 #define BASISU_STRINGIZE2(x) BASISU_STRINGIZE(x)

-#if BASISU_DEVEL_MESSAGES
+#if BASISU_FORCE_DEVEL_MESSAGES
 	#define BASISU_DEVEL_ERROR(...) do { basisu::debug_printf(__VA_ARGS__); } while(0)
 #else
 	#define BASISU_DEVEL_ERROR(...)
@ -108,26 +105,43 @@ namespace basisu
 	const char BASISU_PATH_SEPERATOR_CHAR = '/';
 #endif

-	typedef std::vector<uint8_t> uint8_vec;
-	typedef std::vector<int16_t> int16_vec;
-	typedef std::vector<uint16_t> uint16_vec;
-	typedef std::vector<uint32_t> uint_vec;
-	typedef std::vector<uint64_t> uint64_vec;
-	typedef std::vector<int> int_vec;
-	typedef std::vector<bool> bool_vec;
+	typedef basisu::vector<uint8_t> uint8_vec;
+	typedef basisu::vector<int16_t> int16_vec;
+	typedef basisu::vector<uint16_t> uint16_vec;
+	typedef basisu::vector<uint32_t> uint_vec;
+	typedef basisu::vector<uint64_t> uint64_vec;
+	typedef basisu::vector<int> int_vec;
+	typedef basisu::vector<bool> bool_vec;

 	void enable_debug_printf(bool enabled);
 	void debug_printf(const char *pFmt, ...);
 		
+
 	template <typename T> inline void clear_obj(T& obj) { memset(&obj, 0, sizeof(obj)); }

 	template <typename T0, typename T1> inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; }

 	template <typename S> inline S maximum(S a, S b) { return (a > b) ? a : b; }
 	template <typename S> inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); }
+	template <typename S> inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); }
 	
 	template <typename S> inline S minimum(S a, S b) {	return (a < b) ? a : b; }
 	template <typename S> inline S minimum(S a, S b, S c) {	return minimum(minimum(a, b), c); }
+	template <typename S> inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); }
+
+	// Non-template scalar helpers. Each min/max returns its second argument when neither compares less/greater,
+	// which is also what happens when a float argument is NaN.
+	inline float clampf(float value, float low, float high) { return (value < low) ? low : ((value > high) ? high : value); }
+	inline float saturate(float value) { return clampf(value, 0.0f, 1.0f); }
+	inline uint8_t minimumub(uint8_t a, uint8_t b) { if (a < b) return a; return b; }
+	inline uint32_t minimumu(uint32_t a, uint32_t b) { if (a < b) return a; return b; }
+	inline int32_t minimumi(int32_t a, int32_t b) { if (a < b) return a; return b; }
+	inline float minimumf(float a, float b) { if (a < b) return a; return b; }
+	inline uint8_t maximumub(uint8_t a, uint8_t b) { if (a > b) return a; return b; }
+	inline uint32_t maximumu(uint32_t a, uint32_t b) { if (a > b) return a; return b; }
+	inline int32_t maximumi(int32_t a, int32_t b) { if (a > b) return a; return b; }
+	inline float maximumf(float a, float b) { if (a > b) return a; return b; }
+	inline int squarei(int i) { const int squared = i * i; return squared; }
+	inline float squaref(float i) { const float squared = i * i; return squared; }
+	template<typename T> inline T square(T a) { const T squared = a * a; return squared; }

 	template <typename S> inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); }

@ -137,12 +151,10 @@ namespace basisu
 	template<typename T> inline void clear_vector(T &vec) { vec.erase(vec.begin(), vec.end()); }		
 	template<typename T> inline typename T::value_type *enlarge_vector(T &vec, size_t n) { size_t cs = vec.size(); vec.resize(cs + n); return &vec[cs]; }

-	template<typename S> inline S square(S val) { return val * val; }
-
 	inline bool is_pow2(uint32_t x) { return x && ((x & (x - 1U)) == 0U); }
 	inline bool is_pow2(uint64_t x) { return x && ((x & (x - 1U)) == 0U); }

-	template<typename T> inline T open_range_check(T v, T minv, T maxv) { assert(v >= minv && v < maxv); return v; }
+	template<typename T> inline T open_range_check(T v, T minv, T maxv) { assert(v >= minv && v < maxv); BASISU_NOTE_UNUSED(minv); BASISU_NOTE_UNUSED(maxv); return v; }
 	template<typename T> inline T open_range_check(T v, T maxv) { assert(v < maxv); BASISU_NOTE_UNUSED(maxv); return v; }

 	inline uint32_t total_bits(uint32_t v) { uint32_t l = 0; for ( ; v > 0U; ++l) v >>= 1; return l; }
@ -244,27 +256,92 @@ namespace basisu
 		if ((ha <= lb) || (la >= hb)) return false;
 		return true;
 	}
+
+	// Reads a 32-bit unsigned little-endian value from the 4 bytes at pBytes (pBytes[0] is the least significant byte).
+	// Each byte is widened to uint32_t before shifting; a uint8_t is otherwise promoted to signed int, and shifting
+	// a value >= 0x80 left by 24 would move a set bit into the sign bit.
+	static inline uint32_t read_le_dword(const uint8_t *pBytes)
+	{
+		return (static_cast<uint32_t>(pBytes[3]) << 24U) | (static_cast<uint32_t>(pBytes[2]) << 16U) | (static_cast<uint32_t>(pBytes[1]) << 8U) | static_cast<uint32_t>(pBytes[0]);
+	}
+
+	// Stores val into pBytes[0..3] in little-endian order (least significant byte first).
+	static inline void write_le_dword(uint8_t* pBytes, uint32_t val)
+	{
+		for (uint32_t byte_index = 0; byte_index < 4U; ++byte_index)
+			pBytes[byte_index] = static_cast<uint8_t>(val >> (byte_index * 8U));
+	}
 		
-	// Always little endian 2-4 byte unsigned int
+	// Always little endian 1-8 byte unsigned int
 	template<uint32_t NumBytes>
 	struct packed_uint
 	{
 		uint8_t m_bytes[NumBytes];

-		inline packed_uint() { static_assert(NumBytes <= 4, "NumBytes <= 4"); }
-		inline packed_uint(uint32_t v) { *this = v; }
+		inline packed_uint() { static_assert(NumBytes <= sizeof(uint64_t), "Invalid NumBytes"); }
+		inline packed_uint(uint64_t v) { *this = v; }
 		inline packed_uint(const packed_uint& other) { *this = other; }
+						
+		inline packed_uint& operator= (uint64_t v) 
+		{ 
+			for (uint32_t i = 0; i < NumBytes; i++) 
+				m_bytes[i] = static_cast<uint8_t>(v >> (i * 8)); 
+			return *this; 
+		}

-		inline packed_uint& operator= (uint32_t v) { for (uint32_t i = 0; i < NumBytes; i++) m_bytes[i] = static_cast<uint8_t>(v >> (i * 8)); return *this; }
+		inline packed_uint& operator= (const packed_uint& rhs) 
+		{ 
+			memcpy(m_bytes, rhs.m_bytes, sizeof(m_bytes)); 
+			return *this;
+		}

 		inline operator uint32_t() const
 		{
 			switch (NumBytes)
 			{
-				case 1:  return  m_bytes[0];
-				case 2:  return (m_bytes[1] << 8U) | m_bytes[0];
-				case 3:  return (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | (m_bytes[0]);
-				default: return (m_bytes[3] << 24U) | (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | (m_bytes[0]);
+				case 1:  
+				{
+					return  m_bytes[0];
+				}
+				case 2:  
+				{
+					return (m_bytes[1] << 8U) | m_bytes[0];
+				}
+				case 3:  
+				{
+					return (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | m_bytes[0];
+				}
+				case 4:  
+				{
+					return read_le_dword(m_bytes);
+				}
+				case 5:
+				{
+					uint32_t l = read_le_dword(m_bytes);
+					uint32_t h = m_bytes[4];
+					return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U);
+				}
+				case 6:
+				{
+					uint32_t l = read_le_dword(m_bytes);
+					uint32_t h = (m_bytes[5] << 8U) | m_bytes[4];
+					return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U);
+				}
+				case 7:
+				{
+					uint32_t l = read_le_dword(m_bytes);
+					uint32_t h = (m_bytes[6] << 16U) | (m_bytes[5] << 8U) | m_bytes[4];
+					return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U);
+				}
+				case 8:  
+				{
+					uint32_t l = read_le_dword(m_bytes);
+					uint32_t h = read_le_dword(m_bytes + 4);
+					return static_cast<uint64_t>(l) | (static_cast<uint64_t>(h) << 32U);
+				}
+				default: 
+				{
+					assert(0);
+					return 0;
+				}
 			}
 		}
 	};
@ -278,7 +355,7 @@ namespace basisu
 	enum
 	{
 		cHuffmanMaxSupportedCodeSize = 16, cHuffmanMaxSupportedInternalCodeSize = 31, 
-		cHuffmanFastLookupBits = 10, cHuffmanFastLookupSize = 1 << cHuffmanFastLookupBits,
+		cHuffmanFastLookupBits = 10, 
 		cHuffmanMaxSymsLog2 = 14, cHuffmanMaxSyms = 1 << cHuffmanMaxSymsLog2,

 		// Small zero runs
@ -308,15 +385,15 @@ namespace basisu
 		// Block-based formats
 		cETC1,			// ETC1
 		cETC1S,			// ETC1 (subset: diff colors only, no subblocks)
-		cETC2_RGB,		// ETC2 color block
-		cETC2_RGBA,		// ETC2 alpha block followed by ETC2 color block
+		cETC2_RGB,		// ETC2 color block (basisu doesn't support ETC2 planar/T/H modes - just basic ETC1)
+		cETC2_RGBA,		// ETC2 EAC alpha block followed by ETC2 color block
 		cETC2_ALPHA,	// ETC2 EAC alpha block 
 		cBC1,				// DXT1
-		cBC3,				// DXT5 (DXT5A block followed by a DXT1 block)
+		cBC3,				// DXT5 (BC4/DXT5A block followed by a BC1/DXT1 block)
 		cBC4,				// DXT5A
-		cBC5,				// 3DC/DXN (two DXT5A blocks)
+		cBC5,				// 3DC/DXN (two BC4/DXT5A blocks)
 		cBC7,
-		cASTC4x4,		
+		cASTC4x4,		// LDR only
 		cPVRTC1_4_RGB,
 		cPVRTC1_4_RGBA,
 		cATC_RGB,
@ -325,6 +402,9 @@ namespace basisu
 		cPVRTC2_4_RGBA,
 		cETC2_R11_EAC,
 		cETC2_RG11_EAC,
+		cUASTC4x4,		
+		cBC1_NV,
+		cBC1_AMD,
 		
 		// Uncompressed/raw pixels
 		cRGBA32,
@ -343,6 +423,8 @@ namespace basisu
 		case texture_format::cETC2_RGB:
 		case texture_format::cETC2_ALPHA:
 		case texture_format::cBC1:
+		case texture_format::cBC1_NV:
+		case texture_format::cBC1_AMD:
 		case texture_format::cBC4:
 		case texture_format::cPVRTC1_4_RGB:
 		case texture_format::cPVRTC1_4_RGBA:
--- a/thirdparty/basis_universal/transcoder/basisu_containers.h
+++ b/thirdparty/basis_universal/transcoder/basisu_containers.h
--- a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h
+++ b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h
@ -0,0 +1,311 @@
+// basisu_containers_impl.h
+// Do not include directly
+
+#ifdef _MSC_VER
+#pragma warning (disable:4127) // warning C4127: conditional expression is constant
+#endif
+
+namespace basisu
+{
+   // Reports a failed vector allocation on stderr and aborts the process.
+   // pFunc_name is the allocator that failed ("realloc" or "malloc").
+   [[noreturn]] static void elemental_vector_alloc_failure(const char* pFunc_name, size_t desired_size)
+   {
+      // Print the full size: a 32-bit cast would misreport requests of 4GB or more.
+      fprintf(stderr, "vector: %s() failed allocating %llu bytes", pFunc_name, static_cast<unsigned long long>(desired_size));
+      abort();
+   }
+
+   // Returns the number of bytes the heap reports as usable for block p where the platform exposes that
+   // (_msize / malloc_usable_size), otherwise the size that was requested.
+   static size_t elemental_vector_usable_size(void* p, size_t desired_size)
+   {
+#ifdef _MSC_VER
+      (void)desired_size;
+      return _msize(p);
+#elif HAS_MALLOC_USABLE_SIZE
+      (void)desired_size;
+      return malloc_usable_size(p);
+#else
+      (void)p;
+      return desired_size;
+#endif
+   }
+
+   // Grows the vector's storage so it can hold at least min_new_capacity elements.
+   //
+   // min_new_capacity - required capacity, in elements. Nothing happens if the current capacity already suffices.
+   // grow_hint        - when true, the new capacity is rounded up to the next power of 2.
+   // element_size     - size of one element in bytes.
+   // pMover           - when null, the block is resized with realloc(). Otherwise a new block is malloc()'ed,
+   //                    pMover is called to move the m_size existing elements into it, and the old block is freed.
+   // nofail           - when true, failure returns false and leaves the vector unchanged; when false, failure
+   //                    prints a message to stderr and calls abort().
+   //
+   // Returns true on success. m_capacity may end up larger than requested when the heap reports extra usable space.
+   bool elemental_vector::increase_capacity(uint32_t min_new_capacity, bool grow_hint, uint32_t element_size, object_mover pMover, bool nofail)
+   {
+      assert(m_size <= m_capacity);
+
+      // Debug-only sanity limit on the total byte size (larger bound on 64-bit builds).
+      if (sizeof(void *) == sizeof(uint64_t))
+         assert(min_new_capacity < (0x400000000ULL / element_size));
+      else
+         assert(min_new_capacity < (0x7FFF0000U / element_size));
+
+      if (m_capacity >= min_new_capacity)
+         return true;
+
+      size_t new_capacity = min_new_capacity;
+      if ((grow_hint) && (!helpers::is_power_of_2((uint64_t)new_capacity)))
+      {
+         new_capacity = (size_t)helpers::next_pow2((uint64_t)new_capacity);
+
+         assert(new_capacity && (new_capacity > m_capacity));
+
+         // A rounded value smaller than the request means the rounding wrapped around.
+         if (new_capacity < min_new_capacity)
+         {
+            if (nofail)
+               return false;
+            fprintf(stderr, "vector too large\n");
+            abort();
+         }
+      }
+
+      const size_t desired_size = element_size * new_capacity;
+      void* new_p = nullptr;
+      if (!pMover)
+      {
+         // No mover supplied: let realloc() relocate the raw bytes. On failure m_p is still valid.
+         new_p = realloc(m_p, desired_size);
+         if (!new_p)
+         {
+            if (nofail)
+               return false;
+            elemental_vector_alloc_failure("realloc", desired_size);
+         }
+      }
+      else
+      {
+         // A mover was supplied: allocate a fresh block and let it relocate each element.
+         new_p = malloc(desired_size);
+         if (!new_p)
+         {
+            if (nofail)
+               return false;
+            elemental_vector_alloc_failure("malloc", desired_size);
+         }
+
+         (*pMover)(new_p, m_p, m_size);
+
+         if (m_p)
+            free(m_p);
+      }
+
+      const size_t actual_size = elemental_vector_usable_size(new_p, desired_size);
+      m_p = new_p;
+
+      // Take advantage of any slack the heap handed back.
+      if (actual_size > desired_size)
+         m_capacity = static_cast<uint32_t>(actual_size / element_size);
+      else
+         m_capacity = static_cast<uint32_t>(new_capacity);
+
+      return true;
+   }
+
+#if BASISU_HASHMAP_TEST
+
+#define HASHMAP_TEST_VERIFY(c) do { if (!(c)) handle_hashmap_test_verify_failure(__LINE__); } while(0)
+
+   // Failure handler behind HASHMAP_TEST_VERIFY: prints the source line of the failed check to stderr and aborts.
+   static void handle_hashmap_test_verify_failure(int line)
+   {
+      fprintf(stderr, "HASHMAP_TEST_VERIFY() failed on line %i\n", line);
+      abort();
+   }
+
+   // Test helper: wraps a uint32_t and keeps a running count of live instances in m_count, so the hash map
+   // test can check that every constructed object was also destroyed.
+   class counted_obj
+   {
+   public:
+      // Number of counted_obj instances currently alive.
+      static uint32_t m_count;
+
+      uint32_t m_val;
+
+      counted_obj(uint32_t v = 0) : m_val(v) { ++m_count; }
+
+      counted_obj(const counted_obj& obj) : m_val(obj.m_val) { ++m_count; }
+
+      ~counted_obj()
+      {
+         assert(m_count > 0);
+         --m_count;
+      }
+
+      // Exposes the wrapped value as a size_t.
+      operator size_t() const { return m_val; }
+
+      bool operator== (const counted_obj& rhs) const { return rhs.m_val == m_val; }
+      bool operator== (const uint32_t rhs) const { return rhs == m_val; }
+   };
+
+   // Static storage duration, so it starts at zero.
+   uint32_t counted_obj::m_count;
+
+   // Builds a 32-bit pseudo-random value by XORing three successive rand() results placed at bit offsets 0, 15 and 17.
+   // The results are widened to uint32_t before shifting: rand() returns int, and shifting it left by 15 or 17
+   // overflows a signed int whenever RAND_MAX is larger than 0x7FFF.
+   static uint32_t urand32()
+   {
+      uint32_t a = rand();
+      uint32_t b = static_cast<uint32_t>(rand()) << 15;
+      uint32_t c = static_cast<uint32_t>(rand()) << (32 - 15);
+      return a ^ b ^ c;
+   }
+
+   // Returns a pseudo-random integer in the half-open range [l, h).
+   // Requires l < h: this is asserted in debug builds, and l is returned if the assert is compiled out.
+   static int irand32(int l, int h)
+   {
+      assert(l < h);
+      if (l >= h)
+         return l;
+
+      uint32_t range = static_cast<uint32_t>(h - l);
+
+      uint32_t rnd = urand32();
+
+      // Scale the 32-bit random value into [0, range) using the top 32 bits of the 64-bit product.
+      uint32_t rnd_range = static_cast<uint32_t>((((uint64_t)range) * ((uint64_t)rnd)) >> 32U);
+
+      int result = l + rnd_range;
+      assert((result >= l) && (result < h));
+      return result;
+   }
+
+   // Exercises basisu::hash_map: first a smoke test that calls the basic members on a uint64_t map, then an
+   // endless randomized insert / copy / find / erase stress loop using counted_obj keys and values.
+   // Failed checks are reported through HASHMAP_TEST_VERIFY. The stress loop has no exit condition, so this
+   // function does not return.
+   void hash_map_test()
+   {
+      // Smoke test: call the basic hash_map members once on small maps.
+      {
+         basisu::hash_map<uint64_t, uint64_t> k;
+         basisu::hash_map<uint64_t, uint64_t> l;
+         std::swap(k, l);
+
+         k.begin();
+         k.end();
+         k.clear();
+         k.empty();
+         k.erase(0);
+         k.insert(0, 1);
+         k.find(0);
+         k.get_equals();
+         k.get_hasher();
+         k.get_table_size();
+         k.reset();
+         k.reserve(1);
+         k = l;
+         k.set_equals(l.get_equals());
+         k.set_hasher(l.get_hasher());
+         k.get_table_size();
+      }
+
+      uint32_t seed = 0;
+      for (; ; )
+      {
+         seed++;
+
+         typedef basisu::hash_map<counted_obj, counted_obj> my_hash_map;
+         my_hash_map m;
+
+         // Number of insert attempts this round; may be 0.
+         const uint32_t n = irand32(0, 100000);
+
+         printf("%u\n", n);
+
+         srand(seed); // r1.seed(seed);
+
+         // Keys that were actually inserted. An erased key k is later stored as -k - 1 so it can still be recovered.
+         basisu::vector<int> q;
+
+         uint32_t count = 0;
+         for (uint32_t i = 0; i < n; i++)
+         {
+            uint32_t v = urand32() & 0x7FFFFFFF;
+            my_hash_map::insert_result res = m.insert(counted_obj(v), counted_obj(v ^ 0xdeadbeef));
+            if (res.second)
+            {
+               count++;
+               q.push_back(v);
+            }
+         }
+
+         HASHMAP_TEST_VERIFY(m.size() == count);
+
+         // Re-seed so the lookup pass below replays exactly the keys generated above.
+         srand(seed);
+
+         // Round-trip the contents through a copy to exercise copy construction, assignment and reset.
+         my_hash_map cm(m);
+         m.clear();
+         m = cm;
+         cm.reset();
+
+         for (uint32_t i = 0; i < n; i++)
+         {
+            uint32_t v = urand32() & 0x7FFFFFFF;
+            my_hash_map::const_iterator it = m.find(counted_obj(v));
+            HASHMAP_TEST_VERIFY(it != m.end());
+            HASHMAP_TEST_VERIFY(it->first == v);
+            HASHMAP_TEST_VERIFY(it->second == (v ^ 0xdeadbeef));
+         }
+
+         for (uint32_t t = 0; t < 2; t++)
+         {
+            // Erase a random batch of keys. When nothing was inserted (q is empty) there is nothing to erase:
+            // irand32() requires l < h, and indexing q[0] on an empty vector would be out of bounds.
+            const uint32_t nd = (q.size() > 0) ? irand32(1, q.size() + 1) : 0;
+            for (uint32_t i = 0; i < nd; i++)
+            {
+               uint32_t p = irand32(0, q.size());
+
+               int k = q[p];
+               if (k >= 0)
+               {
+                  // Mark the slot as erased (negative) before removing the key from the map.
+                  q[p] = -k - 1;
+
+                  bool s = m.erase(counted_obj(k));
+                  HASHMAP_TEST_VERIFY(s);
+               }
+            }
+
+            typedef basisu::hash_map<uint32_t, empty_type> uint_hash_set;
+            uint_hash_set s;
+
+            // Every surviving key must still be found with the right value; every erased key must be gone.
+            for (uint32_t i = 0; i < q.size(); i++)
+            {
+               int v = q[i];
+
+               if (v >= 0)
+               {
+                  my_hash_map::const_iterator it = m.find(counted_obj(v));
+                  HASHMAP_TEST_VERIFY(it != m.end());
+                  HASHMAP_TEST_VERIFY(it->first == (uint32_t)v);
+                  HASHMAP_TEST_VERIFY(it->second == ((uint32_t)v ^ 0xdeadbeef));
+
+                  s.insert(v);
+               }
+               else
+               {
+                  my_hash_map::const_iterator it = m.find(counted_obj(-v - 1));
+                  HASHMAP_TEST_VERIFY(it == m.end());
+               }
+            }
+
+            // Iterating the map must visit exactly the surviving keys collected in s.
+            uint32_t found_count = 0;
+            for (my_hash_map::const_iterator it = m.begin(); it != m.end(); ++it)
+            {
+               HASHMAP_TEST_VERIFY(it->second == ((uint32_t)it->first ^ 0xdeadbeef));
+
+               uint_hash_set::const_iterator fit(s.find((uint32_t)it->first));
+               HASHMAP_TEST_VERIFY(fit != s.end());
+
+               HASHMAP_TEST_VERIFY(fit->first == it->first);
+
+               found_count++;
+            }
+
+            HASHMAP_TEST_VERIFY(found_count == s.size());
+         }
+
+         // Only the map's own key/value objects may still be alive: two counted_obj per entry.
+         HASHMAP_TEST_VERIFY(counted_obj::m_count == m.size() * 2);
+      }
+   }
+
+#endif // BASISU_HASHMAP_TEST
+
+} // namespace basisu
--- a/thirdparty/basis_universal/transcoder/basisu_file_headers.h
+++ b/thirdparty/basis_universal/transcoder/basisu_file_headers.h
@ -1,5 +1,5 @@
 // basis_file_headers.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -20,8 +20,11 @@ namespace basist
 	// Slice desc header flags
 	enum basis_slice_desc_flags
 	{
-		cSliceDescFlagsIsAlphaData = 1,
-		cSliceDescFlagsFrameIsIFrame = 2			// Video only: Frame doesn't refer to previous frame (no usage of conditional replenishment pred symbols)
+		cSliceDescFlagsHasAlpha = 1,
+		
+		// Video only: Frame doesn't refer to previous frame (no usage of conditional replenishment pred symbols)
+		// Currently the first frame is always an I-Frame, all subsequent frames are P-Frames. This will eventually be changed to periodic I-Frames.
+		cSliceDescFlagsFrameIsIFrame = 2			
 	};

 #pragma pack(push)
@ -38,7 +41,7 @@ namespace basist
 		basisu::packed_uint<2> m_num_blocks_x;	// The slice's block X dimensions. Each block is 4x4 pixels. The slice's pixel resolution may or may not be a power of 2.
 		basisu::packed_uint<2> m_num_blocks_y;	// The slice's block Y dimensions. 

-		basisu::packed_uint<4> m_file_ofs;		// Offset from the header to the start of the slice's data
+		basisu::packed_uint<4> m_file_ofs;		// Offset from the start of the file to the start of the slice's data
 		basisu::packed_uint<4> m_file_size;		// The size of the compressed slice data in bytes

 		basisu::packed_uint<2> m_slice_data_crc16; // The CRC16 of the compressed slice data, for extra-paranoid use cases
@ -47,9 +50,21 @@ namespace basist
 	// File header flags
 	enum basis_header_flags
 	{
-		cBASISHeaderFlagETC1S = 1,					// Always set for basis universal files
-		cBASISHeaderFlagYFlipped = 2,				// Set if the texture had to be Y flipped before encoding
-		cBASISHeaderFlagHasAlphaSlices = 4		// True if the odd slices contain alpha data
+		// Always set for ETC1S files. Not set for UASTC files.
+		cBASISHeaderFlagETC1S = 1,					 
+		
+		// Set if the texture had to be Y flipped before encoding. The actual interpretation of this (is Y up or down?) is up to the user.
+		cBASISHeaderFlagYFlipped = 2,				 
+		
+		// Set if any slices contain alpha (for ETC1S, if the odd slices contain alpha data)
+		cBASISHeaderFlagHasAlphaSlices = 4,		 
+		
+		// For ETC1S files, this will be true if the file utilizes a codebook from another .basis file. 
+		cBASISHeaderFlagUsesGlobalCodebook = 8, 
+		
+		// Set if the texture data is sRGB, otherwise it's linear. 
+		// In reality, we have no idea if the texture data is actually linear or sRGB. This is the m_perceptual parameter passed to the compressor.
+		cBASISHeaderFlagSRGB = 16,					 
 	};

 	// The image type field attempts to describe how to interpret the image data in a Basis file.
@ -71,6 +86,12 @@ namespace basist
 		cBASISMaxUSPerFrame = 0xFFFFFF
 	};

+	enum class basis_tex_format	// Texture data format of a file; this is the enum stored in basis_file_header::m_tex_format.
+	{
+		cETC1S = 0,		// ETC1S texture data
+		cUASTC4x4 = 1	// UASTC texture data (presumably 4x4 pixel blocks, going by the name - TODO confirm)
+	};
+
 	struct basis_file_header
 	{
 		enum
@ -82,16 +103,16 @@ namespace basist
 		basisu::packed_uint<2>      m_sig;				// 2 byte file signature
 		basisu::packed_uint<2>      m_ver;				// Baseline file version
 		basisu::packed_uint<2>      m_header_size;	// Header size in bytes, sizeof(basis_file_header)
-		basisu::packed_uint<2>      m_header_crc16;	// crc16 of the remaining header data
+		basisu::packed_uint<2>      m_header_crc16;	// CRC16 of the remaining header data

 		basisu::packed_uint<4>      m_data_size;		// The total size of all data after the header
 		basisu::packed_uint<2>      m_data_crc16;		// The CRC16 of all data after the header

-		basisu::packed_uint<3>      m_total_slices;	// The total # of compressed slices (1 slice per image, or 2 for alpha basis files)
+		basisu::packed_uint<3>      m_total_slices;	// The total # of compressed slices (1 slice per image, or 2 for alpha .basis files)

 		basisu::packed_uint<3>      m_total_images;	// The total # of images
 				
-		basisu::packed_uint<1>      m_format;			// enum basist::block_format
+		basisu::packed_uint<1>      m_tex_format;		// enum basis_tex_format
 		basisu::packed_uint<2>      m_flags;			// enum basist::header_flags
 		basisu::packed_uint<1>      m_tex_type;		// enum basist::basis_texture_type
 		basisu::packed_uint<3>      m_us_per_frame;	// Framerate of video, in microseconds per frame
@ -101,11 +122,11 @@ namespace basist
 		basisu::packed_uint<4>      m_userdata1;		// For client use

 		basisu::packed_uint<2>      m_total_endpoints;			// The number of endpoints in the endpoint codebook 
-		basisu::packed_uint<4>      m_endpoint_cb_file_ofs;	// The compressed endpoint codebook's file offset relative to the header
+		basisu::packed_uint<4>      m_endpoint_cb_file_ofs;	// The compressed endpoint codebook's file offset relative to the start of the file
 		basisu::packed_uint<3>      m_endpoint_cb_file_size;	// The compressed endpoint codebook's size in bytes

 		basisu::packed_uint<2>      m_total_selectors;			// The number of selectors in the endpoint codebook 
-		basisu::packed_uint<4>      m_selector_cb_file_ofs;	// The compressed selectors codebook's file offset relative to the header
+		basisu::packed_uint<4>      m_selector_cb_file_ofs;	// The compressed selectors codebook's file offset relative to the start of the file
 		basisu::packed_uint<3>      m_selector_cb_file_size;	// The compressed selector codebook's size in bytes

 		basisu::packed_uint<4>      m_tables_file_ofs;			// The file offset of the compressed Huffman codelength tables, for decompressing slices
--- a/thirdparty/basis_universal/transcoder/basisu_global_selector_cb.h
+++ b/thirdparty/basis_universal/transcoder/basisu_global_selector_cb.h
@ -1,4 +1,4 @@
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2020 Binomial LLC. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
--- a/thirdparty/basis_universal/transcoder/basisu_global_selector_palette.h
+++ b/thirdparty/basis_universal/transcoder/basisu_global_selector_palette.h
@ -1,5 +1,7 @@
 // basisu_global_selector_palette.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+// 
+// TODO: NONE of this is used in .basis/.ktx2 files. It will be deleted soon.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@ -609,7 +611,7 @@ namespace basist
 		uint8_t m_selectors[16];
 	};

-	typedef std::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec;
+	typedef basisu::vector<etc1_selector_palette_entry> etc1_selector_palette_entry_vec;

 	extern const uint32_t g_global_selector_cb[];
 	extern const uint32_t g_global_selector_cb_size;
@ -628,7 +630,7 @@ namespace basist
 		void set(uint32_t palette_index, const etc1_global_palette_entry_modifier &modifier) { m_palette_index = palette_index; m_modifier = modifier; }
 	};

-	typedef std::vector<etc1_global_selector_codebook_entry_id> etc1_global_selector_codebook_entry_id_vec;
+	typedef basisu::vector<etc1_global_selector_codebook_entry_id> etc1_global_selector_codebook_entry_id_vec;

 	class etc1_global_selector_codebook
 	{
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder.h
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.h
@ -1,5 +1,5 @@
 // basisu_transcoder.h
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -15,10 +15,25 @@
 // limitations under the License.
 #pragma once

-// Set BASISU_DEVEL_MESSAGES to 1 to enable debug printf()'s whenever an error occurs, for easier debugging during development.
-//#define BASISU_DEVEL_MESSAGES 1
+// By default KTX2 support is enabled to simplify compilation. This implies the need for the Zstandard library (which we distribute as a single source file in the "zstd" directory) by default.
+// Set BASISD_SUPPORT_KTX2 to 0 to completely disable KTX2 support as well as Zstd/miniz usage which is only required for UASTC supercompression in KTX2 files.
+// Also see BASISD_SUPPORT_KTX2_ZSTD in basisu_transcoder.cpp, which individually disables Zstd usage.
+#ifndef BASISD_SUPPORT_KTX2
+	#define BASISD_SUPPORT_KTX2 1
+#endif
+
+// Set BASISD_SUPPORT_KTX2_ZSTD to 0 to disable Zstd usage and KTX2 UASTC Zstd supercompression support 
+#ifndef BASISD_SUPPORT_KTX2_ZSTD
+	#define BASISD_SUPPORT_KTX2_ZSTD 1
+#endif
+
+// Set BASISU_FORCE_DEVEL_MESSAGES to 1 to enable debug printf()'s whenever an error occurs, for easier debugging during development.
+#ifndef BASISU_FORCE_DEVEL_MESSAGES
+	#define BASISU_FORCE_DEVEL_MESSAGES 0
+#endif

 #include "basisu_transcoder_internal.h"
+#include "basisu_transcoder_uastc.h"
 #include "basisu_global_selector_palette.h"
 #include "basisu_file_headers.h"

@ -45,12 +60,11 @@ namespace basist
 		cTFBC3_RGBA = 3, 							// Opaque+alpha, BC4 followed by a BC1 block, alpha channel will be opaque for opaque .basis files
 		cTFBC4_R = 4,								// Red only, alpha slice is transcoded to output if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified
 		cTFBC5_RG = 5,								// XY: Two BC4 blocks, X=R and Y=Alpha, .basis file should have alpha data (if not Y will be all 255's)
-		cTFBC7_M6_RGB = 6,						// Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats.
-		cTFBC7_M5_RGBA = 7,						// Opaque+alpha, alpha channel will be opaque for opaque .basis files
+		cTFBC7_RGBA = 6,							// RGB or RGBA, mode 5 for ETC1S, modes (1,2,3,5,6,7) for UASTC

 		// PVRTC1 4bpp (mobile, PowerVR devices)
 		cTFPVRTC1_4_RGB = 8,						// Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified, nearly lowest quality of any texture format.
-		cTFPVRTC1_4_RGBA = 9,					// Opaque+alpha, most useful for simple opacity maps. If .basis file doens't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format.
+		cTFPVRTC1_4_RGBA = 9,					// Opaque+alpha, most useful for simple opacity maps. If .basis file doesn't have alpha cTFPVRTC1_4_RGB will be used instead. Lowest quality of any supported texture format.

 		// ASTC (mobile, Intel devices, hopefully all desktop GPU's one day)
 		cTFASTC_4x4_RGBA = 10,					// Opaque+alpha, ASTC 4x4, alpha channel will be opaque for opaque .basis files. Transcoder uses RGB/RGBA/L/LA modes, void extent, and up to two ([0,47] and [0,255]) endpoint precisions.
@ -69,10 +83,10 @@ namespace basist

 		cTFETC2_EAC_R11 = 20,					// R only (ETC2 EAC R11 unsigned)
 		cTFETC2_EAC_RG11 = 21,					// RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps
-		
+
 		// Uncompressed (raw pixel) formats
 		cTFRGBA32 = 13,							// 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte.
-		cTFRGB565 = 14,							// 166pp RGB image stored in raster (not block) order in memory, R at bit position 11
+		cTFRGB565 = 14,							// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11
 		cTFBGR565 = 15,							// 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0
 		cTFRGBA4444 = 16,							// 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0

@ -85,27 +99,62 @@ namespace basist
 		cTFBC3 = cTFBC3_RGBA,
 		cTFBC4 = cTFBC4_R,
 		cTFBC5 = cTFBC5_RG,
-		cTFBC7_M6_OPAQUE_ONLY = cTFBC7_M6_RGB,
-		cTFBC7_M5 = cTFBC7_M5_RGBA,
+
+		// Previously, the caller had some control over which BC7 mode the transcoder output. We've simplified this due to UASTC, which supports numerous modes.
+		cTFBC7_M6_RGB = cTFBC7_RGBA,			// Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats.
+		cTFBC7_M5_RGBA = cTFBC7_RGBA,			// Opaque+alpha, alpha channel will be opaque for opaque .basis files
+		cTFBC7_M6_OPAQUE_ONLY = cTFBC7_RGBA,
+		cTFBC7_M5 = cTFBC7_RGBA,
+		cTFBC7_ALT = 7,
+
 		cTFASTC_4x4 = cTFASTC_4x4_RGBA,
+
 		cTFATC_RGBA_INTERPOLATED_ALPHA = cTFATC_RGBA,
 	};

-	uint32_t basis_get_bytes_per_block(transcoder_texture_format fmt);
+	// For compressed texture formats, this returns the # of bytes per block. For uncompressed, it returns the # of bytes per pixel.
+	// NOTE: Previously, this function was called basis_get_bytes_per_block(), and it always returned 16*bytes_per_pixel for uncompressed formats which was confusing.
+	uint32_t basis_get_bytes_per_block_or_pixel(transcoder_texture_format fmt);
+
+	// Returns format's name in ASCII
 	const char* basis_get_format_name(transcoder_texture_format fmt);
+
+	// Returns block format name in ASCII
+	const char* basis_get_block_format_name(block_format fmt);
+
+	// Returns true if the format supports an alpha channel.
 	bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt);
+
+	// Returns the basisu::texture_format corresponding to the specified transcoder_texture_format.
 	basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt);
+
+	// Returns the texture type's name in ASCII.
 	const char* basis_get_texture_type_name(basis_texture_type tex_type);
-	
+
+	// Returns true if the transcoder texture type is an uncompressed (raw pixel) format.
 	bool basis_transcoder_format_is_uncompressed(transcoder_texture_format tex_type);
+
+	// Returns the # of bytes per pixel for uncompressed formats, or 0 for block texture formats.
 	uint32_t basis_get_uncompressed_bytes_per_pixel(transcoder_texture_format fmt);
-	
+
+	// Returns the block width for the specified texture format, which is currently either 4 or 8 for FXT1.
 	uint32_t basis_get_block_width(transcoder_texture_format tex_type);
+
+	// Returns the block height for the specified texture format, which is currently always 4.
 	uint32_t basis_get_block_height(transcoder_texture_format tex_type);

 	// Returns true if the specified format was enabled at compile time.
-	bool basis_is_format_supported(transcoder_texture_format tex_type);
-		
+	bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt = basis_tex_format::cETC1S);
+
+	// Validates that the output buffer is large enough to hold the entire transcoded texture.
+	// For uncompressed texture formats, most input parameters are in pixels, not blocks. Blocks are 4x4 pixels.
+	bool basis_validate_output_buffer_size(transcoder_texture_format target_format,
+		uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+		uint32_t orig_width, uint32_t orig_height,
+		uint32_t output_row_pitch_in_blocks_or_pixels,
+		uint32_t output_rows_in_pixels,
+		uint32_t total_slice_blocks);
+
 	class basisu_transcoder;

 	// This struct holds all state used during transcoding. For video, it needs to persist between image transcodes (it holds the previous frame).
@ -118,46 +167,161 @@ namespace basist
 			uint8_t m_pred_bits;
 		};

-		std::vector<block_preds> m_block_endpoint_preds[2];
-		
+		basisu::vector<block_preds> m_block_endpoint_preds[2];
+
 		enum { cMaxPrevFrameLevels = 16 };
-		std::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] 
+		basisu::vector<uint32_t> m_prev_frame_indices[2][cMaxPrevFrameLevels]; // [alpha_flag][level_index] 
+
+		void clear()	// Empties both endpoint-prediction vectors and every previous-frame index vector.
+		{
+			for (uint32_t i = 0; i < 2; i++)	// i walks the two-element first dimension (m_prev_frame_indices labels it alpha_flag)
+			{
+				m_block_endpoint_preds[i].clear();
+
+				for (uint32_t j = 0; j < cMaxPrevFrameLevels; j++)	// j is the level_index dimension
+					m_prev_frame_indices[i][j].clear();
+			}
+		}
 	};
-	
+
 	// Low-level helper class that does the actual transcoding.
-	class basisu_lowlevel_transcoder
+	class basisu_lowlevel_etc1s_transcoder
 	{
 		friend class basisu_transcoder;
-	
+
 	public:
-		basisu_lowlevel_transcoder(const basist::etc1_global_selector_codebook *pGlobal_sel_codebook);
+		basisu_lowlevel_etc1s_transcoder(const basist::etc1_global_selector_codebook* pGlobal_sel_codebook);
+
+		void set_global_codebooks(const basisu_lowlevel_etc1s_transcoder* pGlobal_codebook) { m_pGlobal_codebook = pGlobal_codebook; }
+		const basisu_lowlevel_etc1s_transcoder* get_global_codebooks() const { return m_pGlobal_codebook; }

 		bool decode_palettes(
-			uint32_t num_endpoints, const uint8_t *pEndpoints_data, uint32_t endpoints_data_size,
-			uint32_t num_selectors, const uint8_t *pSelectors_data, uint32_t selectors_data_size);
+			uint32_t num_endpoints, const uint8_t* pEndpoints_data, uint32_t endpoints_data_size,
+			uint32_t num_selectors, const uint8_t* pSelectors_data, uint32_t selectors_data_size);

-		bool decode_tables(const uint8_t *pTable_data, uint32_t table_data_size);
+		bool decode_tables(const uint8_t* pTable_data, uint32_t table_data_size);

-		bool transcode_slice(void *pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t *pImage_data, uint32_t image_data_size, block_format fmt, 
-			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header &header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
-			basisu_transcoder_state *pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0);
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0);
+
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0)
+		{	// Convenience overload: unpacks the values it needs from header/slice_desc and forwards to the explicit-parameter overload; returns that call's result.
+			return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks,
+				header.m_tex_type == cBASISTexTypeVideoFrames, (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0, slice_desc.m_level_index,	// -> is_video, is_alpha_slice, level_index
+				slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, pState,	// -> orig_width, orig_height, then pass-through arguments
+				astc_transcode_alpha,
+				pAlpha_blocks,
+				output_rows_in_pixels);
+		}
+
+		// Container independent transcoding
+		bool transcode_image(
+			transcoder_texture_format target_format,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			const uint8_t* pCompressed_data, uint32_t compressed_data_length,
+			uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index,
+			uint32_t rgb_offset, uint32_t rgb_length, uint32_t alpha_offset, uint32_t alpha_length,
+			uint32_t decode_flags = 0,
+			bool basis_file_has_alpha_slices = false,
+			bool is_video = false,
+			uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr,
+			uint32_t output_rows_in_pixels = 0);
+
+		void clear()	// Resets the local codebooks, the four Huffman decoding tables and the selector history size.
+		{	// Not touched here: m_pGlobal_codebook, m_pGlobal_sel_codebook and m_def_state.
+			m_local_endpoints.clear();
+			m_local_selectors.clear();
+			m_endpoint_pred_model.clear();
+			m_delta_endpoint_model.clear();
+			m_selector_model.clear();
+			m_selector_history_buf_rle_model.clear();
+			m_selector_history_buf_size = 0;
+		}
+
+		// Low-level methods
+		typedef basisu::vector<endpoint> endpoint_vec;
+		const endpoint_vec& get_endpoints() const { return m_local_endpoints; }
+
+		typedef basisu::vector<selector> selector_vec;
+		const selector_vec& get_selectors() const { return m_local_selectors; }
+
+		const etc1_global_selector_codebook* get_global_sel_codebook() const { return m_pGlobal_sel_codebook; }

 	private:
-		typedef std::vector<endpoint> endpoint_vec;
-		endpoint_vec m_endpoints;
+		const basisu_lowlevel_etc1s_transcoder* m_pGlobal_codebook;

-		typedef std::vector<selector> selector_vec;
-		selector_vec m_selectors;
+		endpoint_vec m_local_endpoints;
+		selector_vec m_local_selectors;

-		const etc1_global_selector_codebook *m_pGlobal_sel_codebook;
+		const etc1_global_selector_codebook* m_pGlobal_sel_codebook;

 		huffman_decoding_table m_endpoint_pred_model, m_delta_endpoint_model, m_selector_model, m_selector_history_buf_rle_model;

 		uint32_t m_selector_history_buf_size;
-		
+
 		basisu_transcoder_state m_def_state;
 	};

+	enum basisu_decode_flags	// Values are distinct powers of two; the transcode functions take them through a uint32_t decode_flags parameter.
+	{
+		// PVRTC1: decode non-pow2 ETC1S texture level to the next larger power of 2 (not implemented yet, but we're going to support it). Ignored if the slice's dimensions are already a power of 2.
+		cDecodeFlagsPVRTCDecodeToNextPow2 = 2,
+
+		// When decoding to an opaque texture format, if the basis file has alpha, decode the alpha slice instead of the color slice to the output texture format.
+		// This is primarily to allow decoding of textures with alpha to multiple ETC1 textures (one for color, another for alpha).
+		cDecodeFlagsTranscodeAlphaDataToOpaqueFormats = 4,
+
+		// Forbid usage of BC1 3 color blocks (we don't support BC1 punchthrough alpha yet).
+		// This flag is used internally when decoding to BC3.
+		cDecodeFlagsBC1ForbidThreeColorBlocks = 8,
+
+		// The output buffer contains alpha endpoint/selector indices. 
+		// Used internally when decoding formats like ASTC that require both color and alpha data to be available when transcoding to the output format.
+		cDecodeFlagsOutputHasAlphaIndices = 16,
+
+		cDecodeFlagsHighQuality = 32	// NOTE(review): undocumented here; presumably requests a higher-quality transcode path - confirm against basisu_transcoder.cpp
+	};
+
+	class basisu_lowlevel_uastc_transcoder	// Low-level UASTC slice/image transcoder; declares no data members of its own.
+	{
+		friend class basisu_transcoder;
+
+	public:
+		basisu_lowlevel_uastc_transcoder();
+
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0);	// Explicit-parameter overload, defined out of line.
+
+		bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt,
+			uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0)
+		{	// Convenience overload: takes has_alpha from the header's cBASISHeaderFlagHasAlphaSlices bit and the original size from slice_desc, then forwards.
+			return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt,
+				output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels,
+				pState, output_rows_in_pixels, channel0, channel1, decode_flags);
+		}
+
+		// Container independent transcoding: takes explicit block/pixel dimensions plus slice_offset/slice_length rather than a basis_file_header/basis_slice_desc.
+		bool transcode_image(
+			transcoder_texture_format target_format,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			const uint8_t* pCompressed_data, uint32_t compressed_data_length,
+			uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index,
+			uint32_t slice_offset, uint32_t slice_length,
+			uint32_t decode_flags = 0,
+			bool has_alpha = false,
+			bool is_video = false,
+			uint32_t output_row_pitch_in_blocks_or_pixels = 0,
+			basisu_transcoder_state* pState = nullptr,
+			uint32_t output_rows_in_pixels = 0,
+			int channel0 = -1, int channel1 = -1);
+	};
+
 	struct basisu_slice_info
 	{
 		uint32_t m_orig_width;
@ -175,19 +339,19 @@ namespace basist
 		uint32_t m_slice_index;	// the slice index in the .basis file
 		uint32_t m_image_index;	// the source image index originally provided to the encoder
 		uint32_t m_level_index;	// the mipmap level within this image
-		
+
 		uint32_t m_unpacked_slice_crc16;
-		
+
 		bool m_alpha_flag;		// true if the slice has alpha data
 		bool m_iframe_flag;		// true if the slice is an I-Frame
 	};

-	typedef std::vector<basisu_slice_info> basisu_slice_info_vec;
+	typedef basisu::vector<basisu_slice_info> basisu_slice_info_vec;

 	struct basisu_image_info
 	{
 		uint32_t m_image_index;
-		uint32_t m_total_levels;	
+		uint32_t m_total_levels;

 		uint32_t m_orig_width;
 		uint32_t m_orig_height;
@ -199,8 +363,8 @@ namespace basist
 		uint32_t m_num_blocks_y;
 		uint32_t m_total_blocks;

-		uint32_t m_first_slice_index;	
-								
+		uint32_t m_first_slice_index;
+
 		bool m_alpha_flag;		// true if the image has alpha data
 		bool m_iframe_flag;		// true if the image is an I-Frame
 	};
@ -220,8 +384,13 @@ namespace basist
 		uint32_t m_num_blocks_y;
 		uint32_t m_total_blocks;

-		uint32_t m_first_slice_index;	
-								
+		uint32_t m_first_slice_index;
+
+		uint32_t m_rgb_file_ofs;
+		uint32_t m_rgb_file_len;
+		uint32_t m_alpha_file_ofs;
+		uint32_t m_alpha_file_len;
+
 		bool m_alpha_flag;		// true if the image has alpha data
 		bool m_iframe_flag;		// true if the image is an I-Frame
 	};
@ -232,13 +401,19 @@ namespace basist
 		uint32_t m_total_header_size;

 		uint32_t m_total_selectors;
+		// will be 0 for UASTC or if the file uses global codebooks
+		uint32_t m_selector_codebook_ofs;
 		uint32_t m_selector_codebook_size;

 		uint32_t m_total_endpoints;
+		// will be 0 for UASTC or if the file uses global codebooks
+		uint32_t m_endpoint_codebook_ofs;
 		uint32_t m_endpoint_codebook_size;

+		uint32_t m_tables_ofs;
 		uint32_t m_tables_size;
-		uint32_t m_slices_size;	
+
+		uint32_t m_slices_size;

 		basis_texture_type m_tex_type;
 		uint32_t m_us_per_frame;
@ -247,14 +422,16 @@ namespace basist
 		basisu_slice_info_vec m_slice_info;

 		uint32_t m_total_images;	 // total # of images
-		std::vector<uint32_t> m_image_mipmap_levels; // the # of mipmap levels for each image
+		basisu::vector<uint32_t> m_image_mipmap_levels; // the # of mipmap levels for each image

 		uint32_t m_userdata0;
 		uint32_t m_userdata1;
-		
-		bool m_etc1s;					// always true for basis universal
+
+		basis_tex_format m_tex_format; // ETC1S, UASTC, etc.
+
 		bool m_y_flipped;				// true if the image was Y flipped
-		bool m_has_alpha_slices;	// true if the texture has alpha slices (even slices RGB, odd slices alpha)
+		bool m_etc1s;					// true if the file is ETC1S
+		bool m_has_alpha_slices;	// true if the texture has alpha slices (for ETC1S: even slices RGB, odd slices alpha)
 	};

 	// High-level transcoder class which accepts .basis file data and allows the caller to query information about the file and transcode image levels to various texture formats.
@ -265,81 +442,67 @@ namespace basist
 		basisu_transcoder& operator= (const basisu_transcoder&);

 	public:
-		basisu_transcoder(const etc1_global_selector_codebook *pGlobal_sel_codebook);
+		basisu_transcoder(const etc1_global_selector_codebook* pGlobal_sel_codebook);

 		// Validates the .basis file. This computes a crc16 over the entire file, so it's slow.
-		bool validate_file_checksums(const void *pData, uint32_t data_size, bool full_validation) const;
+		bool validate_file_checksums(const void* pData, uint32_t data_size, bool full_validation) const;

 		// Quick header validation - no crc16 checks.
-		bool validate_header(const void *pData, uint32_t data_size) const;
+		bool validate_header(const void* pData, uint32_t data_size) const;
+
+		basis_texture_type get_texture_type(const void* pData, uint32_t data_size) const;
+		bool get_userdata(const void* pData, uint32_t data_size, uint32_t& userdata0, uint32_t& userdata1) const;

-		basis_texture_type get_texture_type(const void *pData, uint32_t data_size) const;
-		bool get_userdata(const void *pData, uint32_t data_size, uint32_t &userdata0, uint32_t &userdata1) const;
-		
 		// Returns the total number of images in the basis file (always 1 or more).
 		// Note that the number of mipmap levels for each image may differ, and that images may have different resolutions.
-		uint32_t get_total_images(const void *pData, uint32_t data_size) const;
+		uint32_t get_total_images(const void* pData, uint32_t data_size) const;
+
+		basis_tex_format get_tex_format(const void* pData, uint32_t data_size) const;

 		// Returns the number of mipmap levels in an image.
-		uint32_t get_total_image_levels(const void *pData, uint32_t data_size, uint32_t image_index) const;
-		
+		uint32_t get_total_image_levels(const void* pData, uint32_t data_size, uint32_t image_index) const;
+
 		// Returns basic information about an image. Note that orig_width/orig_height may not be a multiple of 4.
-		bool get_image_level_desc(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t &orig_width, uint32_t &orig_height, uint32_t &total_blocks) const;
+		bool get_image_level_desc(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t& orig_width, uint32_t& orig_height, uint32_t& total_blocks) const;

 		// Returns information about the specified image.
-		bool get_image_info(const void *pData, uint32_t data_size, basisu_image_info &image_info, uint32_t image_index) const;
+		bool get_image_info(const void* pData, uint32_t data_size, basisu_image_info& image_info, uint32_t image_index) const;

 		// Returns information about the specified image's mipmap level.
-		bool get_image_level_info(const void *pData, uint32_t data_size, basisu_image_level_info &level_info, uint32_t image_index, uint32_t level_index) const;
-				
+		bool get_image_level_info(const void* pData, uint32_t data_size, basisu_image_level_info& level_info, uint32_t image_index, uint32_t level_index) const;
+
 		// Get a description of the basis file and low-level information about each slice.
-		bool get_file_info(const void *pData, uint32_t data_size, basisu_file_info &file_info) const;
-				
+		bool get_file_info(const void* pData, uint32_t data_size, basisu_file_info& file_info) const;
+
 		// start_transcoding() must be called before calling transcode_slice() or transcode_image_level().
-		// This decompresses the selector/endpoint codebooks, so ideally you would only call this once per .basis file (not each image/mipmap level).
-		bool start_transcoding(const void *pData, uint32_t data_size) const;
-		
+		// For ETC1S files, this call decompresses the selector/endpoint codebooks, so ideally you would only call this once per .basis file (not each image/mipmap level).
+		bool start_transcoding(const void* pData, uint32_t data_size);
+
+		bool stop_transcoding();
+
 		// Returns true if start_transcoding() has been called.
-		bool get_ready_to_transcode() const { return m_lowlevel_decoder.m_endpoints.size() > 0; }
+		bool get_ready_to_transcode() const { return m_ready_to_transcode; }

-		enum 
-		{
-			// PVRTC1: decode non-pow2 ETC1S texture level to the next larger power of 2 (not implemented yet, but we're going to support it). Ignored if the slice's dimensions are already a power of 2.
-			cDecodeFlagsPVRTCDecodeToNextPow2 = 2,	
-			
-			// When decoding to an opaque texture format, if the basis file has alpha, decode the alpha slice instead of the color slice to the output texture format.
-			// This is primarily to allow decoding of textures with alpha to multiple ETC1 textures (one for color, another for alpha).
-			cDecodeFlagsTranscodeAlphaDataToOpaqueFormats = 4,
-
-			// Forbid usage of BC1 3 color blocks (we don't support BC1 punchthrough alpha yet).
-			// This flag is used internally when decoding to BC3.
-			cDecodeFlagsBC1ForbidThreeColorBlocks = 8,
-
-			// The output buffer contains alpha endpoint/selector indices. 
-			// Used internally when decoding formats like ASTC that require both color and alpha data to be available when transcoding to the output format.
-			cDecodeFlagsOutputHasAlphaIndices = 16
-		};
-								
 		// transcode_image_level() decodes a single mipmap level from the .basis file to any of the supported output texture formats.
 		// It'll first find the slice(s) to transcode, then call transcode_slice() one or two times to decode both the color and alpha texture data (or RG texture data from two slices for BC5).
 		// If the .basis file doesn't have alpha slices, the output alpha blocks will be set to fully opaque (all 255's).
 		// Currently, to decode to PVRTC1 the basis texture's dimensions in pixels must be a power of 2, due to PVRTC1 format requirements. 
 		// output_blocks_buf_size_in_blocks_or_pixels should be at least the image level's total_blocks (num_blocks_x * num_blocks_y), or the total number of output pixels if fmt==cTFRGBA32.
 		// output_row_pitch_in_blocks_or_pixels: Number of blocks or pixels per row. If 0, the transcoder uses the slice's num_blocks_x or orig_width (NOT num_blocks_x * 4). Ignored for PVRTC1 (due to texture swizzling).
-		// output_rows_in_pixels: Ignored unless fmt is cRGBA32. The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4).
+		// output_rows_in_pixels: Ignored unless fmt is uncompressed (cRGBA32, etc.). The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4).
 		// Notes: 
 		// - basisu_transcoder_init() must have been called first to initialize the transcoder lookup tables before calling this function.
 		// - This method assumes the output texture buffer is readable. In some cases to handle alpha, the transcoder will write temporary data to the output texture in
 		// a first pass, which will be read in a second pass.
 		bool transcode_image_level(
-			const void *pData, uint32_t data_size, 
-			uint32_t image_index, uint32_t level_index, 
-			void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			const void* pData, uint32_t data_size,
+			uint32_t image_index, uint32_t level_index,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
 			transcoder_texture_format fmt,
-			uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state *pState = nullptr, uint32_t output_rows_in_pixels = 0) const;
+			uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0) const;

 		// Finds the basis slice corresponding to the specified image/level/alpha params, or -1 if the slice can't be found.
-		int find_slice(const void *pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const;
+		int find_slice(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const;

 		// transcode_slice() decodes a single slice from the .basis file. It's a low-level API - most likely you want to use transcode_image_level().
 		// This is a low-level API, and will be needed to be called multiple times to decode some texture formats (like BC3, BC5, or ETC2).
@ -350,21 +513,39 @@ namespace basist
 		// output_rows_in_pixels: Ignored unless fmt is cRGBA32. The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4).
 		// Notes:
 		// - basisu_transcoder_init() must have been called first to initialize the transcoder lookup tables before calling this function.
-		bool transcode_slice(const void *pData, uint32_t data_size, uint32_t slice_index, 
-			void *pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
-			block_format fmt, uint32_t output_block_stride_in_bytes, uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state * pState = nullptr, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0) const;
+		bool transcode_slice(const void* pData, uint32_t data_size, uint32_t slice_index,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			block_format fmt, uint32_t output_block_stride_in_bytes, uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, basisu_transcoder_state* pState = nullptr, void* pAlpha_blocks = nullptr,
+			uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1) const;
+
+		static void write_opaque_alpha_blocks(
+			uint32_t num_blocks_x, uint32_t num_blocks_y,
+			void* pOutput_blocks, block_format fmt,
+			uint32_t block_stride_in_bytes, uint32_t output_row_pitch_in_blocks_or_pixels);
+
+		void set_global_codebooks(const basisu_lowlevel_etc1s_transcoder* pGlobal_codebook) { m_lowlevel_etc1s_decoder.set_global_codebooks(pGlobal_codebook); }
+		const basisu_lowlevel_etc1s_transcoder* get_global_codebooks() const { return m_lowlevel_etc1s_decoder.get_global_codebooks(); }
+
+		const basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() const { return m_lowlevel_etc1s_decoder; }
+		basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() { return m_lowlevel_etc1s_decoder; }
+
+		const basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() const { return m_lowlevel_uastc_decoder; }
+		basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() { return m_lowlevel_uastc_decoder; }

 	private:
-		mutable basisu_lowlevel_transcoder m_lowlevel_decoder;
+		mutable basisu_lowlevel_etc1s_transcoder m_lowlevel_etc1s_decoder;
+		mutable basisu_lowlevel_uastc_transcoder m_lowlevel_uastc_decoder;
+
+		bool m_ready_to_transcode;

 		int find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const;
-		
+
 		bool validate_header_quick(const void* pData, uint32_t data_size) const;
 	};

-	// basisu_transcoder_init() must be called before a .basis file can be transcoded.
+	// basisu_transcoder_init() MUST be called before a .basis file can be transcoded.
 	void basisu_transcoder_init();
-
+		
 	enum debug_flags_t
 	{
 		cDebugFlagVisCRs = 1,
@ -374,4 +555,387 @@ namespace basist
 	uint32_t get_debug_flags();
 	void set_debug_flags(uint32_t f);

+	// ------------------------------------------------------------------------------------------------------ 
+	// Optional .KTX2 file format support
+	// KTX2 reading optionally requires miniz or Zstd decompressors for supercompressed UASTC files.
+	// ------------------------------------------------------------------------------------------------------ 
+#if BASISD_SUPPORT_KTX2
+#pragma pack(push)
+#pragma pack(1)
+	struct ktx2_header
+	{
+		uint8_t m_identifier[12];
+		basisu::packed_uint<4> m_vk_format;
+		basisu::packed_uint<4> m_type_size;
+		basisu::packed_uint<4> m_pixel_width;
+		basisu::packed_uint<4> m_pixel_height;
+		basisu::packed_uint<4> m_pixel_depth;
+		basisu::packed_uint<4> m_layer_count;
+		basisu::packed_uint<4> m_face_count;
+		basisu::packed_uint<4> m_level_count;
+		basisu::packed_uint<4> m_supercompression_scheme;
+		basisu::packed_uint<4> m_dfd_byte_offset;
+		basisu::packed_uint<4> m_dfd_byte_length;
+		basisu::packed_uint<4> m_kvd_byte_offset;
+		basisu::packed_uint<4> m_kvd_byte_length;
+		basisu::packed_uint<8> m_sgd_byte_offset;
+		basisu::packed_uint<8> m_sgd_byte_length;
+	};
+
+	struct ktx2_level_index
+	{
+		basisu::packed_uint<8> m_byte_offset;
+		basisu::packed_uint<8> m_byte_length;
+		basisu::packed_uint<8> m_uncompressed_byte_length;
+	};
+
+	struct ktx2_etc1s_global_data_header
+	{
+		basisu::packed_uint<2> m_endpoint_count;
+		basisu::packed_uint<2> m_selector_count;
+		basisu::packed_uint<4> m_endpoints_byte_length;
+		basisu::packed_uint<4> m_selectors_byte_length;
+		basisu::packed_uint<4> m_tables_byte_length;
+		basisu::packed_uint<4> m_extended_byte_length;
+	};
+
+	struct ktx2_etc1s_image_desc
+	{
+		basisu::packed_uint<4> m_image_flags;
+		basisu::packed_uint<4> m_rgb_slice_byte_offset;
+		basisu::packed_uint<4> m_rgb_slice_byte_length;
+		basisu::packed_uint<4> m_alpha_slice_byte_offset;
+		basisu::packed_uint<4> m_alpha_slice_byte_length;
+	};
+
+	struct ktx2_animdata
+	{
+		basisu::packed_uint<4> m_duration;
+		basisu::packed_uint<4> m_timescale;
+		basisu::packed_uint<4> m_loopcount;
+	};
+#pragma pack(pop)
+
+	const uint32_t KTX2_VK_FORMAT_UNDEFINED = 0;
+	const uint32_t KTX2_KDF_DF_MODEL_UASTC = 166;
+	const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163;
+	const uint32_t KTX2_IMAGE_IS_P_FRAME = 2;
+	const uint32_t KTX2_UASTC_BLOCK_SIZE = 16;
+	const uint32_t KTX2_MAX_SUPPORTED_LEVEL_COUNT = 16; // this is an implementation specific constraint and can be increased
+
+	// The DFD transfer function values supported by the KTX2 transcoder
+	const uint32_t KTX2_KHR_DF_TRANSFER_LINEAR = 1;
+	const uint32_t KTX2_KHR_DF_TRANSFER_SRGB = 2;
+
+	enum ktx2_supercompression
+	{
+		KTX2_SS_NONE = 0,
+		KTX2_SS_BASISLZ = 1,
+		KTX2_SS_ZSTANDARD = 2
+	};
+
+	extern const uint8_t g_ktx2_file_identifier[12];
+
+	enum ktx2_df_channel_id
+	{
+		KTX2_DF_CHANNEL_ETC1S_RGB = 0U,
+		KTX2_DF_CHANNEL_ETC1S_RRR = 3U,
+		KTX2_DF_CHANNEL_ETC1S_GGG = 4U,
+		KTX2_DF_CHANNEL_ETC1S_AAA = 15U,
+
+		KTX2_DF_CHANNEL_UASTC_DATA = 0U,
+		KTX2_DF_CHANNEL_UASTC_RGB = 0U,
+		KTX2_DF_CHANNEL_UASTC_RGBA = 3U,
+		KTX2_DF_CHANNEL_UASTC_RRR = 4U,
+		KTX2_DF_CHANNEL_UASTC_RRRG = 5U,
+		KTX2_DF_CHANNEL_UASTC_RG = 6U,
+	};
+
+	inline const char* ktx2_get_etc1s_df_channel_id_str(ktx2_df_channel_id id)
+	{
+		switch (id)
+		{
+		case KTX2_DF_CHANNEL_ETC1S_RGB: return "RGB";
+		case KTX2_DF_CHANNEL_ETC1S_RRR: return "RRR";
+		case KTX2_DF_CHANNEL_ETC1S_GGG: return "GGG";
+		case KTX2_DF_CHANNEL_ETC1S_AAA: return "AAA";
+		default: break;
+		}
+		return "?";
+	}
+
+	inline const char* ktx2_get_uastc_df_channel_id_str(ktx2_df_channel_id id)
+	{
+		switch (id)
+		{
+		case KTX2_DF_CHANNEL_UASTC_RGB: return "RGB";
+		case KTX2_DF_CHANNEL_UASTC_RGBA: return "RGBA";
+		case KTX2_DF_CHANNEL_UASTC_RRR: return "RRR";
+		case KTX2_DF_CHANNEL_UASTC_RRRG: return "RRRG";
+		case KTX2_DF_CHANNEL_UASTC_RG: return "RG";
+		default: break;
+		}
+		return "?";
+	}
+
+	enum ktx2_df_color_primaries
+	{
+		KTX2_DF_PRIMARIES_UNSPECIFIED = 0,
+		KTX2_DF_PRIMARIES_BT709 = 1,
+		KTX2_DF_PRIMARIES_SRGB = 1,
+		KTX2_DF_PRIMARIES_BT601_EBU = 2,
+		KTX2_DF_PRIMARIES_BT601_SMPTE = 3,
+		KTX2_DF_PRIMARIES_BT2020 = 4,
+		KTX2_DF_PRIMARIES_CIEXYZ = 5,
+		KTX2_DF_PRIMARIES_ACES = 6,
+		KTX2_DF_PRIMARIES_ACESCC = 7,
+		KTX2_DF_PRIMARIES_NTSC1953 = 8,
+		KTX2_DF_PRIMARIES_PAL525 = 9,
+		KTX2_DF_PRIMARIES_DISPLAYP3 = 10,
+		KTX2_DF_PRIMARIES_ADOBERGB = 11
+	};
+
+	inline const char* ktx2_get_df_color_primaries_str(ktx2_df_color_primaries p)
+	{
+		switch (p)
+		{
+		case KTX2_DF_PRIMARIES_UNSPECIFIED: return "UNSPECIFIED";
+		case KTX2_DF_PRIMARIES_BT709: return "BT709";
+		case KTX2_DF_PRIMARIES_BT601_EBU: return "EBU"; 
+		case KTX2_DF_PRIMARIES_BT601_SMPTE: return "SMPTE";
+		case KTX2_DF_PRIMARIES_BT2020: return "BT2020";
+		case KTX2_DF_PRIMARIES_CIEXYZ: return "CIEXYZ";
+		case KTX2_DF_PRIMARIES_ACES: return "ACES";
+		case KTX2_DF_PRIMARIES_ACESCC: return "ACESCC"; 
+		case KTX2_DF_PRIMARIES_NTSC1953: return "NTSC1953";
+		case KTX2_DF_PRIMARIES_PAL525: return "PAL525";
+		case KTX2_DF_PRIMARIES_DISPLAYP3: return "DISPLAYP3";
+		case KTX2_DF_PRIMARIES_ADOBERGB: return "ADOBERGB";
+		default: break;
+		}
+		return "?";
+	}	
+
+	// Information about a single 2D texture "image" in a KTX2 file.
+	struct ktx2_image_level_info
+	{
+		// The mipmap level index (0=largest), texture array layer index, and cubemap face index of the image.
+		uint32_t m_level_index;
+		uint32_t m_layer_index;
+		uint32_t m_face_index;
+
+		// The image's actual (or the original source image's) width/height in pixels, which may not be divisible by 4 pixels.
+		uint32_t m_orig_width;
+		uint32_t m_orig_height;
+
+		// The image's physical width/height, which will always be divisible by 4 pixels.
+		uint32_t m_width;
+		uint32_t m_height;
+
+		// The texture's dimensions in 4x4 texel blocks.
+		uint32_t m_num_blocks_x;
+		uint32_t m_num_blocks_y;
+
+		// The total number of blocks
+		uint32_t m_total_blocks;
+
+		// true if the image has alpha data
+		bool m_alpha_flag;
+
+		// true if the image is an I-Frame. Currently, for ETC1S textures, the first frame will always be an I-Frame, and subsequent frames will always be P-Frames.
+		bool m_iframe_flag;
+	};
+		
+	// Thread-specific ETC1S/supercompressed UASTC transcoder state. (If you're not doing multithreaded transcoding you can ignore this.)
+	struct ktx2_transcoder_state
+	{
+		// Same state type that basisu_transcoder::transcode_image_level() accepts through its pState parameter.
+		basist::basisu_transcoder_state m_transcoder_state;
+		// NOTE(review): presumably holds the decompressed bytes of a single mipmap level (see ktx2_transcoder::decompress_level_data()) - confirm against the .cpp.
+		basisu::uint8_vec m_level_uncomp_data;
+		// NOTE(review): presumably the index of the level held in m_level_uncomp_data - confirm against the .cpp.
+		// Defaults to -1, the same value clear() assigns, so a freshly constructed state never carries an indeterminate index.
+		int m_uncomp_data_level_index = -1;
+
+		// Resets the nested transcoder state, empties the level buffer and sets the level index back to -1.
+		void clear()
+		{
+			m_transcoder_state.clear();
+			m_level_uncomp_data.clear();
+			m_uncomp_data_level_index = -1;
+		}
+	};
+
+	// This class is quite similar to basisu_transcoder. It treats KTX2 files as a simple container for ETC1S/UASTC texture data.
+	// It does not support 1D or 3D textures.
+	// It only supports 2D and cubemap textures, with or without mipmaps, texture arrays of 2D/cubemap textures, and texture video files. 
+	// It only supports raw non-supercompressed UASTC, ETC1S, UASTC+Zstd, or UASTC+zlib compressed files.
+	// DFD (Data Format Descriptor) parsing is purposely as simple as possible. 
+	// If you need to know how to interpret the texture channels you'll need to parse the DFD yourself after calling get_dfd().
+	class ktx2_transcoder
+	{
+	public:
+		ktx2_transcoder(basist::etc1_global_selector_codebook* pGlobal_sel_codebook);
+
+		// Frees all allocations, resets object.
+		void clear();
+
+		// init() parses the KTX2 header, level index array, DFD, and key values, but nothing else.
+		// Importantly, it does not parse or decompress the ETC1S global supercompressed data, so some things (like which frames are I/P-Frames) won't be available until start_transcoding() is called.
+		// This method holds a pointer to the file data until clear() is called.
+		bool init(const void* pData, uint32_t data_size);
+
+		// Returns the data/size passed to init().
+		const uint8_t* get_data() const { return m_pData; }
+		uint32_t get_data_size() const { return m_data_size; }
+
+		// Returns the KTX2 header. Valid after init().
+		const ktx2_header& get_header() const { return m_header; }
+
+		// Returns the KTX2 level index array. There will be one entry for each mipmap level. Valid after init().
+		const basisu::vector<ktx2_level_index>& get_level_index() const { return m_levels; }
+
+		// Returns the texture's width in texels. Always non-zero, might not be divisible by 4. Valid after init().
+		uint32_t get_width() const { return m_header.m_pixel_width; }
+
+		// Returns the texture's height in texels. Always non-zero, might not be divisible by 4. Valid after init().
+		uint32_t get_height() const { return m_header.m_pixel_height; }
+
+		// Returns the texture's number of mipmap levels. Always returns 1 or higher. Valid after init().
+		uint32_t get_levels() const { return m_header.m_level_count; }
+
+		// Returns the number of faces. Returns 1 for 2D textures or 6 for cubemaps. Valid after init().
+		uint32_t get_faces() const { return m_header.m_face_count; }
+
+		// Returns 0 or the number of layers in the texture array or texture video. Valid after init().
+		uint32_t get_layers() const { return m_header.m_layer_count; }
+
+		// Returns cETC1S or cUASTC4x4. Valid after init().
+		basist::basis_tex_format get_format() const { return m_format; } 
+
+		bool is_etc1s() const { return get_format() == basist::basis_tex_format::cETC1S; }
+
+		bool is_uastc() const { return get_format() == basist::basis_tex_format::cUASTC4x4; }
+
+		// Returns true if the ETC1S file has two planes (typically RGBA, or RRRG), or true if the UASTC file has alpha data. Valid after init().
+		uint32_t get_has_alpha() const { return m_has_alpha; }
+
+		// Returns the entire Data Format Descriptor (DFD) from the KTX2 file. Valid after init().
+		// See https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.html#_the_khronos_data_format_descriptor_overview
+		const basisu::uint8_vec& get_dfd() const { return m_dfd; }
+
+		// Some basic DFD accessors. Valid after init().
+		uint32_t get_dfd_color_model() const { return m_dfd_color_model; }
+
+		// Returns the DFD color primary.
+		// We do not validate the color primaries, so the returned value may not be in the ktx2_df_color_primaries enum.
+		ktx2_df_color_primaries get_dfd_color_primaries() const { return m_dfd_color_prims; }
+		
+		// Returns KTX2_KHR_DF_TRANSFER_LINEAR or KTX2_KHR_DF_TRANSFER_SRGB.
+		uint32_t get_dfd_transfer_func() const { return m_dfd_transfer_func; }
+
+		uint32_t get_dfd_flags() const { return m_dfd_flags; }
+
+		// Returns 1 (ETC1S/UASTC) or 2 (ETC1S with an internal alpha channel).
+		uint32_t get_dfd_total_samples() const { return m_dfd_samples;	}
+		
+		// Returns the channel mapping for each DFD "sample". UASTC always has 1 sample, ETC1S can have one or two. 
+		// Note the returned value SHOULD be one of the ktx2_df_channel_id enums, but we don't validate that. 
+		// It's up to the caller to decide what to do if the value isn't in the enum.
+		ktx2_df_channel_id get_dfd_channel_id0() const { return m_dfd_chan0; }
+		ktx2_df_channel_id get_dfd_channel_id1() const { return m_dfd_chan1; }
+
+		// Key value field data.
+		struct key_value
+		{
+			// The key field is UTF8 and always zero terminated.
+			basisu::uint8_vec m_key;
+
+			// The value may be empty. It consists of raw bytes which may or may not be zero terminated.
+			basisu::uint8_vec m_value;
+
+			bool operator< (const key_value& rhs) const { return strcmp((const char*)m_key.data(), (const char *)rhs.m_key.data()) < 0; }
+		};
+		typedef basisu::vector<key_value> key_value_vec;
+
+		// Returns the array of key-value entries. This may be empty. Valid after init().
+		// The order of the key-value fields in this array exactly matches the order in which they were stored in the file. The keys are supposed to be sorted by their Unicode code points.
+		const key_value_vec& get_key_values() const { return m_key_values; }
+
+		const basisu::uint8_vec *find_key(const std::string& key_name) const;
+
+		// Low-level ETC1S specific accessors
+
+		// Returns the ETC1S global supercompression data header, which is only valid after start_transcoding() is called.
+		const ktx2_etc1s_global_data_header& get_etc1s_header() const { return m_etc1s_header; }
+
+		// Returns the array of ETC1S image descriptors, which is only valid after start_transcoding() is called.
+		const basisu::vector<ktx2_etc1s_image_desc>& get_etc1s_image_descs() const { return m_etc1s_image_descs; }
+
+		// start_transcoding() must have been called first.
+		uint32_t get_etc1s_image_descs_image_flags(uint32_t level_index, uint32_t layer_index, uint32_t face_index) const;
+
+		// is_video() is only valid after start_transcoding() is called.
+		// For ETC1S data, if this returns true you must currently transcode the file from first to last frame, in order, without skipping any frames.
+		bool is_video() const { return m_is_video; }
+				
+		// start_transcoding() MUST be called before calling transcode_image().
+		// This method decompresses the ETC1S global endpoint/selector codebooks, which is not free, so try to avoid calling it excessively.
+		bool start_transcoding();
+								
+		// get_image_level_info() may be called after init(), but the m_iframe_flag's won't be valid until start_transcoding() is called.
+		// You can call this method before calling transcode_image_level() to retrieve basic information about the mipmap level's dimensions, etc.
+		bool get_image_level_info(ktx2_image_level_info& level_info, uint32_t level_index, uint32_t layer_index, uint32_t face_index) const;
+
+		// transcode_image_level() transcodes a single 2D texture or cubemap face from the KTX2 file.
+		// Internally it uses the same low-level transcode API's as basisu_transcoder::transcode_image_level().
+		// If the file is UASTC and is supercompressed with Zstandard, and the file is a texture array or cubemap, it's highly recommended that each mipmap level is 
+		// completely transcoded before switching to another level. Every time the mipmap level is changed all supercompressed level data must be decompressed using Zstandard as a single unit.
+		// Currently ETC1S videos must always be transcoded from first to last frame (or KTX2 "layer"), in order, with no skipping of frames.
+		// By default this method is not thread safe unless you specify a pointer to a user allocated thread-specific transcoder_state struct.
+		bool transcode_image_level(
+			uint32_t level_index, uint32_t layer_index, uint32_t face_index,
+			void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels,
+			basist::transcoder_texture_format fmt,
+			uint32_t decode_flags = 0, uint32_t output_row_pitch_in_blocks_or_pixels = 0, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1,
+			ktx2_transcoder_state *pState = nullptr);
+				
+	private:
+		const uint8_t* m_pData;
+		uint32_t m_data_size;
+
+		ktx2_header m_header;
+		basisu::vector<ktx2_level_index> m_levels;
+		basisu::uint8_vec m_dfd;
+		key_value_vec m_key_values;
+		
+		ktx2_etc1s_global_data_header m_etc1s_header;
+		basisu::vector<ktx2_etc1s_image_desc> m_etc1s_image_descs;
+
+		basist::basis_tex_format m_format;
+					
+		uint32_t m_dfd_color_model;
+		ktx2_df_color_primaries m_dfd_color_prims;
+		uint32_t m_dfd_transfer_func;
+		uint32_t m_dfd_flags;
+		uint32_t m_dfd_samples;
+		ktx2_df_channel_id m_dfd_chan0, m_dfd_chan1;
+								
+		basist::basisu_lowlevel_etc1s_transcoder m_etc1s_transcoder;
+		basist::basisu_lowlevel_uastc_transcoder m_uastc_transcoder;
+				
+		ktx2_transcoder_state m_def_transcoder_state;
+
+		bool m_has_alpha;
+		bool m_is_video;
+
+		bool decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data);
+		bool decompress_etc1s_global_data();
+		bool read_key_values();
+	};
+
+#endif // BASISD_SUPPORT_KTX2
+
+	// Returns true if the transcoder was compiled with KTX2 support.
+	bool basisu_transcoder_supports_ktx2();
+
+	// Returns true if the transcoder was compiled with Zstandard support.
+	bool basisu_transcoder_supports_ktx2_zstd();
+
 } // namespace basist
+
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h
@ -1,5 +1,5 @@
 // basisu_transcoder_internal.h - Universal texture format transcoder library.
-// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
 //
 // Important: If compiling with gcc, be sure strict aliasing is disabled: -fno-strict-aliasing
 //
@ -20,8 +20,8 @@
 #pragma warning (disable: 4127) //  conditional expression is constant
 #endif

-#define BASISD_LIB_VERSION 107
-#define BASISD_VERSION_STRING "01.11"
+#define BASISD_LIB_VERSION 115
+#define BASISD_VERSION_STRING "01.15"

 #ifdef _DEBUG
 #define BASISD_BUILD_DEBUG
@ -45,38 +45,44 @@ namespace basist
 	enum class block_format
 	{
 		cETC1,								// ETC1S RGB 
+		cETC2_RGBA,							// full ETC2 EAC RGBA8 block
 		cBC1,									// DXT1 RGB 
+		cBC3,									// BC4 block followed by a four color BC1 block
 		cBC4,									// DXT5A (alpha block only)
+		cBC5,									// two BC4 blocks
 		cPVRTC1_4_RGB,						// opaque-only PVRTC1 4bpp
 		cPVRTC1_4_RGBA,					// PVRTC1 4bpp RGBA
-		cBC7_M6_OPAQUE_ONLY,				// RGB BC7 mode 6
+		cBC7,									// Full BC7 block, any mode
 		cBC7_M5_COLOR,						// RGB BC7 mode 5 color (writes an opaque mode 5 block)
 		cBC7_M5_ALPHA,						// alpha portion of BC7 mode 5 (cBC7_M5_COLOR output data must have been written to the output buffer first to set the mode/rot fields etc.)
 		cETC2_EAC_A8,						// alpha block of ETC2 EAC (first 8 bytes of the 16-byte ETC2 EAC RGBA format)
 		cASTC_4x4,							// ASTC 4x4 (either color-only or color+alpha). Note that the transcoder always currently assumes sRGB is not enabled when outputting ASTC 
 												// data. If you use a sRGB ASTC format you'll get ~1 LSB of additional error, because of the different way ASTC decoders scale 8-bit endpoints to 16-bits during unpacking.
+		
 		cATC_RGB,
 		cATC_RGBA_INTERPOLATED_ALPHA,
 		cFXT1_RGB,							// Opaque-only, has oddball 8x4 pixel block size
+
+		cPVRTC2_4_RGB,
+		cPVRTC2_4_RGBA,
+
+		cETC2_EAC_R11,
+		cETC2_EAC_RG11,
 												
 		cIndices,							// Used internally: Write 16-bit endpoint and selector indices directly to output (output block must be at least 32-bits)

 		cRGB32,								// Writes RGB components to 32bpp output pixels
 		cRGBA32,								// Writes RGB255 components to 32bpp output pixels
 		cA32,									// Writes alpha component to 32bpp output pixels
-
+				
 		cRGB565,
 		cBGR565,
 		
 		cRGBA4444_COLOR,
 		cRGBA4444_ALPHA,
 		cRGBA4444_COLOR_OPAQUE,
-
-		cPVRTC2_4_RGB,
-		cPVRTC2_4_RGBA,
-
-		cETC2_EAC_R11,
-		
+		cRGBA4444,
+						
 		cTotalBlockFormats
 	};

@ -116,7 +122,7 @@ namespace basist
 			basisu::clear_vector(m_tree);
 		}

-		bool init(uint32_t total_syms, const uint8_t *pCode_sizes)
+		bool init(uint32_t total_syms, const uint8_t *pCode_sizes, uint32_t fast_lookup_bits = basisu::cHuffmanFastLookupBits)
 		{
 			if (!total_syms)
 			{
@ -127,8 +133,10 @@ namespace basist
 			m_code_sizes.resize(total_syms);
 			memcpy(&m_code_sizes[0], pCode_sizes, total_syms);

+			const uint32_t huffman_fast_lookup_size = 1 << fast_lookup_bits;
+
 			m_lookup.resize(0);
-			m_lookup.resize(basisu::cHuffmanFastLookupSize);
+			m_lookup.resize(huffman_fast_lookup_size);

 			m_tree.resize(0);
 			m_tree.resize(total_syms * 2);
@ -166,10 +174,10 @@ namespace basist
 				for (l = code_size; l > 0; l--, cur_code >>= 1)
 					rev_code = (rev_code << 1) | (cur_code & 1);

-				if (code_size <= basisu::cHuffmanFastLookupBits)
+				if (code_size <= fast_lookup_bits)
 				{
 					uint32_t k = (code_size << 16) | sym_index;
-					while (rev_code < basisu::cHuffmanFastLookupSize)
+					while (rev_code < huffman_fast_lookup_size)
 					{
 						if (m_lookup[rev_code] != 0)
 						{
@ -184,9 +192,9 @@ namespace basist
 				}

 				int tree_cur;
-				if (0 == (tree_cur = m_lookup[rev_code & (basisu::cHuffmanFastLookupSize - 1)]))
+				if (0 == (tree_cur = m_lookup[rev_code & (huffman_fast_lookup_size - 1)]))
 				{
-					const uint32_t idx = rev_code & (basisu::cHuffmanFastLookupSize - 1);
+					const uint32_t idx = rev_code & (huffman_fast_lookup_size - 1);
 					if (m_lookup[idx] != 0)
 					{
 						// Supplied codesizes can't create a valid prefix code.
@ -204,9 +212,9 @@ namespace basist
 					return false;
 				}

-				rev_code >>= (basisu::cHuffmanFastLookupBits - 1);
+				rev_code >>= (fast_lookup_bits - 1);

-				for (int j = code_size; j > (basisu::cHuffmanFastLookupBits + 1); j--)
+				for (int j = code_size; j > ((int)fast_lookup_bits + 1); j--)
 				{
 					tree_cur -= ((rev_code >>= 1) & 1);

@ -254,6 +262,8 @@ namespace basist
 		}

 		const basisu::uint8_vec &get_code_sizes() const { return m_code_sizes; }
+		// Returns the fast-lookup table that init() sizes and fills. Handed out by const reference, like get_code_sizes(), so a call does not copy the whole vector.
+		const basisu::int_vec &get_lookup() const { return m_lookup; }
+		// Returns the overflow tree that init() sizes and fills for codes longer than the fast-lookup width. Handed out by const reference, like get_code_sizes(), so a call does not copy the whole vector.
+		const basisu::int16_vec &get_tree() const { return m_tree; }

 		bool is_valid() const { return m_code_sizes.size() > 0; }

@ -430,9 +440,11 @@ namespace basist
 			return v;
 		}

-		inline uint32_t decode_huffman(const huffman_decoding_table &ct)
+		inline uint32_t decode_huffman(const huffman_decoding_table &ct, int fast_lookup_bits = basisu::cHuffmanFastLookupBits)
 		{
 			assert(ct.m_code_sizes.size());
+
+			const uint32_t huffman_fast_lookup_size = 1 << fast_lookup_bits;
 						
 			while (m_bit_buf_size < 16)
 			{
@ -448,14 +460,14 @@ namespace basist
 			int code_len;

 			int sym;
-			if ((sym = ct.m_lookup[m_bit_buf & (basisu::cHuffmanFastLookupSize - 1)]) >= 0)
+			if ((sym = ct.m_lookup[m_bit_buf & (huffman_fast_lookup_size - 1)]) >= 0)
 			{
 				code_len = sym >> 16;
 				sym &= 0xFFFF;
 			}
 			else
 			{
-				code_len = basisu::cHuffmanFastLookupBits;
+				code_len = fast_lookup_bits;
 				do
 				{
 					sym = ct.m_tree[~sym + ((m_bit_buf >> code_len++) & 1)]; // ~sym = -sym - 1
@ -635,6 +647,11 @@ namespace basist
 		return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i);
 	}

+	enum eNoClamp
+	{
+		cNoClamp = 0
+	};
+
 	struct color32
 	{
 		union
@ -655,21 +672,33 @@ namespace basist
 		color32() { }

 		color32(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); }
+		color32(eNoClamp unused, uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { (void)unused; set_noclamp_rgba(vr, vg, vb, va); }

 		void set(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { c[0] = static_cast<uint8_t>(vr); c[1] = static_cast<uint8_t>(vg); c[2] = static_cast<uint8_t>(vb); c[3] = static_cast<uint8_t>(va); }

+		void set_noclamp_rgb(uint32_t vr, uint32_t vg, uint32_t vb) { c[0] = static_cast<uint8_t>(vr); c[1] = static_cast<uint8_t>(vg); c[2] = static_cast<uint8_t>(vb); }
+		void set_noclamp_rgba(uint32_t vr, uint32_t vg, uint32_t vb, uint32_t va) { set(vr, vg, vb, va); }
+
 		void set_clamped(int vr, int vg, int vb, int va) { c[0] = clamp255(vr); c[1] = clamp255(vg);	c[2] = clamp255(vb); c[3] = clamp255(va); }

 		uint8_t operator[] (uint32_t idx) const { assert(idx < 4); return c[idx]; }
 		uint8_t &operator[] (uint32_t idx) { assert(idx < 4); return c[idx]; }

 		bool operator== (const color32&rhs) const { return m == rhs.m; }
+
+		static color32 comp_min(const color32& a, const color32& b) { return color32(cNoClamp, basisu::minimum(a[0], b[0]), basisu::minimum(a[1], b[1]), basisu::minimum(a[2], b[2]), basisu::minimum(a[3], b[3])); }
+		static color32 comp_max(const color32& a, const color32& b) { return color32(cNoClamp, basisu::maximum(a[0], b[0]), basisu::maximum(a[1], b[1]), basisu::maximum(a[2], b[2]), basisu::maximum(a[3], b[3])); }
 	};

 	struct endpoint
 	{
 		color32 m_color5;
 		uint8_t m_inten5;
+		bool operator== (const endpoint& rhs) const
+		{
+			return (m_color5.r == rhs.m_color5.r) && (m_color5.g == rhs.m_color5.g) && (m_color5.b == rhs.m_color5.b) && (m_inten5 == rhs.m_inten5);
+		}
+		bool operator!= (const endpoint& rhs) const { return !(*this == rhs); }
 	};

 	struct selector
@ -682,6 +711,17 @@ namespace basist

 		uint8_t m_lo_selector, m_hi_selector;
 		uint8_t m_num_unique_selectors;
+		bool operator== (const selector& rhs) const
+		{
+			return (m_selectors[0] == rhs.m_selectors[0]) &&
+				(m_selectors[1] == rhs.m_selectors[1]) &&
+				(m_selectors[2] == rhs.m_selectors[2]) &&
+				(m_selectors[3] == rhs.m_selectors[3]);
+		}
+		bool operator!= (const selector& rhs) const
+		{
+			return !(*this == rhs);
+		}

 		void init_flags()
 		{
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc.inc
@ -478,4 +478,4 @@
 {31,1,10801},{47,1,12162},{14,1,6117},{14,1,6117},{8,1,50},{20,1,7322},{0,1,1241},{21,1,914},{21,1,914},{21,1,914},{7,1,274},{35,5,1513},{9,1,585},{9,1,585},{26,1,0},{27,1,1513},{26,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{0,1,0},{1,1,0},{0,1,0},{47,0,9250},{47,0,9250},{47,0,9250},{47,0,9250},{12,1,3690},
 {12,1,3690},{12,1,3690},{8,1,50},{0,1,1241},{0,1,1241},{45,1,65535},{14,1,33274},{42,1,19608},{42,1,13375},{47,1,62627},{42,1,22211},{10,1,6045},{24,1,138},{36,1,39015},{0,1,1732},{35,1,1048},{5,1,766},{5,1,666},{37,1,212},{3,3,1473},{7,1,675},{23,1,410},{14,1,1},{3,3,1473},{14,1,1},{13,1,14121},{13,1,14121},{13,1,14121},{45,1,10571},{45,1,11434},{30,1,6081},{30,1,6081},
 {40,1,137},{36,1,6926},{2,1,1445},{5,1,666},{5,1,666},{5,1,666},{37,1,212},{35,3,1105},{23,1,410},{23,1,410},{14,1,1},{25,1,1105},{14,1,1},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{1,1,0},{0,1,0},{1,1,0},{0,1,0},{15,0,9256},{15,0,9256},{15,0,9256},{15,0,9256},{14,1,3985},{14,1,3985},{14,1,3985},{40,1,137},{2,1,1445},
-{2,1,1445},
+{2,1,1445},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc_0_255.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_astc_0_255.inc
@ -478,4 +478,4 @@
 {137,255,10742},{135,255,12066},{107,255,6089},{107,255,6089},{67,255,45},{37,255,7233},{1,255,1184},{218,255,900},{218,255,900},{218,255,900},{204,255,272},{255,167,1513},{189,255,562},{189,255,562},{86,255,0},{253,213,1513},{86,255,0},{255,252,0},{255,254,0},{254,255,0},{252,255,0},{255,252,0},{255,254,0},{252,255,0},{0,255,0},{255,254,0},{0,255,0},{132,0,9248},{132,0,9248},{132,0,9248},{132,0,9248},{98,255,3656},
 {98,255,3656},{98,255,3656},{67,255,45},{1,255,1184},{1,255,1184},{138,255,65535},{107,255,33448},{95,255,19729},{89,255,13446},{135,255,62717},{95,255,22307},{79,255,6021},{73,255,105},{40,255,38959},{0,254,1627},{230,255,996},{224,255,756},{221,255,653},{213,255,194},{255,204,1473},{207,255,675},{198,255,405},{110,255,0},{255,230,1473},{110,255,0},{162,255,14060},{162,255,14060},{162,255,14060},{146,255,10545},{141,255,11378},{116,255,6077},{116,255,6077},
 {76,255,137},{40,255,6873},{7,255,1412},{221,255,653},{221,255,653},{221,255,653},{213,255,194},{255,180,1105},{198,255,405},{198,255,405},{110,255,0},{255,218,1105},{110,255,0},{255,252,0},{255,254,0},{254,255,0},{252,255,0},{255,252,0},{255,254,0},{252,255,0},{0,255,0},{255,254,0},{0,255,0},{140,0,9248},{140,0,9248},{140,0,9248},{140,0,9248},{107,255,3929},{107,255,3929},{107,255,3929},{76,255,137},{7,255,1412},
-{7,255,1412},
+{7,255,1412},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_55.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_55.inc
@ -478,4 +478,4 @@
 {17,31,11312},{16,31,11037},{13,31,6429},{13,31,6429},{8,31,260},{6,31,10457},{0,31,2642},{26,31,872},{26,31,872},{26,31,872},{25,31,397},{31,22,1513},{23,31,794},{23,31,794},{13,31,1},{29,27,1513},{13,31,1},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{16,0,9248},{16,0,9248},{16,0,9248},{16,0,9248},{12,31,3074},
 {12,31,3074},{12,31,3074},{8,31,260},{0,31,2642},{0,31,2642},{17,31,58848},{15,31,39619},{13,31,24975},{12,31,19007},{16,31,54474},{13,31,27057},{10,31,8569},{9,31,461},{8,31,51302},{0,31,5046},{28,31,979},{27,31,806},{27,31,637},{26,31,292},{31,26,1473},{26,31,953},{24,31,605},{16,31,0},{29,29,1473},{16,31,0},{19,31,13604},{19,31,13604},{19,31,13604},{18,31,11057},{16,31,10429},{14,31,6339},{14,31,6339},
 {10,31,424},{8,31,9713},{1,31,2900},{27,31,637},{27,31,637},{27,31,637},{26,31,292},{30,25,1105},{24,31,605},{24,31,605},{16,31,0},{30,27,1105},{16,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{17,0,9248},{17,0,9248},{17,0,9248},{17,0,9248},{12,31,3330},{12,31,3330},{12,31,3330},{10,31,424},{1,31,2900},
-{1,31,2900},
+{1,31,2900},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_56.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_atc_56.inc
@ -478,4 +478,4 @@
 {17,63,11312},{16,63,11037},{13,63,6429},{13,63,6429},{8,63,260},{6,63,10457},{0,63,2642},{26,63,872},{26,63,872},{26,63,872},{25,63,397},{31,45,1513},{23,63,794},{23,63,794},{13,63,1},{31,52,1513},{13,63,1},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{0,63,0},{31,63,0},{0,63,0},{16,0,9248},{16,0,9248},{16,0,9248},{16,0,9248},{12,63,3074},
 {12,63,3074},{12,63,3074},{8,63,260},{0,63,2642},{0,63,2642},{17,63,58848},{15,63,39619},{13,63,24975},{12,63,19007},{16,63,54474},{13,63,27057},{10,63,8569},{9,63,461},{8,63,51302},{0,63,5046},{28,63,979},{27,63,806},{27,63,637},{26,63,292},{30,56,1473},{26,63,953},{24,63,605},{16,63,0},{30,58,1473},{16,63,0},{19,63,13604},{19,63,13604},{19,63,13604},{18,63,11057},{16,63,10429},{14,63,6339},{14,63,6339},
 {10,63,424},{8,63,9713},{1,63,2900},{27,63,637},{27,63,637},{27,63,637},{26,63,292},{31,48,1105},{24,63,605},{24,63,605},{16,63,0},{31,54,1105},{16,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{31,63,0},{0,63,0},{31,63,0},{0,63,0},{17,0,9248},{17,0,9248},{17,0,9248},{17,0,9248},{12,63,3330},{12,63,3330},{12,63,3330},{10,63,424},{1,63,2900},
-{1,63,2900},
+{1,63,2900},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_alpha.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_alpha.inc
@ -46,4 +46,4 @@
 {76,0,3},{255,1,27},{255,7,24},{255,1,27},{179,39,8},{255,22,16},{85,0,3},{255,2,27},{255,22,24},{255,7,27},{187,47,8},{255,47,16},{93,0,3},{255,4,27},{251,100,28},{182,0,7},{195,55,8},{255,71,16},{101,0,3},{255,4,27},{253,108,28},{191,0,7},{203,63,8},{255,95,16},{109,0,3},{255,7,27},{255,118,28},{200,0,7},{212,72,8},{255,123,16},{118,0,3},{246,0,7},
 {255,129,28},{209,0,7},{220,80,8},{255,147,16},{126,0,3},{246,0,7},{255,138,28},{218,0,7},{228,88,8},{255,172,16},{134,0,3},{249,3,7},{245,91,8},{228,3,7},{236,96,8},{255,196,16},{142,6,3},{251,14,7},{250,102,8},{237,12,7},{245,105,8},{255,223,16},{151,15,3},{253,22,7},{254,112,8},{245,20,7},{253,113,8},{255,248,16},{159,23,3},{253,31,7},{255,124,8},{249,28,7},
 {255,124,8},{255,0,0},{167,31,3},{254,39,7},{255,10,4},{252,37,7},{255,10,4},{255,0,0},{175,39,3},{255,48,7},{255,38,4},{254,48,7},{255,38,4},{255,0,0},{184,48,3},{255,56,7},{255,62,4},{255,56,7},{255,62,4},{255,0,0},{192,56,3},{255,65,7},{255,86,4},{255,65,7},{255,86,4},{255,0,0},{200,64,3},{255,74,7},{255,111,4},{255,77,7},{255,111,4},{255,0,0},
-{208,5,2},
+{208,5,2},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_color.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m5_color.inc
@ -478,4 +478,4 @@
 {70,127,10779},{68,127,12146},{54,127,6176},{54,127,6176},{34,127,52},{14,127,7281},{2,127,1213},{109,127,937},{109,127,937},{109,127,937},{102,127,281},{127,84,1513},{93,127,565},{93,127,565},{43,127,0},{127,106,1513},{43,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{0,127,0},{127,127,0},{0,127,0},{65,0,9250},{65,0,9250},{65,0,9250},{65,0,9250},{49,127,3656},
 {49,127,3656},{49,127,3656},{34,127,52},{2,127,1213},{2,127,1213},{71,127,63180},{60,127,37225},{52,127,26137},{48,127,18128},{68,127,59595},{51,127,22636},{42,127,8480},{37,127,164},{22,127,37455},{0,126,2073},{114,127,1019},{111,127,766},{111,127,666},{105,127,205},{127,102,1473},{102,127,681},{99,127,405},{56,127,0},{127,115,1473},{56,127,0},{79,127,14066},{79,127,14066},{79,127,14066},{73,127,10571},{71,127,11450},{59,127,6166},{59,127,6166},
 {37,127,148},{25,127,6914},{5,127,1413},{111,127,666},{111,127,666},{111,127,666},{105,127,205},{127,90,1105},{99,127,405},{99,127,405},{56,127,0},{127,109,1105},{56,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{127,127,0},{0,127,0},{127,127,0},{0,127,0},{69,0,9250},{69,0,9250},{69,0,9250},{69,0,9250},{52,127,3940},{52,127,3940},{52,127,3940},{37,127,148},{5,127,1413},
-{5,127,1413},
+{5,127,1413},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m6.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_bc7_m6.inc
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_5.inc
@ -491,4 +491,4 @@
 {17,31,10897},{16,31,12077},{13,31,6285},{13,31,6285},{8,31,68},{4,31,7686},{0,31,1341},{27,31,968},{27,31,968},{27,31,968},{25,31,325},{31,21,1513},{23,31,605},{23,31,605},{11,31,0},{31,26,1513},{11,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{16,0,9248},{16,0,9248},{16,0,9248},{16,0,9248},{12,31,3626},
 {12,31,3626},{12,31,3626},{8,31,68},{0,31,1341},{0,31,1341},{21,31,17476},{20,31,14998},{20,31,14098},{18,31,10672},{20,31,16018},{15,31,8154},{15,31,6218},{9,31,200},{10,31,11338},{0,31,1613},{28,31,1041},{27,31,801},{27,31,680},{26,31,232},{29,29,1473},{26,31,753},{24,31,442},{14,31,0},{31,28,1473},{14,31,0},{20,31,14098},{20,31,14098},{20,31,14098},{18,31,10672},{17,31,11453},{15,31,6218},{15,31,6218},
 {9,31,200},{6,31,7270},{0,31,1613},{27,31,680},{27,31,680},{27,31,680},{26,31,232},{28,28,1105},{24,31,442},{24,31,442},{14,31,0},{28,28,1105},{14,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{31,31,0},{0,31,0},{31,31,0},{0,31,0},{17,0,9248},{17,0,9248},{17,0,9248},{17,0,9248},{13,31,3929},{13,31,3929},{13,31,3929},{9,31,200},{0,31,1613},
-{0,31,1613},
+{0,31,1613},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_dxt1_6.inc
@ -491,4 +491,4 @@
 {34,63,10841},{34,63,12089},{26,63,6206},{26,63,6206},{17,63,74},{9,63,7678},{0,63,1341},{54,63,937},{54,63,937},{54,63,937},{51,63,305},{63,43,1513},{47,63,605},{47,63,605},{22,63,1},{62,53,1513},{22,63,1},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{0,63,0},{63,63,0},{0,63,0},{32,0,9256},{32,0,9256},{32,0,9256},{32,0,9256},{23,63,3650},
 {23,63,3650},{23,63,3650},{17,63,74},{0,63,1341},{0,63,1341},{43,63,17392},{40,63,15021},{40,63,14060},{37,63,10673},{40,63,16013},{32,63,8261},{29,63,6166},{19,63,194},{20,63,11338},{1,63,1594},{57,63,1041},{56,63,822},{54,63,697},{52,63,234},{63,51,1473},{51,63,737},{49,63,442},{28,63,1},{63,57,1473},{28,63,1},{40,63,14060},{40,63,14060},{40,63,14060},{37,63,10673},{34,63,11401},{29,63,6166},{29,63,6166},
 {19,63,194},{12,63,7270},{1,63,1594},{54,63,697},{54,63,697},{54,63,697},{52,63,234},{63,46,1105},{49,63,442},{49,63,442},{28,63,1},{63,54,1105},{28,63,1},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{63,63,0},{0,63,0},{63,63,0},{0,63,0},{34,0,9256},{34,0,9256},{34,0,9256},{34,0,9256},{26,63,3898},{26,63,3898},{26,63,3898},{19,63,194},{1,63,1594},
-{1,63,1594},
+{1,63,1594},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_45.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_45.inc
@ -478,4 +478,4 @@
 {8,31,11312},{8,31,11249},{7,31,6499},{7,31,6499},{4,31,260},{3,31,10457},{0,31,2642},{13,31,925},{13,31,925},{13,31,925},{12,31,397},{15,22,1513},{11,31,794},{11,31,794},{7,31,4},{14,27,1513},{7,31,4},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{0,31,0},{15,31,0},{0,31,0},{8,0,9376},{8,0,9376},{8,0,9376},{8,0,9376},{6,31,3074},
 {6,31,3074},{6,31,3074},{4,31,260},{0,31,2642},{0,31,2642},{8,31,58848},{7,31,39683},{6,31,25130},{6,31,19007},{8,31,54849},{6,31,27132},{5,31,8569},{4,31,756},{4,31,51302},{0,31,5046},{13,31,1078},{13,31,806},{13,31,637},{12,31,365},{15,26,1473},{12,31,978},{12,31,617},{8,31,9},{14,29,1473},{8,31,9},{9,31,13604},{9,31,13604},{9,31,13604},{8,31,11184},{8,31,10433},{7,31,6339},{7,31,6339},
 {5,31,424},{4,31,9713},{0,31,2930},{13,31,637},{13,31,637},{13,31,637},{12,31,365},{14,27,1105},{12,31,617},{12,31,617},{8,31,9},{13,29,1105},{8,31,9},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{15,31,0},{0,31,0},{15,31,0},{0,31,0},{8,0,9248},{8,0,9248},{8,0,9248},{8,0,9248},{6,31,3330},{6,31,3330},{6,31,3330},{5,31,424},{0,31,2930},
-{0,31,2930},
+{0,31,2930},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_alpha_33.inc
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_tables_pvrtc2_alpha_33.inc
@ -478,4 +478,4 @@
 {4,7,11305},{4,7,11209},{3,7,6489},{3,7,6489},{2,7,272},{1,7,10377},{0,7,2642},{6,7,1040},{6,7,1040},{6,7,1040},{6,7,416},{7,6,1537},{6,7,929},{6,7,929},{3,7,9},{7,6,1513},{3,7,9},{7,7,242},{7,7,170},{7,7,121},{7,7,49},{7,7,242},{7,7,98},{7,7,49},{0,7,0},{7,7,98},{0,7,0},{4,0,9280},{4,0,9280},{4,0,9280},{4,0,9280},{3,7,3125},
 {3,7,3125},{3,7,3125},{2,7,272},{0,7,2642},{0,7,2642},{4,7,59414},{4,7,41414},{3,7,24952},{3,7,19100},{4,7,55014},{3,7,27085},{2,7,10021},{2,7,656},{1,7,52310},{0,7,5046},{7,7,1142},{7,7,1070},{7,7,1021},{6,7,416},{7,7,1538},{6,7,1025},{6,7,625},{4,7,4},{6,7,1529},{4,7,4},{5,7,13964},{5,7,13964},{5,7,13964},{4,7,11305},{4,7,10505},{3,7,6665},{3,7,6665},
 {2,7,592},{2,7,9973},{0,7,2930},{7,7,1021},{7,7,1021},{7,7,1021},{6,7,416},{7,6,1105},{6,7,625},{6,7,625},{4,7,4},{6,7,1129},{4,7,4},{7,7,242},{7,7,170},{7,7,121},{7,7,49},{7,7,242},{7,7,98},{7,7,49},{0,7,0},{7,7,98},{0,7,0},{4,0,9280},{4,0,9280},{4,0,9280},{4,0,9280},{3,7,3301},{3,7,3301},{3,7,3301},{2,7,592},{0,7,2930},
-{0,7,2930},
+{0,7,2930},
--- a/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h
+++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_uastc.h
@ -0,0 +1,297 @@
+// basisu_transcoder_uastc.h
+#pragma once
+#include "basisu_transcoder_internal.h"
+
+namespace basist
+{
+	struct color_quad_u8 // Four 8-bit components; element type of bc7_optimization_results::m_low / m_high below.
+	{ 
+		uint8_t m_c[4]; 
+	};
+
+	const uint32_t TOTAL_UASTC_MODES = 19; // Extent of the g_uastc_mode_* tables declared below (g_uastc_mode_huff_codes has one extra row).
+	const uint32_t UASTC_MODE_INDEX_SOLID_COLOR = 8; // NOTE(review): presumably the mode index used for solid-color blocks - confirm against the unpacker.
+
+	const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS2 = 30; // Row count of g_astc_bc7_common_partitions2, g_astc_bc7_patterns2 and g_astc_bc7_pattern2_anchors.
+	const uint32_t TOTAL_ASTC_BC7_COMMON_PARTITIONS3 = 11; // Row count of g_astc_bc7_common_partitions3, g_astc_bc7_patterns3 and g_astc_bc7_pattern3_anchors.
+	const uint32_t TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS = 19; // Row count of g_bc7_3_astc2_common_partitions, g_bc7_3_astc2_patterns2 and g_bc7_3_astc2_patterns2_anchors.
+
+	extern const uint8_t g_uastc_mode_weight_bits[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_weight_ranges[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_endpoint_ranges[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_subsets[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_planes[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_comps[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_has_etc1_bias[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_has_bc1_hint0[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_has_bc1_hint1[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_has_alpha[TOTAL_UASTC_MODES];
+	extern const uint8_t g_uastc_mode_is_la[TOTAL_UASTC_MODES];
+
+	struct astc_bc7_common_partition2_desc // Element type of g_astc_bc7_common_partitions2.
+	{
+		uint8_t m_bc7; // NOTE(review): presumably the BC7 partition index - confirm against the table definition
+		uint16_t m_astc; // NOTE(review): presumably the matching ASTC partition seed - confirm
+		bool m_invert; // NOTE(review): presumably set when the subset indices must be swapped between the two formats - confirm
+	};
+
+	extern const astc_bc7_common_partition2_desc g_astc_bc7_common_partitions2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2];
+
+	struct bc73_astc2_common_partition_desc // Element type of g_bc7_3_astc2_common_partitions.
+	{
+		uint8_t m_bc73; // NOTE(review): presumably the BC7 3-subset partition index - confirm
+		uint16_t m_astc2; // NOTE(review): presumably the ASTC 2-subset partition seed - confirm
+		uint8_t k;		// 0-5 - how to modify the BC7 3-subset pattern to match the ASTC pattern (LSB=invert)
+	};
+
+	extern const bc73_astc2_common_partition_desc g_bc7_3_astc2_common_partitions[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS];
+
+	struct astc_bc7_common_partition3_desc // Element type of g_astc_bc7_common_partitions3.
+	{
+		uint8_t m_bc7; // NOTE(review): presumably the BC7 partition index - confirm
+		uint16_t m_astc; // NOTE(review): presumably the matching ASTC partition seed - confirm
+		uint8_t m_astc_to_bc7_perm; // converts ASTC to BC7 partition using g_astc_to_bc7_partition_index_perm_tables[][]
+	};
+
+	extern const astc_bc7_common_partition3_desc g_astc_bc7_common_partitions3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3];
+
+	extern const uint8_t g_astc_bc7_patterns2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][16];
+	extern const uint8_t g_astc_bc7_patterns3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][16];
+	extern const uint8_t g_bc7_3_astc2_patterns2[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][16];
+
+	extern const uint8_t g_astc_bc7_pattern2_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][3];
+	extern const uint8_t g_astc_bc7_pattern3_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][3];
+	extern const uint8_t g_bc7_3_astc2_patterns2_anchors[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][3];
+
+	extern const uint32_t g_uastc_mode_huff_codes[TOTAL_UASTC_MODES + 1][2];
+
+	extern const uint8_t g_astc_to_bc7_partition_index_perm_tables[6][3];
+	extern const uint8_t g_bc7_to_astc_partition_index_perm_tables[6][3]; // inverse of g_astc_to_bc7_partition_index_perm_tables
+
+	extern const uint8_t* s_uastc_to_bc1_weights[6];
+
+	uint32_t bc7_convert_partition_index_3_to_2(uint32_t p, uint32_t k);
+
+	inline uint32_t astc_interpolate(uint32_t l, uint32_t h, uint32_t w, bool srgb) // Blends l and h by weight w on a 0..64 scale (w == 0 yields l, w == 64 yields h for 8-bit inputs) and returns the blend with its low byte dropped.
+	{
+		if (srgb) // NOTE(review): assumes l and h are 8-bit values (0..255) - confirm with callers
+		{
+			l = (l << 8) | 0x80; // sRGB path: widen by 8 bits with a fixed 0x80 low byte
+			h = (h << 8) | 0x80;
+		}
+		else
+		{
+			l = (l << 8) | l; // linear path: widen by 8 bits, replicating the value into the low byte
+			h = (h << 8) | h;
+		}
+
+		uint32_t k = (l * (64 - w) + h * w + 32) >> 6; // weighted sum with +32 rounding, divided by 64; w > 64 would make (64 - w) wrap since it is unsigned
+
+		return k >> 8; // keep only the bits above the low byte
+	}
+
+	struct astc_block_desc // High-level ASTC block description; input to pack_astc_block() and a member of unpacked_uastc_block (both declared below).
+	{
+		int m_weight_range;	// weight BISE range
+
+		int m_subsets;			// number of ASTC partitions
+		int m_partition_seed;	// partition pattern seed
+		int m_cem;				// color endpoint mode used by all subsets
+
+		int m_ccs;				// color component selector (dual plane only)
+		bool m_dual_plane;	// true if dual plane
+
+		// Weight and endpoint BISE values. 
+		// Note these values are NOT linear, they must be BISE encoded. See Table 97 and Table 107.
+		uint8_t m_endpoints[18];	// endpoint values, in RR GG BB etc. order 
+		uint8_t m_weights[64];		// weight index values, raster order, in P0 P1, P0 P1, etc. or P0, P0, P0, P0, etc. order
+	};
+
+	const uint32_t BC7ENC_TOTAL_ASTC_RANGES = 21; // First dimension of g_astc_unquant (declared below); same value as TOTAL_ASTC_RANGES.
+
+	// See tables 81, 93, 18.13.Endpoint Unquantization
+	const uint32_t TOTAL_ASTC_RANGES = 21; // Row count of g_astc_bise_range_table; duplicates BC7ENC_TOTAL_ASTC_RANGES, so keep the two in sync.
+	extern const int g_astc_bise_range_table[TOTAL_ASTC_RANGES][3];
+
+	struct astc_quant_bin // Element type of the g_astc_unquant table declared just below.
+	{
+		uint8_t m_unquant; // unquantized value
+		uint8_t m_index; // sorted index
+	};
+
+	extern astc_quant_bin g_astc_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [ASTC encoded endpoint index]
+
+	int astc_get_levels(int range);
+	bool astc_is_valid_endpoint_range(uint32_t range);
+	uint32_t unquant_astc_endpoint(uint32_t packed_bits, uint32_t packed_trits, uint32_t packed_quints, uint32_t range);
+	uint32_t unquant_astc_endpoint_val(uint32_t packed_val, uint32_t range);
+
+	const uint8_t* get_anchor_indices(uint32_t subsets, uint32_t mode, uint32_t common_pattern, const uint8_t*& pPartition_pattern);
+
+	// BC7
+	const uint32_t BC7ENC_BLOCK_SIZE = 16; // NOTE(review): presumably the block size in bytes; equals the 2 x 8 bytes of bc7_block below - confirm
+
+	struct bc7_block // Raw block storage held as two 64-bit words.
+	{
+		uint64_t m_qwords[2];
+	};
+
+	struct bc7_optimization_results // High-level BC7 block description; input to encode_bc7_block() and output of the transcode_uastc_to_bc7() overloads declared below.
+	{
+		uint32_t m_mode; // NOTE(review): presumably the BC7 mode (0-7, matching the 8-entry g_bc7_* tables) - confirm
+		uint32_t m_partition;
+		uint8_t m_selectors[16]; // NOTE(review): presumably one selector per texel of a 4x4 block - confirm
+		uint8_t m_alpha_selectors[16]; // NOTE(review): presumably only meaningful for modes with separate alpha selectors - confirm
+		color_quad_u8 m_low[3]; // NOTE(review): presumably one low endpoint per subset (up to 3) - confirm
+		color_quad_u8 m_high[3]; // NOTE(review): presumably one high endpoint per subset (up to 3) - confirm
+		uint32_t m_pbits[3][2];
+		uint32_t m_index_selector;
+		uint32_t m_rotation;
+	};
+
+	extern const uint32_t g_bc7_weights1[2];
+	extern const uint32_t g_bc7_weights2[4];
+	extern const uint32_t g_bc7_weights3[8];
+	extern const uint32_t g_bc7_weights4[16];
+	extern const uint32_t g_astc_weights4[16];
+	extern const uint32_t g_astc_weights5[32];
+	extern const uint32_t g_astc_weights_3levels[3];
+	extern const uint8_t g_bc7_partition1[16];
+	extern const uint8_t g_bc7_partition2[64 * 16];
+	extern const uint8_t g_bc7_partition3[64 * 16];
+	extern const uint8_t g_bc7_table_anchor_index_second_subset[64];
+	extern const uint8_t g_bc7_table_anchor_index_third_subset_1[64];
+	extern const uint8_t g_bc7_table_anchor_index_third_subset_2[64];
+	extern const uint8_t g_bc7_num_subsets[8];
+	extern const uint8_t g_bc7_partition_bits[8];
+	extern const uint8_t g_bc7_color_index_bitcount[8];
+	extern const uint8_t g_bc7_mode_has_p_bits[8];
+	extern const uint8_t g_bc7_mode_has_shared_p_bits[8];
+	extern const uint8_t g_bc7_color_precision_table[8];
+	extern const int8_t g_bc7_alpha_precision_table[8];
+	extern const uint8_t g_bc7_alpha_index_bitcount[8];
+
+	inline bool get_bc7_mode_has_seperate_alpha_selectors(int mode) { return (mode == 4) || (mode == 5); } // True only for modes 4 and 5.
+	inline int get_bc7_color_index_size(int mode, int index_selection_bit) { return g_bc7_color_index_bitcount[mode] + index_selection_bit; } // Table value plus the selection bit; mode indexes an 8-entry table with no bounds check.
+	inline int get_bc7_alpha_index_size(int mode, int index_selection_bit) { return g_bc7_alpha_index_bitcount[mode] - index_selection_bit; } // Table value minus the selection bit; mode indexes an 8-entry table with no bounds check.
+
+	struct endpoint_err // Element type of g_bc7_mode_6_optimal_endpoints and g_bc7_mode_5_optimal_endpoints (declared below).
+	{
+		uint16_t m_error; uint8_t m_lo; uint8_t m_hi; // an error value plus a low/high pair; NOTE(review): presumably the best endpoint pair for a given component value - confirm
+	};
+
+	extern endpoint_err g_bc7_mode_6_optimal_endpoints[256][2]; // [c][pbit]
+	const uint32_t BC7ENC_MODE_6_OPTIMAL_INDEX = 5; // NOTE(review): presumably the selector index g_bc7_mode_6_optimal_endpoints was built for - confirm in the .cpp
+
+	extern endpoint_err g_bc7_mode_5_optimal_endpoints[256]; // [c]
+	const uint32_t BC7ENC_MODE_5_OPTIMAL_INDEX = 1; // NOTE(review): presumably the selector index g_bc7_mode_5_optimal_endpoints was built for - confirm in the .cpp
+
+	// Packs a BC7 block from a high-level description. Handles all BC7 modes.
+	void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults);
+
+	// Packs an ASTC block
+	// Constraints: Always 4x4, all subset CEM's must be equal, only tested with LDR CEM's.
+	bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t mode);
+
+	void pack_astc_solid_block(void* pDst_block, const color32& color);
+
+#ifdef _DEBUG
+	int astc_compute_texel_partition(int seed, int x, int y, int z, int partitioncount, bool small_block);
+#endif
+		
+	struct uastc_block // 16 bytes of block data exposed through overlapping byte / 32-bit views (plus a 64-bit view when not building for Emscripten).
+	{
+		union
+		{
+			uint8_t m_bytes[16];
+			uint32_t m_dwords[4];
+
+#ifndef __EMSCRIPTEN__
+			uint64_t m_qwords[2]; // compiled out when __EMSCRIPTEN__ is defined
+#endif
+		};
+	};
+
+	struct unpacked_uastc_block // Unpacked form of a uastc_block; filled by unpack_uastc(const uastc_block&, unpacked_uastc_block&, ...) declared below.
+	{
+		astc_block_desc m_astc; // ASTC-level description of the block
+
+		uint32_t m_mode; // NOTE(review): presumably a UASTC mode index below TOTAL_UASTC_MODES - confirm
+		uint32_t m_common_pattern;
+
+		color32 m_solid_color; // NOTE(review): presumably only meaningful when m_mode is UASTC_MODE_INDEX_SOLID_COLOR - confirm
+
+		bool m_bc1_hint0;
+		bool m_bc1_hint1;
+
+		bool m_etc1_flip;
+		bool m_etc1_diff;
+		uint32_t m_etc1_inten0;
+		uint32_t m_etc1_inten1;
+
+		uint32_t m_etc1_bias;
+
+		uint32_t m_etc2_hints;
+
+		uint32_t m_etc1_selector;
+		uint32_t m_etc1_r, m_etc1_g, m_etc1_b;
+	};
+
+	color32 apply_etc1_bias(const color32 &block_color, uint32_t bias, uint32_t limit, uint32_t subblock);
+	
+	struct decoder_etc_block;
+	struct eac_block;
+		
+	bool unpack_uastc(uint32_t mode, uint32_t common_pattern, const color32& solid_color, const astc_block_desc& astc, color32* pPixels, bool srgb);
+	bool unpack_uastc(const unpacked_uastc_block& unpacked_blk, color32* pPixels, bool srgb);
+
+	bool unpack_uastc(const uastc_block& blk, color32* pPixels, bool srgb);
+	bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool undo_blue_contract, bool read_hints = true);
+
+	bool transcode_uastc_to_astc(const uastc_block& src_blk, void* pDst);
+
+	bool transcode_uastc_to_bc7(const unpacked_uastc_block& unpacked_src_blk, bc7_optimization_results& dst_blk);
+	bool transcode_uastc_to_bc7(const uastc_block& src_blk, bc7_optimization_results& dst_blk);
+	bool transcode_uastc_to_bc7(const uastc_block& src_blk, void* pDst);
+
+	void transcode_uastc_to_etc1(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst);
+	bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst);
+	bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst, uint32_t channel);
+
+	void transcode_uastc_to_etc2_eac_a8(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst);
+	bool transcode_uastc_to_etc2_rgba(const uastc_block& src_blk, void* pDst);
+
+	// Packs 16 scalar values to BC4. Same PSNR as stb_dxt's BC4 encoder, around 13% faster.
+	void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride);
+	
+	void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb);
+
+	enum // Distinct single-bit values (1, 2, 4); NOTE(review): presumably OR-ed into the `flags` argument of encode_bc1() / encode_bc1_alt() below - confirm
+	{
+		cEncodeBC1HighQuality = 1,
+		cEncodeBC1HigherQuality = 2,
+		cEncodeBC1UseSelectors = 4,
+	};
+	void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags);
+	
+	// Alternate PCA-free encoder, around 15% faster, same (or slightly higher) avg. PSNR
+	void encode_bc1_alt(void* pDst, const uint8_t* pPixels, uint32_t flags);
+
+	void transcode_uastc_to_bc1_hint0(const unpacked_uastc_block& unpacked_src_blk, void* pDst);
+	void transcode_uastc_to_bc1_hint1(const unpacked_uastc_block& unpacked_src_blk, const color32 block_pixels[4][4], void* pDst, bool high_quality);
+
+	bool transcode_uastc_to_bc1(const uastc_block& src_blk, void* pDst, bool high_quality);
+	bool transcode_uastc_to_bc3(const uastc_block& src_blk, void* pDst, bool high_quality);
+	bool transcode_uastc_to_bc4(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0);
+	bool transcode_uastc_to_bc5(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1);
+
+	bool transcode_uastc_to_etc2_eac_r11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0);
+	bool transcode_uastc_to_etc2_eac_rg11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1);
+
+	bool transcode_uastc_to_pvrtc1_4_rgb(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality, bool from_alpha);
+	bool transcode_uastc_to_pvrtc1_4_rgba(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality);
+		
+	// uastc_init() MUST be called before using this module.
+	void uastc_init();
+
+} // namespace basist