diff --git a/thirdparty/README.md b/thirdparty/README.md
index 95a6902089..ea625f14ab 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -182,12 +182,12 @@ Patches in the `patches` directory should be re-applied after updates.
 ## jpeg-compressor
 
 - Upstream: https://github.com/richgel999/jpeg-compressor
-- Version: 1.04
+- Version: 2.00 (1eb17d558b9d3b7442d256642a5745974e9eeb1e, 2020)
 - License: Public domain
 
 Files extracted from upstream source:
 
-- `jpgd.{c,h}`
+- `jpgd*.{cpp,h}`
 
 
 ## libogg
diff --git a/thirdparty/jpeg-compressor/jpgd.cpp b/thirdparty/jpeg-compressor/jpgd.cpp
index 62fbd1b72d..a0c494db61 100644
--- a/thirdparty/jpeg-compressor/jpgd.cpp
+++ b/thirdparty/jpeg-compressor/jpgd.cpp
@@ -1,27 +1,55 @@
-// jpgd.cpp - C++ class for JPEG decompression.
-// Public domain, Rich Geldreich <richgel99@gmail.com>
-// Alex Evans: Linear memory allocator (taken from jpge.h).
-// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings (all looked harmless)
-//
+// jpgd.cpp - C++ class for JPEG decompression. Written by Richard Geldreich <richgel99@gmail.com> between 1994-2020.
 // Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
+// Supports box and linear chroma upsampling.
+//
+// Released under two licenses. You are free to choose which license you want:
+// License 1: 
+// Public Domain
+//
+// License 2:
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Alex Evans: Linear memory allocator (taken from jpge.h).
+// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings
+// v2.00, March 20, 2020: Fuzzed with zzuf and afl. Fixed several issues, converted most assert()'s to run-time checks. Added chroma upsampling. Removed freq. domain upsampling. gcc/clang warnings.
+//
+// Important:
+// #define JPGD_USE_SSE2 to 0 to completely disable SSE2 usage. If it is left undefined, the preprocessor checks below choose a default.
 //
-// Chroma upsampling quality: H2V2 is upsampled in the frequency domain, H2V1 and H1V2 are upsampled using point sampling.
-// Chroma upsampling reference: "Fast Scheme for Image Size Change in the Compressed Domain"
-// http://vision.ai.uiuc.edu/~dugad/research/dct/index.html
-
 #include "jpgd.h"
 #include <string.h>
-
+#include <algorithm>
 #include <assert.h>
-#define JPGD_ASSERT(x) assert(x)
 
 #ifdef _MSC_VER
 #pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable
 #endif
 
-// Set to 1 to enable freq. domain chroma upsampling on images using H2V2 subsampling (0=faster nearest neighbor sampling).
-// This is slower, but results in higher quality on images with highly saturated colors.
-#define JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING 1
+#ifndef JPGD_USE_SSE2
+
+	#if defined(__GNUC__) 
+
+		#if (defined(__x86_64__) || defined(_M_X64)) 
+			#if defined(__SSE2__)
+				#define JPGD_USE_SSE2 (1)
+			#endif
+		#endif
+
+	#else
+		#define JPGD_USE_SSE2 (1)
+	#endif
+
+#endif
 
 #define JPGD_TRUE (1)
 #define JPGD_FALSE (0)
@@ -29,28 +57,28 @@
 #define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b))
 #define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b))
 
-// TODO: Move to header and use these constants when declaring the arrays.
-#define JPGD_HUFF_TREE_MAX_LENGTH 512
-#define JPGD_HUFF_CODE_SIZE_MAX_LENGTH 256
-
 namespace jpgd {
 
-static inline void *jpgd_malloc(size_t nSize) { return malloc(nSize); }
-static inline void jpgd_free(void *p) { free(p); }
+	static inline void* jpgd_malloc(size_t nSize) { return malloc(nSize); }
+	static inline void jpgd_free(void* p) { free(p); }
 
-// DCT coefficients are stored in this sequence.
-static int g_ZAG[64] = {  0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
+	// DCT coefficients are stored in this sequence.
+	static int g_ZAG[64] = { 0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
 
-enum JPEG_MARKER
-{
-  M_SOF0  = 0xC0, M_SOF1  = 0xC1, M_SOF2  = 0xC2, M_SOF3  = 0xC3, M_SOF5  = 0xC5, M_SOF6  = 0xC6, M_SOF7  = 0xC7, M_JPG   = 0xC8,
-  M_SOF9  = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT   = 0xC4, M_DAC   = 0xCC,
-  M_RST0  = 0xD0, M_RST1  = 0xD1, M_RST2  = 0xD2, M_RST3  = 0xD3, M_RST4  = 0xD4, M_RST5  = 0xD5, M_RST6  = 0xD6, M_RST7  = 0xD7,
-  M_SOI   = 0xD8, M_EOI   = 0xD9, M_SOS   = 0xDA, M_DQT   = 0xDB, M_DNL   = 0xDC, M_DRI   = 0xDD, M_DHP   = 0xDE, M_EXP   = 0xDF,
-  M_APP0  = 0xE0, M_APP15 = 0xEF, M_JPG0  = 0xF0, M_JPG13 = 0xFD, M_COM   = 0xFE, M_TEM   = 0x01, M_ERROR = 0x100, RST0   = 0xD0
-};
+	enum JPEG_MARKER
+	{
+		M_SOF0 = 0xC0, M_SOF1 = 0xC1, M_SOF2 = 0xC2, M_SOF3 = 0xC3, M_SOF5 = 0xC5, M_SOF6 = 0xC6, M_SOF7 = 0xC7, M_JPG = 0xC8,
+		M_SOF9 = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT = 0xC4, M_DAC = 0xCC,
+		M_RST0 = 0xD0, M_RST1 = 0xD1, M_RST2 = 0xD2, M_RST3 = 0xD3, M_RST4 = 0xD4, M_RST5 = 0xD5, M_RST6 = 0xD6, M_RST7 = 0xD7,
+		M_SOI = 0xD8, M_EOI = 0xD9, M_SOS = 0xDA, M_DQT = 0xDB, M_DNL = 0xDC, M_DRI = 0xDD, M_DHP = 0xDE, M_EXP = 0xDF,
+		M_APP0 = 0xE0, M_APP15 = 0xEF, M_JPG0 = 0xF0, M_JPG13 = 0xFD, M_COM = 0xFE, M_TEM = 0x01, M_ERROR = 0x100, RST0 = 0xD0
+	};
 
-enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 };
+	enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 };
+
+#if JPGD_USE_SSE2
+#include "jpgd_idct.h"
+#endif
 
 #define CONST_BITS  13
 #define PASS1_BITS  2
@@ -76,3130 +104,3182 @@ enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2,
 
 #define CLAMP(i) ((static_cast<uint>(i) > 255) ? (((~i) >> 31) & 0xFF) : (i))
 
-// Compiler creates a fast path 1D IDCT for X non-zero columns
-template <int NONZERO_COLS>
-struct Row
-{
-  static void idct(int* pTemp, const jpgd_block_t* pSrc)
-  {
-    // ACCESS_COL() will be optimized at compile time to either an array access, or 0.
-    #define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
+	static inline int left_shifti(int val, uint32_t bits)
+	{
+		return static_cast<int>(static_cast<uint32_t>(val) << bits);
+	}
 
-    const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6);
+	// Compiler creates a fast path 1D IDCT for X non-zero columns
+	template <int NONZERO_COLS>
+	struct Row
+	{
+		static void idct(int* pTemp, const jpgd_block_coeff_t* pSrc)
+		{
+			// ACCESS_COL() will be optimized at compile time to either an array access, or 0. Good compilers will then optimize out muls against 0.
+#define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
 
-    const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+			const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6);
 
-    const int tmp0 = (ACCESS_COL(0) + ACCESS_COL(4)) << CONST_BITS;
-    const int tmp1 = (ACCESS_COL(0) - ACCESS_COL(4)) << CONST_BITS;
+			const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+			const int tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
+			const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
 
-    const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+			const int tmp0 = left_shifti(ACCESS_COL(0) + ACCESS_COL(4), CONST_BITS);
+			const int tmp1 = left_shifti(ACCESS_COL(0) - ACCESS_COL(4), CONST_BITS);
 
-    const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1);
+			const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
 
-    const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
-    const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+			const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1);
 
-    const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
-    const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
-    const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
-    const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+			const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+			const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
 
-    const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
-    const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
-    const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
-    const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+			const int az1 = MULTIPLY(bz1, -FIX_0_899976223);
+			const int az2 = MULTIPLY(bz2, -FIX_2_562915447);
+			const int az3 = MULTIPLY(bz3, -FIX_1_961570560) + bz5;
+			const int az4 = MULTIPLY(bz4, -FIX_0_390180644) + bz5;
 
-    pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS-PASS1_BITS);
-    pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS-PASS1_BITS);
-    pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS-PASS1_BITS);
-    pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS-PASS1_BITS);
-    pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS-PASS1_BITS);
-    pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS-PASS1_BITS);
-    pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS-PASS1_BITS);
-    pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS-PASS1_BITS);
-  }
-};
+			const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+			const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+			const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+			const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
 
-template <>
-struct Row<0>
-{
-  static void idct(int* pTemp, const jpgd_block_t* pSrc)
-  {
-#ifdef _MSC_VER
-    pTemp; pSrc;
+			pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS - PASS1_BITS);
+			pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS - PASS1_BITS);
+			pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS - PASS1_BITS);
+			pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS - PASS1_BITS);
+			pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS - PASS1_BITS);
+			pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS - PASS1_BITS);
+			pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS - PASS1_BITS);
+			pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS - PASS1_BITS);
+		}
+	};
+
+	template <>
+	struct Row<0>
+	{
+		static void idct(int* pTemp, const jpgd_block_coeff_t* pSrc)
+		{
+			(void)pTemp; 
+			(void)pSrc;
+		}
+	};
+
+	template <>
+	struct Row<1>
+	{
+		static void idct(int* pTemp, const jpgd_block_coeff_t* pSrc)
+		{
+			const int dcval = left_shifti(pSrc[0], PASS1_BITS);
+
+			pTemp[0] = dcval;
+			pTemp[1] = dcval;
+			pTemp[2] = dcval;
+			pTemp[3] = dcval;
+			pTemp[4] = dcval;
+			pTemp[5] = dcval;
+			pTemp[6] = dcval;
+			pTemp[7] = dcval;
+		}
+	};
+
+	// Compiler creates a fast path 1D IDCT for X non-zero rows
+	template <int NONZERO_ROWS>
+	struct Col
+	{
+		static void idct(uint8* pDst_ptr, const int* pTemp)
+		{
+			// ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
+#define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
+
+			const int z2 = ACCESS_ROW(2);
+			const int z3 = ACCESS_ROW(6);
+
+			const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+			const int tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
+			const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+			const int tmp0 = left_shifti(ACCESS_ROW(0) + ACCESS_ROW(4), CONST_BITS);
+			const int tmp1 = left_shifti(ACCESS_ROW(0) - ACCESS_ROW(4), CONST_BITS);
+
+			const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+
+			const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1);
+
+			const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+			const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+			const int az1 = MULTIPLY(bz1, -FIX_0_899976223);
+			const int az2 = MULTIPLY(bz2, -FIX_2_562915447);
+			const int az3 = MULTIPLY(bz3, -FIX_1_961570560) + bz5;
+			const int az4 = MULTIPLY(bz4, -FIX_0_390180644) + bz5;
+
+			const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+			const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+			const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+			const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+
+			int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 0] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 7] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 1] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 6] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 2] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 5] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 3] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 4] = (uint8)CLAMP(i);
+		}
+	};
+
+	template <>
+	struct Col<1>
+	{
+		static void idct(uint8* pDst_ptr, const int* pTemp)
+		{
+			int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS + 3);
+			const uint8 dcval_clamped = (uint8)CLAMP(dcval);
+			pDst_ptr[0 * 8] = dcval_clamped;
+			pDst_ptr[1 * 8] = dcval_clamped;
+			pDst_ptr[2 * 8] = dcval_clamped;
+			pDst_ptr[3 * 8] = dcval_clamped;
+			pDst_ptr[4 * 8] = dcval_clamped;
+			pDst_ptr[5 * 8] = dcval_clamped;
+			pDst_ptr[6 * 8] = dcval_clamped;
+			pDst_ptr[7 * 8] = dcval_clamped;
+		}
+	};
+
+	static const uint8 s_idct_row_table[] =
+	{
+	  1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
+	  4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
+	  6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
+	  6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
+	  8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
+	  8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
+	  8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
+	  8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
+	};
+
+	static const uint8 s_idct_col_table[] = 
+	{ 
+		1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
+		7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 
+	};
+
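+	// "Fast pathing" IDCT: DC-only blocks are filled directly; otherwise the SSE2 routine is used when compiled in and use_simd is set, else the scalar row/column passes run.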
+	static void idct(const jpgd_block_coeff_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag, bool use_simd)
+	{
+		(void)use_simd;
+
+		assert(block_max_zag >= 1);
+		assert(block_max_zag <= 64);
+				
+		if (block_max_zag <= 1)
+		{
+			int k = ((pSrc_ptr[0] + 4) >> 3) + 128;
+			k = CLAMP(k);
+			k = k | (k << 8);
+			k = k | (k << 16);
+
+			for (int i = 8; i > 0; i--)
+			{
+				*(int*)&pDst_ptr[0] = k;
+				*(int*)&pDst_ptr[4] = k;
+				pDst_ptr += 8;
+			}
+			return;
+		}
+
+#if JPGD_USE_SSE2
+		if (use_simd)
+		{
+			assert((((uintptr_t)pSrc_ptr) & 15) == 0);
+			assert((((uintptr_t)pDst_ptr) & 15) == 0);
+			idctSSEShortU8(pSrc_ptr, pDst_ptr);
+			return;
+		}
 #endif
-  }
-};
-
-template <>
-struct Row<1>
-{
-  static void idct(int* pTemp, const jpgd_block_t* pSrc)
-  {
-    const int dcval = (pSrc[0] << PASS1_BITS);
-
-    pTemp[0] = dcval;
-    pTemp[1] = dcval;
-    pTemp[2] = dcval;
-    pTemp[3] = dcval;
-    pTemp[4] = dcval;
-    pTemp[5] = dcval;
-    pTemp[6] = dcval;
-    pTemp[7] = dcval;
-  }
-};
-
-// Compiler creates a fast path 1D IDCT for X non-zero rows
-template <int NONZERO_ROWS>
-struct Col
-{
-  static void idct(uint8* pDst_ptr, const int* pTemp)
-  {
-    // ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
-    #define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
-
-    const int z2 = ACCESS_ROW(2);
-    const int z3 = ACCESS_ROW(6);
-
-    const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-
-    const int tmp0 = (ACCESS_ROW(0) + ACCESS_ROW(4)) << CONST_BITS;
-    const int tmp1 = (ACCESS_ROW(0) - ACCESS_ROW(4)) << CONST_BITS;
-
-    const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
-
-    const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1);
-
-    const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
-    const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
-
-    const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
-    const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
-    const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
-    const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
-
-    const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
-    const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
-    const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
-    const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
-
-    int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*0] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*7] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*1] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*6] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*2] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*5] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*3] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*4] = (uint8)CLAMP(i);
-  }
-};
-
-template <>
-struct Col<1>
-{
-  static void idct(uint8* pDst_ptr, const int* pTemp)
-  {
-    int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS+3);
-    const uint8 dcval_clamped = (uint8)CLAMP(dcval);
-    pDst_ptr[0*8] = dcval_clamped;
-    pDst_ptr[1*8] = dcval_clamped;
-    pDst_ptr[2*8] = dcval_clamped;
-    pDst_ptr[3*8] = dcval_clamped;
-    pDst_ptr[4*8] = dcval_clamped;
-    pDst_ptr[5*8] = dcval_clamped;
-    pDst_ptr[6*8] = dcval_clamped;
-    pDst_ptr[7*8] = dcval_clamped;
-  }
-};
-
-static const uint8 s_idct_row_table[] =
-{
-  1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
-  4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
-  6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
-  6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
-  8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
-  8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
-  8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
-  8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
-};
-
-static const uint8 s_idct_col_table[] = { 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
-
-void idct(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag)
-{
-  JPGD_ASSERT(block_max_zag >= 1);
-  JPGD_ASSERT(block_max_zag <= 64);
-
-  if (block_max_zag <= 1)
-  {
-    int k = ((pSrc_ptr[0] + 4) >> 3) + 128;
-    k = CLAMP(k);
-    k = k | (k<<8);
-    k = k | (k<<16);
-
-    for (int i = 8; i > 0; i--)
-    {
-      *(int*)&pDst_ptr[0] = k;
-      *(int*)&pDst_ptr[4] = k;
-      pDst_ptr += 8;
-    }
-    return;
-  }
-
-  int temp[64];
-
-  const jpgd_block_t* pSrc = pSrc_ptr;
-  int* pTemp = temp;
-
-  const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8];
-  int i;
-  for (i = 8; i > 0; i--, pRow_tab++)
-  {
-    switch (*pRow_tab)
-    {
-      case 0: Row<0>::idct(pTemp, pSrc); break;
-      case 1: Row<1>::idct(pTemp, pSrc); break;
-      case 2: Row<2>::idct(pTemp, pSrc); break;
-      case 3: Row<3>::idct(pTemp, pSrc); break;
-      case 4: Row<4>::idct(pTemp, pSrc); break;
-      case 5: Row<5>::idct(pTemp, pSrc); break;
-      case 6: Row<6>::idct(pTemp, pSrc); break;
-      case 7: Row<7>::idct(pTemp, pSrc); break;
-      case 8: Row<8>::idct(pTemp, pSrc); break;
-    }
-
-    pSrc += 8;
-    pTemp += 8;
-  }
-
-  pTemp = temp;
-
-  const int nonzero_rows = s_idct_col_table[block_max_zag - 1];
-  for (i = 8; i > 0; i--)
-  {
-    switch (nonzero_rows)
-    {
-      case 1: Col<1>::idct(pDst_ptr, pTemp); break;
-      case 2: Col<2>::idct(pDst_ptr, pTemp); break;
-      case 3: Col<3>::idct(pDst_ptr, pTemp); break;
-      case 4: Col<4>::idct(pDst_ptr, pTemp); break;
-      case 5: Col<5>::idct(pDst_ptr, pTemp); break;
-      case 6: Col<6>::idct(pDst_ptr, pTemp); break;
-      case 7: Col<7>::idct(pDst_ptr, pTemp); break;
-      case 8: Col<8>::idct(pDst_ptr, pTemp); break;
-    }
-
-    pTemp++;
-    pDst_ptr++;
-  }
-}
-
-void idct_4x4(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr)
-{
-  int temp[64];
-  int* pTemp = temp;
-  const jpgd_block_t* pSrc = pSrc_ptr;
-
-  for (int i = 4; i > 0; i--)
-  {
-    Row<4>::idct(pTemp, pSrc);
-    pSrc += 8;
-    pTemp += 8;
-  }
-
-  pTemp = temp;
-  for (int i = 8; i > 0; i--)
-  {
-    Col<4>::idct(pDst_ptr, pTemp);
-    pTemp++;
-    pDst_ptr++;
-  }
-}
-
-// Retrieve one character from the input stream.
-inline uint jpeg_decoder::get_char()
-{
-  // Any bytes remaining in buffer?
-  if (!m_in_buf_left)
-  {
-    // Try to get more bytes.
-    prep_in_buffer();
-    // Still nothing to get?
-    if (!m_in_buf_left)
-    {
-      // Pad the end of the stream with 0xFF 0xD9 (EOI marker)
-      int t = m_tem_flag;
-      m_tem_flag ^= 1;
-      if (t)
-        return 0xD9;
-      else
-        return 0xFF;
-    }
-  }
-
-  uint c = *m_pIn_buf_ofs++;
-  m_in_buf_left--;
-
-  return c;
-}
-
-// Same as previous method, except can indicate if the character is a pad character or not.
-inline uint jpeg_decoder::get_char(bool *pPadding_flag)
-{
-  if (!m_in_buf_left)
-  {
-    prep_in_buffer();
-    if (!m_in_buf_left)
-    {
-      *pPadding_flag = true;
-      int t = m_tem_flag;
-      m_tem_flag ^= 1;
-      if (t)
-        return 0xD9;
-      else
-        return 0xFF;
-    }
-  }
-
-  *pPadding_flag = false;
-
-  uint c = *m_pIn_buf_ofs++;
-  m_in_buf_left--;
-
-  return c;
-}
-
-// Inserts a previously retrieved character back into the input buffer.
-inline void jpeg_decoder::stuff_char(uint8 q)
-{
-  *(--m_pIn_buf_ofs) = q;
-  m_in_buf_left++;
-}
-
-// Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered.
-inline uint8 jpeg_decoder::get_octet()
-{
-  bool padding_flag;
-  int c = get_char(&padding_flag);
-
-  if (c == 0xFF)
-  {
-    if (padding_flag)
-      return 0xFF;
-
-    c = get_char(&padding_flag);
-    if (padding_flag)
-    {
-      stuff_char(0xFF);
-      return 0xFF;
-    }
-
-    if (c == 0x00)
-      return 0xFF;
-    else
-    {
-      stuff_char(static_cast<uint8>(c));
-      stuff_char(0xFF);
-      return 0xFF;
-    }
-  }
-
-  return static_cast<uint8>(c);
-}
-
-// Retrieves a variable number of bits from the input stream. Does not recognize markers.
-inline uint jpeg_decoder::get_bits(int num_bits)
-{
-  if (!num_bits)
-    return 0;
-
-  uint i = m_bit_buf >> (32 - num_bits);
-
-  if ((m_bits_left -= num_bits) <= 0)
-  {
-    m_bit_buf <<= (num_bits += m_bits_left);
-
-    uint c1 = get_char();
-    uint c2 = get_char();
-    m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2;
-
-    m_bit_buf <<= -m_bits_left;
-
-    m_bits_left += 16;
-
-    JPGD_ASSERT(m_bits_left >= 0);
-  }
-  else
-    m_bit_buf <<= num_bits;
-
-  return i;
-}
-
-// Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
-inline uint jpeg_decoder::get_bits_no_markers(int num_bits)
-{
-  if (!num_bits)
-    return 0;
-
-  uint i = m_bit_buf >> (32 - num_bits);
-
-  if ((m_bits_left -= num_bits) <= 0)
-  {
-    m_bit_buf <<= (num_bits += m_bits_left);
-
-    if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF))
-    {
-      uint c1 = get_octet();
-      uint c2 = get_octet();
-      m_bit_buf |= (c1 << 8) | c2;
-    }
-    else
-    {
-      m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1];
-      m_in_buf_left -= 2;
-      m_pIn_buf_ofs += 2;
-    }
-
-    m_bit_buf <<= -m_bits_left;
-
-    m_bits_left += 16;
-
-    JPGD_ASSERT(m_bits_left >= 0);
-  }
-  else
-    m_bit_buf <<= num_bits;
-
-  return i;
-}
-
-// Decodes a Huffman encoded symbol.
-inline int jpeg_decoder::huff_decode(huff_tables *pH)
-{
-  JPGD_ASSERT(pH);
-
-  int symbol;
-  // Check first 8-bits: do we have a complete symbol?
-  if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0)
-  {
-    // Decode more bits, use a tree traversal to find symbol.
-    int ofs = 23;
-    do
-    {
-      unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1));
-      JPGD_ASSERT(idx < JPGD_HUFF_TREE_MAX_LENGTH);
-      symbol = pH->tree[idx];
-      ofs--;
-    } while (symbol < 0);
-
-    get_bits_no_markers(8 + (23 - ofs));
-  }
-  else
-  {
-    JPGD_ASSERT(symbol < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
-    get_bits_no_markers(pH->code_size[symbol]);
-  }
-
-  return symbol;
-}
-
-// Decodes a Huffman encoded symbol.
-inline int jpeg_decoder::huff_decode(huff_tables *pH, int& extra_bits)
-{
-  int symbol;
-
-  JPGD_ASSERT(pH);
-
-  // Check first 8-bits: do we have a complete symbol?
-  if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0)
-  {
-    // Use a tree traversal to find symbol.
-    int ofs = 23;
-    do
-    {
-      unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1));
-      JPGD_ASSERT(idx < JPGD_HUFF_TREE_MAX_LENGTH);
-      symbol = pH->tree[idx];
-      ofs--;
-    } while (symbol < 0);
-
-    get_bits_no_markers(8 + (23 - ofs));
-
-    extra_bits = get_bits_no_markers(symbol & 0xF);
-  }
-  else
-  {
-    JPGD_ASSERT(((symbol >> 8) & 31) == pH->code_size[symbol & 255] + ((symbol & 0x8000) ? (symbol & 15) : 0));
-
-    if (symbol & 0x8000)
-    {
-      get_bits_no_markers((symbol >> 8) & 31);
-      extra_bits = symbol >> 16;
-    }
-    else
-    {
-      int code_size = (symbol >> 8) & 31;
-      int num_extra_bits = symbol & 0xF;
-      int bits = code_size + num_extra_bits;
-      if (bits <= (m_bits_left + 16))
-        extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1);
-      else
-      {
-        get_bits_no_markers(code_size);
-        extra_bits = get_bits_no_markers(num_extra_bits);
-      }
-    }
-
-    symbol &= 0xFF;
-  }
-
-  return symbol;
-}
-
-// Tables and macro used to fully decode the DPCM differences.
-static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-static const int s_extend_offset[16] = { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-static const int s_extend_mask[] = { 0, (1<<0), (1<<1), (1<<2), (1<<3), (1<<4), (1<<5), (1<<6), (1<<7), (1<<8), (1<<9), (1<<10), (1<<11), (1<<12), (1<<13), (1<<14), (1<<15), (1<<16) };
-// The logical AND's in this macro are to shut up static code analysis (aren't really necessary - couldn't find another way to do this)
+
+		int temp[64];
+
+		const jpgd_block_coeff_t* pSrc = pSrc_ptr;
+		int* pTemp = temp;
+
+		const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8];
+		int i;
+		for (i = 8; i > 0; i--, pRow_tab++)
+		{
+			switch (*pRow_tab)
+			{
+			case 0: Row<0>::idct(pTemp, pSrc); break;
+			case 1: Row<1>::idct(pTemp, pSrc); break;
+			case 2: Row<2>::idct(pTemp, pSrc); break;
+			case 3: Row<3>::idct(pTemp, pSrc); break;
+			case 4: Row<4>::idct(pTemp, pSrc); break;
+			case 5: Row<5>::idct(pTemp, pSrc); break;
+			case 6: Row<6>::idct(pTemp, pSrc); break;
+			case 7: Row<7>::idct(pTemp, pSrc); break;
+			case 8: Row<8>::idct(pTemp, pSrc); break;
+			}
+
+			pSrc += 8;
+			pTemp += 8;
+		}
+
+		pTemp = temp;
+
+		const int nonzero_rows = s_idct_col_table[block_max_zag - 1];
+		for (i = 8; i > 0; i--)
+		{
+			switch (nonzero_rows)
+			{
+			case 1: Col<1>::idct(pDst_ptr, pTemp); break;
+			case 2: Col<2>::idct(pDst_ptr, pTemp); break;
+			case 3: Col<3>::idct(pDst_ptr, pTemp); break;
+			case 4: Col<4>::idct(pDst_ptr, pTemp); break;
+			case 5: Col<5>::idct(pDst_ptr, pTemp); break;
+			case 6: Col<6>::idct(pDst_ptr, pTemp); break;
+			case 7: Col<7>::idct(pDst_ptr, pTemp); break;
+			case 8: Col<8>::idct(pDst_ptr, pTemp); break;
+			}
+
+			pTemp++;
+			pDst_ptr++;
+		}
+	}
+
+	// Retrieve one character from the input stream.
+	inline uint jpeg_decoder::get_char()
+	{
+		// Any bytes remaining in buffer?
+		if (!m_in_buf_left)
+		{
+			// Try to get more bytes.
+			prep_in_buffer();
+			// Still nothing to get?
+			if (!m_in_buf_left)
+			{
+				// Pad the end of the stream with 0xFF 0xD9 (EOI marker)
+				int t = m_tem_flag;
+				m_tem_flag ^= 1;
+				if (t)
+					return 0xD9;
+				else
+					return 0xFF;
+			}
+		}
+
+		uint c = *m_pIn_buf_ofs++;
+		m_in_buf_left--;
+
+		return c;
+	}
+
+	// Variant of get_char() that also reports, through *pPadding_flag, whether the returned byte is synthesized padding (true) or real stream data (false).
+	inline uint jpeg_decoder::get_char(bool* pPadding_flag)
+	{
+		// Refill only when the buffer is empty.
+		if (m_in_buf_left == 0)
+			prep_in_buffer();
+
+		const bool exhausted = (m_in_buf_left == 0);
+		*pPadding_flag = exhausted;
+
+		if (exhausted)
+		{
+			// Alternate between 0xFF and 0xD9 so the padding reads as a run of EOI markers.
+			const int send_eoi = m_tem_flag;
+			m_tem_flag ^= 1;
+			return send_eoi ? 0xD9 : 0xFF;
+		}
+
+		// Real data: consume one byte from the buffer.
+		const uint next_byte = *m_pIn_buf_ofs;
+		m_pIn_buf_ofs++;
+		m_in_buf_left--;
+
+		return next_byte;
+	}
+
+	// Pushes byte q back so the next read returns it first. The cursor may step in front of the input buffer; per the original author another array is placed there to absorb that write.
+	inline void jpeg_decoder::stuff_char(uint8 q)
+	{
+		--m_pIn_buf_ofs;
+		*m_pIn_buf_ofs = q;
+		++m_in_buf_left;
+	}
+
+	// Reads one byte of entropy-coded data without consuming markers: the pair 0xFF 0x00 yields a single 0xFF,
+	// while 0xFF followed by any other byte is pushed back intact, so 0xFF keeps being returned once a marker is reached.
+	inline uint8 jpeg_decoder::get_octet()
+	{
+		bool is_pad;
+		int ch = get_char(&is_pad);
+		// Ordinary data byte.
+		if (ch != 0xFF)
+			return static_cast<uint8>(ch);
+
+		// 0xFF synthesized by end-of-stream padding: nothing to peek at.
+		if (is_pad)
+			return 0xFF;
+		// A real 0xFF: inspect the byte that follows it.
+		ch = get_char(&is_pad);
+		if (is_pad)
+		{
+			// Ran out of data right after the 0xFF; put the 0xFF back.
+			stuff_char(0xFF);
+			return 0xFF;
+		}
+
+		if (ch != 0x00)
+		{
+			// Marker: restore both bytes (second byte first) so the marker stays in the stream.
+			stuff_char(static_cast<uint8>(ch));
+			stuff_char(0xFF);
+		}
+		return 0xFF;
+	}
+
+	// Returns the next num_bits bits of input (the high-order bits of m_bit_buf, which the code treats as 32 bits wide) and advances past them; a request for 0 bits returns 0. Refills through get_char(), so marker bytes are read like any other data.
+	inline uint jpeg_decoder::get_bits(int num_bits)
+	{
+		if (!num_bits)
+			return 0;
+
+		uint i = m_bit_buf >> (32 - num_bits);	// result: the top num_bits bits of the buffer
+
+		if ((m_bits_left -= num_bits) <= 0)	// counter used up: two more bytes must be loaded
+		{
+			m_bit_buf <<= (num_bits += m_bits_left);	// num_bits now equals the pre-call m_bits_left; shift by that much first
+
+			uint c1 = get_char();
+			uint c2 = get_char();
+			m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2;	// new bytes fill the low 16 bits, the upper 16 are kept
+
+			m_bit_buf <<= -m_bits_left;	// shift by the rest of the request (m_bits_left is <= 0 here)
+
+			m_bits_left += 16;	// account for the 16 bits just loaded
+
+			assert(m_bits_left >= 0);
+		}
+		else
+			m_bit_buf <<= num_bits;
+
+		return i;
+	}
+
+	// Retrieves num_bits bits (at most 16, asserted) from the input stream; 0 bits returns 0. Markers will not be read into the input bit buffer: refills that could touch a 0xFF go through get_octet(), so an endless run of all 1's is returned when a marker is encountered.
+	inline uint jpeg_decoder::get_bits_no_markers(int num_bits)
+	{
+		if (!num_bits)
+			return 0;
+
+		assert(num_bits <= 16);
+
+		uint i = m_bit_buf >> (32 - num_bits);	// result: the top num_bits bits of the buffer
+
+		if ((m_bits_left -= num_bits) <= 0)	// counter used up: two more bytes must be loaded
+		{
+			m_bit_buf <<= (num_bits += m_bits_left);	// num_bits now equals the pre-call m_bits_left; shift by that much first
+
+			if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF))	// slow path: fewer than 2 bytes buffered, or a 0xFF is among the next two
+			{
+				uint c1 = get_octet();
+				uint c2 = get_octet();
+				m_bit_buf |= (c1 << 8) | c2;
+			}
+			else
+			{
+				m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1];	// fast path: take both bytes straight from the buffer
+				m_in_buf_left -= 2;
+				m_pIn_buf_ofs += 2;
+			}
+
+			m_bit_buf <<= -m_bits_left;	// shift by the rest of the request (m_bits_left is <= 0 here)
+
+			m_bits_left += 16;	// account for the 16 bits just loaded
+
+			assert(m_bits_left >= 0);
+		}
+		else
+			m_bit_buf <<= num_bits;
+
+		return i;
+	}
+
+	// Decodes one Huffman symbol with table pH and consumes its code bits. The top 8 bits of m_bit_buf index look_up[]; a negative entry means the code is longer, and tree[] is then walked one bit at a time. Stops decoding with JPGD_DECODE_ERROR if pH is null or the walk leaves the table.
+	inline int jpeg_decoder::huff_decode(huff_tables* pH)
+	{
+		if (!pH)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		int symbol;
+		// Check first 8-bits: do we have a complete symbol?
+		if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0)
+		{
+			// Decode more bits, use a tree traversal to find symbol.
+			int ofs = 23;
+			do
+			{
+				// Reject a runaway walk BEFORE ofs is used as a shift count (shifting by a negative amount is undefined behavior).
+				if (ofs < 0)
+					stop_decoding(JPGD_DECODE_ERROR);
+				unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1));
+				if (idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+					stop_decoding(JPGD_DECODE_ERROR);
+				symbol = pH->tree[idx];
+				ofs--;
+			} while (symbol < 0);
+
+			get_bits_no_markers(8 + (23 - ofs));
+		}
+		else
+		{
+			assert(symbol < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
+			get_bits_no_markers(pH->code_size[symbol]);
+		}
+
+		return symbol;
+	}
+
+	// Decodes one Huffman symbol with table pH and also fetches the extra bits that follow it into extra_bits. Uses look_up2[], whose entries pack the symbol (bits 0-7), the code size (bits 8+) and, when flag 0x8000 is set, the already-extracted extra bits (bits 16+). Stops decoding with JPGD_DECODE_ERROR if pH is null or the tree walk leaves the table.
+	inline int jpeg_decoder::huff_decode(huff_tables* pH, int& extra_bits)
+	{
+		int symbol;
+
+		if (!pH)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		// Check first 8-bits: do we have a complete symbol?
+		if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0)
+		{
+			// Use a tree traversal to find symbol.
+			int ofs = 23;
+			do
+			{
+				// Reject a runaway walk BEFORE ofs is used as a shift count (shifting by a negative amount is undefined behavior).
+				if (ofs < 0)
+					stop_decoding(JPGD_DECODE_ERROR);
+				unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1));
+				if (idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+					stop_decoding(JPGD_DECODE_ERROR);
+				symbol = pH->tree[idx];
+				ofs--;
+			} while (symbol < 0);
+
+			get_bits_no_markers(8 + (23 - ofs));
+
+			extra_bits = get_bits_no_markers(symbol & 0xF);
+		}
+		else
+		{
+			if (symbol & 0x8000)
+			{
+				// Packed entry: only the code bits are consumed here; the extra bits were stored in the table entry itself.
+				assert(((symbol >> 8) & 31) <= 15);
+				get_bits_no_markers((symbol >> 8) & 15);
+				extra_bits = symbol >> 16;
+			}
+			else
+			{
+				int code_size = (symbol >> 8) & 31;
+				int num_extra_bits = symbol & 0xF;
+				int bits = code_size + num_extra_bits;
+
+				if (bits <= 16)
+					extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1);	// one fetch covers code + extra bits; mask off the code
+				else
+				{
+					get_bits_no_markers(code_size);
+					extra_bits = get_bits_no_markers(num_extra_bits);
+				}
+			}
+
+			symbol &= 0xFF;
+		}
+
+		return symbol;
+	}
+
+	// Tables and macro used to fully decode the DPCM differences. For a bit count s (1..15), s_extend_test[s] is 2^(s-1) and s_extend_offset[s] is -(2^s - 1); JPGD_HUFF_EXTEND adds the offset to any value below the test threshold and leaves other values unchanged.
+	static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+	static const int s_extend_offset[16] = { 0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767 };
+	//static const int s_extend_mask[] = { 0, (1 << 0), (1 << 1), (1 << 2), (1 << 3), (1 << 4), (1 << 5), (1 << 6), (1 << 7), (1 << 8), (1 << 9), (1 << 10), (1 << 11), (1 << 12), (1 << 13), (1 << 14), (1 << 15), (1 << 16) };
+
 #define JPGD_HUFF_EXTEND(x, s) (((x) < s_extend_test[s & 15]) ? ((x) + s_extend_offset[s & 15]) : (x))
 
-// Clamps a value between 0-255.
-inline uint8 jpeg_decoder::clamp(int i)
-{
-  if (static_cast<uint>(i) > 255)
-    i = (((~i) >> 31) & 0xFF);
-
-  return static_cast<uint8>(i);
-}
-
-namespace DCT_Upsample
-{
-  struct Matrix44
-  {
-    typedef int Element_Type;
-    enum { NUM_ROWS = 4, NUM_COLS = 4 };
-
-    Element_Type v[NUM_ROWS][NUM_COLS];
-
-    inline int rows() const { return NUM_ROWS; }
-    inline int cols() const { return NUM_COLS; }
-
-    inline const Element_Type & at(int r, int c) const { return v[r][c]; }
-    inline       Element_Type & at(int r, int c)       { return v[r][c]; }
-
-    inline Matrix44() { }
-
-    inline Matrix44& operator += (const Matrix44& a)
-    {
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        at(r, 0) += a.at(r, 0);
-        at(r, 1) += a.at(r, 1);
-        at(r, 2) += a.at(r, 2);
-        at(r, 3) += a.at(r, 3);
-      }
-      return *this;
-    }
-
-    inline Matrix44& operator -= (const Matrix44& a)
-    {
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        at(r, 0) -= a.at(r, 0);
-        at(r, 1) -= a.at(r, 1);
-        at(r, 2) -= a.at(r, 2);
-        at(r, 3) -= a.at(r, 3);
-      }
-      return *this;
-    }
-
-    friend inline Matrix44 operator + (const Matrix44& a, const Matrix44& b)
-    {
-      Matrix44 ret;
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        ret.at(r, 0) = a.at(r, 0) + b.at(r, 0);
-        ret.at(r, 1) = a.at(r, 1) + b.at(r, 1);
-        ret.at(r, 2) = a.at(r, 2) + b.at(r, 2);
-        ret.at(r, 3) = a.at(r, 3) + b.at(r, 3);
-      }
-      return ret;
-    }
-
-    friend inline Matrix44 operator - (const Matrix44& a, const Matrix44& b)
-    {
-      Matrix44 ret;
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        ret.at(r, 0) = a.at(r, 0) - b.at(r, 0);
-        ret.at(r, 1) = a.at(r, 1) - b.at(r, 1);
-        ret.at(r, 2) = a.at(r, 2) - b.at(r, 2);
-        ret.at(r, 3) = a.at(r, 3) - b.at(r, 3);
-      }
-      return ret;
-    }
-
-    static inline void add_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
-    {
-      for (int r = 0; r < 4; r++)
-      {
-        pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) + b.at(r, 0));
-        pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) + b.at(r, 1));
-        pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) + b.at(r, 2));
-        pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) + b.at(r, 3));
-      }
-    }
-
-    static inline void sub_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
-    {
-      for (int r = 0; r < 4; r++)
-      {
-        pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) - b.at(r, 0));
-        pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) - b.at(r, 1));
-        pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) - b.at(r, 2));
-        pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) - b.at(r, 3));
-      }
-    }
-  };
-
-  const int FRACT_BITS = 10;
-  const int SCALE = 1 << FRACT_BITS;
-
-  typedef int Temp_Type;
-  #define D(i) (((i) + (SCALE >> 1)) >> FRACT_BITS)
-  #define F(i) ((int)((i) * SCALE + .5f))
-
-  // Any decent C++ compiler will optimize this at compile time to a 0, or an array access.
-  #define AT(c, r) ((((c)>=NUM_COLS)||((r)>=NUM_ROWS)) ? 0 : pSrc[(c)+(r)*8])
-
-  // NUM_ROWS/NUM_COLS = # of non-zero rows/cols in input matrix
-  template<int NUM_ROWS, int NUM_COLS>
-  struct P_Q
-  {
-    static void calc(Matrix44& P, Matrix44& Q, const jpgd_block_t* pSrc)
-    {
-      // 4x8 = 4x8 times 8x8, matrix 0 is constant
-      const Temp_Type X000 = AT(0, 0);
-      const Temp_Type X001 = AT(0, 1);
-      const Temp_Type X002 = AT(0, 2);
-      const Temp_Type X003 = AT(0, 3);
-      const Temp_Type X004 = AT(0, 4);
-      const Temp_Type X005 = AT(0, 5);
-      const Temp_Type X006 = AT(0, 6);
-      const Temp_Type X007 = AT(0, 7);
-      const Temp_Type X010 = D(F(0.415735f) * AT(1, 0) + F(0.791065f) * AT(3, 0) + F(-0.352443f) * AT(5, 0) + F(0.277785f) * AT(7, 0));
-      const Temp_Type X011 = D(F(0.415735f) * AT(1, 1) + F(0.791065f) * AT(3, 1) + F(-0.352443f) * AT(5, 1) + F(0.277785f) * AT(7, 1));
-      const Temp_Type X012 = D(F(0.415735f) * AT(1, 2) + F(0.791065f) * AT(3, 2) + F(-0.352443f) * AT(5, 2) + F(0.277785f) * AT(7, 2));
-      const Temp_Type X013 = D(F(0.415735f) * AT(1, 3) + F(0.791065f) * AT(3, 3) + F(-0.352443f) * AT(5, 3) + F(0.277785f) * AT(7, 3));
-      const Temp_Type X014 = D(F(0.415735f) * AT(1, 4) + F(0.791065f) * AT(3, 4) + F(-0.352443f) * AT(5, 4) + F(0.277785f) * AT(7, 4));
-      const Temp_Type X015 = D(F(0.415735f) * AT(1, 5) + F(0.791065f) * AT(3, 5) + F(-0.352443f) * AT(5, 5) + F(0.277785f) * AT(7, 5));
-      const Temp_Type X016 = D(F(0.415735f) * AT(1, 6) + F(0.791065f) * AT(3, 6) + F(-0.352443f) * AT(5, 6) + F(0.277785f) * AT(7, 6));
-      const Temp_Type X017 = D(F(0.415735f) * AT(1, 7) + F(0.791065f) * AT(3, 7) + F(-0.352443f) * AT(5, 7) + F(0.277785f) * AT(7, 7));
-      const Temp_Type X020 = AT(4, 0);
-      const Temp_Type X021 = AT(4, 1);
-      const Temp_Type X022 = AT(4, 2);
-      const Temp_Type X023 = AT(4, 3);
-      const Temp_Type X024 = AT(4, 4);
-      const Temp_Type X025 = AT(4, 5);
-      const Temp_Type X026 = AT(4, 6);
-      const Temp_Type X027 = AT(4, 7);
-      const Temp_Type X030 = D(F(0.022887f) * AT(1, 0) + F(-0.097545f) * AT(3, 0) + F(0.490393f) * AT(5, 0) + F(0.865723f) * AT(7, 0));
-      const Temp_Type X031 = D(F(0.022887f) * AT(1, 1) + F(-0.097545f) * AT(3, 1) + F(0.490393f) * AT(5, 1) + F(0.865723f) * AT(7, 1));
-      const Temp_Type X032 = D(F(0.022887f) * AT(1, 2) + F(-0.097545f) * AT(3, 2) + F(0.490393f) * AT(5, 2) + F(0.865723f) * AT(7, 2));
-      const Temp_Type X033 = D(F(0.022887f) * AT(1, 3) + F(-0.097545f) * AT(3, 3) + F(0.490393f) * AT(5, 3) + F(0.865723f) * AT(7, 3));
-      const Temp_Type X034 = D(F(0.022887f) * AT(1, 4) + F(-0.097545f) * AT(3, 4) + F(0.490393f) * AT(5, 4) + F(0.865723f) * AT(7, 4));
-      const Temp_Type X035 = D(F(0.022887f) * AT(1, 5) + F(-0.097545f) * AT(3, 5) + F(0.490393f) * AT(5, 5) + F(0.865723f) * AT(7, 5));
-      const Temp_Type X036 = D(F(0.022887f) * AT(1, 6) + F(-0.097545f) * AT(3, 6) + F(0.490393f) * AT(5, 6) + F(0.865723f) * AT(7, 6));
-      const Temp_Type X037 = D(F(0.022887f) * AT(1, 7) + F(-0.097545f) * AT(3, 7) + F(0.490393f) * AT(5, 7) + F(0.865723f) * AT(7, 7));
-
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      P.at(0, 0) = X000;
-      P.at(0, 1) = D(X001 * F(0.415735f) + X003 * F(0.791065f) + X005 * F(-0.352443f) + X007 * F(0.277785f));
-      P.at(0, 2) = X004;
-      P.at(0, 3) = D(X001 * F(0.022887f) + X003 * F(-0.097545f) + X005 * F(0.490393f) + X007 * F(0.865723f));
-      P.at(1, 0) = X010;
-      P.at(1, 1) = D(X011 * F(0.415735f) + X013 * F(0.791065f) + X015 * F(-0.352443f) + X017 * F(0.277785f));
-      P.at(1, 2) = X014;
-      P.at(1, 3) = D(X011 * F(0.022887f) + X013 * F(-0.097545f) + X015 * F(0.490393f) + X017 * F(0.865723f));
-      P.at(2, 0) = X020;
-      P.at(2, 1) = D(X021 * F(0.415735f) + X023 * F(0.791065f) + X025 * F(-0.352443f) + X027 * F(0.277785f));
-      P.at(2, 2) = X024;
-      P.at(2, 3) = D(X021 * F(0.022887f) + X023 * F(-0.097545f) + X025 * F(0.490393f) + X027 * F(0.865723f));
-      P.at(3, 0) = X030;
-      P.at(3, 1) = D(X031 * F(0.415735f) + X033 * F(0.791065f) + X035 * F(-0.352443f) + X037 * F(0.277785f));
-      P.at(3, 2) = X034;
-      P.at(3, 3) = D(X031 * F(0.022887f) + X033 * F(-0.097545f) + X035 * F(0.490393f) + X037 * F(0.865723f));
-      // 40 muls 24 adds
-
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      Q.at(0, 0) = D(X001 * F(0.906127f) + X003 * F(-0.318190f) + X005 * F(0.212608f) + X007 * F(-0.180240f));
-      Q.at(0, 1) = X002;
-      Q.at(0, 2) = D(X001 * F(-0.074658f) + X003 * F(0.513280f) + X005 * F(0.768178f) + X007 * F(-0.375330f));
-      Q.at(0, 3) = X006;
-      Q.at(1, 0) = D(X011 * F(0.906127f) + X013 * F(-0.318190f) + X015 * F(0.212608f) + X017 * F(-0.180240f));
-      Q.at(1, 1) = X012;
-      Q.at(1, 2) = D(X011 * F(-0.074658f) + X013 * F(0.513280f) + X015 * F(0.768178f) + X017 * F(-0.375330f));
-      Q.at(1, 3) = X016;
-      Q.at(2, 0) = D(X021 * F(0.906127f) + X023 * F(-0.318190f) + X025 * F(0.212608f) + X027 * F(-0.180240f));
-      Q.at(2, 1) = X022;
-      Q.at(2, 2) = D(X021 * F(-0.074658f) + X023 * F(0.513280f) + X025 * F(0.768178f) + X027 * F(-0.375330f));
-      Q.at(2, 3) = X026;
-      Q.at(3, 0) = D(X031 * F(0.906127f) + X033 * F(-0.318190f) + X035 * F(0.212608f) + X037 * F(-0.180240f));
-      Q.at(3, 1) = X032;
-      Q.at(3, 2) = D(X031 * F(-0.074658f) + X033 * F(0.513280f) + X035 * F(0.768178f) + X037 * F(-0.375330f));
-      Q.at(3, 3) = X036;
-      // 40 muls 24 adds
-    }
-  };
-
-  template<int NUM_ROWS, int NUM_COLS>
-  struct R_S
-  {
-    static void calc(Matrix44& R, Matrix44& S, const jpgd_block_t* pSrc)
-    {
-      // 4x8 = 4x8 times 8x8, matrix 0 is constant
-      const Temp_Type X100 = D(F(0.906127f) * AT(1, 0) + F(-0.318190f) * AT(3, 0) + F(0.212608f) * AT(5, 0) + F(-0.180240f) * AT(7, 0));
-      const Temp_Type X101 = D(F(0.906127f) * AT(1, 1) + F(-0.318190f) * AT(3, 1) + F(0.212608f) * AT(5, 1) + F(-0.180240f) * AT(7, 1));
-      const Temp_Type X102 = D(F(0.906127f) * AT(1, 2) + F(-0.318190f) * AT(3, 2) + F(0.212608f) * AT(5, 2) + F(-0.180240f) * AT(7, 2));
-      const Temp_Type X103 = D(F(0.906127f) * AT(1, 3) + F(-0.318190f) * AT(3, 3) + F(0.212608f) * AT(5, 3) + F(-0.180240f) * AT(7, 3));
-      const Temp_Type X104 = D(F(0.906127f) * AT(1, 4) + F(-0.318190f) * AT(3, 4) + F(0.212608f) * AT(5, 4) + F(-0.180240f) * AT(7, 4));
-      const Temp_Type X105 = D(F(0.906127f) * AT(1, 5) + F(-0.318190f) * AT(3, 5) + F(0.212608f) * AT(5, 5) + F(-0.180240f) * AT(7, 5));
-      const Temp_Type X106 = D(F(0.906127f) * AT(1, 6) + F(-0.318190f) * AT(3, 6) + F(0.212608f) * AT(5, 6) + F(-0.180240f) * AT(7, 6));
-      const Temp_Type X107 = D(F(0.906127f) * AT(1, 7) + F(-0.318190f) * AT(3, 7) + F(0.212608f) * AT(5, 7) + F(-0.180240f) * AT(7, 7));
-      const Temp_Type X110 = AT(2, 0);
-      const Temp_Type X111 = AT(2, 1);
-      const Temp_Type X112 = AT(2, 2);
-      const Temp_Type X113 = AT(2, 3);
-      const Temp_Type X114 = AT(2, 4);
-      const Temp_Type X115 = AT(2, 5);
-      const Temp_Type X116 = AT(2, 6);
-      const Temp_Type X117 = AT(2, 7);
-      const Temp_Type X120 = D(F(-0.074658f) * AT(1, 0) + F(0.513280f) * AT(3, 0) + F(0.768178f) * AT(5, 0) + F(-0.375330f) * AT(7, 0));
-      const Temp_Type X121 = D(F(-0.074658f) * AT(1, 1) + F(0.513280f) * AT(3, 1) + F(0.768178f) * AT(5, 1) + F(-0.375330f) * AT(7, 1));
-      const Temp_Type X122 = D(F(-0.074658f) * AT(1, 2) + F(0.513280f) * AT(3, 2) + F(0.768178f) * AT(5, 2) + F(-0.375330f) * AT(7, 2));
-      const Temp_Type X123 = D(F(-0.074658f) * AT(1, 3) + F(0.513280f) * AT(3, 3) + F(0.768178f) * AT(5, 3) + F(-0.375330f) * AT(7, 3));
-      const Temp_Type X124 = D(F(-0.074658f) * AT(1, 4) + F(0.513280f) * AT(3, 4) + F(0.768178f) * AT(5, 4) + F(-0.375330f) * AT(7, 4));
-      const Temp_Type X125 = D(F(-0.074658f) * AT(1, 5) + F(0.513280f) * AT(3, 5) + F(0.768178f) * AT(5, 5) + F(-0.375330f) * AT(7, 5));
-      const Temp_Type X126 = D(F(-0.074658f) * AT(1, 6) + F(0.513280f) * AT(3, 6) + F(0.768178f) * AT(5, 6) + F(-0.375330f) * AT(7, 6));
-      const Temp_Type X127 = D(F(-0.074658f) * AT(1, 7) + F(0.513280f) * AT(3, 7) + F(0.768178f) * AT(5, 7) + F(-0.375330f) * AT(7, 7));
-      const Temp_Type X130 = AT(6, 0);
-      const Temp_Type X131 = AT(6, 1);
-      const Temp_Type X132 = AT(6, 2);
-      const Temp_Type X133 = AT(6, 3);
-      const Temp_Type X134 = AT(6, 4);
-      const Temp_Type X135 = AT(6, 5);
-      const Temp_Type X136 = AT(6, 6);
-      const Temp_Type X137 = AT(6, 7);
-      // 80 muls 48 adds
-
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      R.at(0, 0) = X100;
-      R.at(0, 1) = D(X101 * F(0.415735f) + X103 * F(0.791065f) + X105 * F(-0.352443f) + X107 * F(0.277785f));
-      R.at(0, 2) = X104;
-      R.at(0, 3) = D(X101 * F(0.022887f) + X103 * F(-0.097545f) + X105 * F(0.490393f) + X107 * F(0.865723f));
-      R.at(1, 0) = X110;
-      R.at(1, 1) = D(X111 * F(0.415735f) + X113 * F(0.791065f) + X115 * F(-0.352443f) + X117 * F(0.277785f));
-      R.at(1, 2) = X114;
-      R.at(1, 3) = D(X111 * F(0.022887f) + X113 * F(-0.097545f) + X115 * F(0.490393f) + X117 * F(0.865723f));
-      R.at(2, 0) = X120;
-      R.at(2, 1) = D(X121 * F(0.415735f) + X123 * F(0.791065f) + X125 * F(-0.352443f) + X127 * F(0.277785f));
-      R.at(2, 2) = X124;
-      R.at(2, 3) = D(X121 * F(0.022887f) + X123 * F(-0.097545f) + X125 * F(0.490393f) + X127 * F(0.865723f));
-      R.at(3, 0) = X130;
-      R.at(3, 1) = D(X131 * F(0.415735f) + X133 * F(0.791065f) + X135 * F(-0.352443f) + X137 * F(0.277785f));
-      R.at(3, 2) = X134;
-      R.at(3, 3) = D(X131 * F(0.022887f) + X133 * F(-0.097545f) + X135 * F(0.490393f) + X137 * F(0.865723f));
-      // 40 muls 24 adds
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      S.at(0, 0) = D(X101 * F(0.906127f) + X103 * F(-0.318190f) + X105 * F(0.212608f) + X107 * F(-0.180240f));
-      S.at(0, 1) = X102;
-      S.at(0, 2) = D(X101 * F(-0.074658f) + X103 * F(0.513280f) + X105 * F(0.768178f) + X107 * F(-0.375330f));
-      S.at(0, 3) = X106;
-      S.at(1, 0) = D(X111 * F(0.906127f) + X113 * F(-0.318190f) + X115 * F(0.212608f) + X117 * F(-0.180240f));
-      S.at(1, 1) = X112;
-      S.at(1, 2) = D(X111 * F(-0.074658f) + X113 * F(0.513280f) + X115 * F(0.768178f) + X117 * F(-0.375330f));
-      S.at(1, 3) = X116;
-      S.at(2, 0) = D(X121 * F(0.906127f) + X123 * F(-0.318190f) + X125 * F(0.212608f) + X127 * F(-0.180240f));
-      S.at(2, 1) = X122;
-      S.at(2, 2) = D(X121 * F(-0.074658f) + X123 * F(0.513280f) + X125 * F(0.768178f) + X127 * F(-0.375330f));
-      S.at(2, 3) = X126;
-      S.at(3, 0) = D(X131 * F(0.906127f) + X133 * F(-0.318190f) + X135 * F(0.212608f) + X137 * F(-0.180240f));
-      S.at(3, 1) = X132;
-      S.at(3, 2) = D(X131 * F(-0.074658f) + X133 * F(0.513280f) + X135 * F(0.768178f) + X137 * F(-0.375330f));
-      S.at(3, 3) = X136;
-      // 40 muls 24 adds
-    }
-  };
-} // end namespace DCT_Upsample
-
-// Unconditionally frees all allocated m_blocks.
-void jpeg_decoder::free_all_blocks()
-{
-  m_pStream = NULL;
-  for (mem_block *b = m_pMem_blocks; b; )
-  {
-    mem_block *n = b->m_pNext;
-    jpgd_free(b);
-    b = n;
-  }
-  m_pMem_blocks = NULL;
-}
-
-// This method handles all errors. It will never return.
-// It could easily be changed to use C++ exceptions.
-JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status)
-{
-  m_error_code = status;
-  free_all_blocks();
-  longjmp(m_jmp_state, status);
-}
-
-void *jpeg_decoder::alloc(size_t nSize, bool zero)
-{
-  nSize = (JPGD_MAX(nSize, 1) + 3) & ~3;
-  char *rv = NULL;
-  for (mem_block *b = m_pMem_blocks; b; b = b->m_pNext)
-  {
-    if ((b->m_used_count + nSize) <= b->m_size)
-    {
-      rv = b->m_data + b->m_used_count;
-      b->m_used_count += nSize;
-      break;
-    }
-  }
-  if (!rv)
-  {
-    int capacity = JPGD_MAX(32768 - 256, (nSize + 2047) & ~2047);
-    mem_block *b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity);
-    if (!b) { stop_decoding(JPGD_NOTENOUGHMEM); }
-    b->m_pNext = m_pMem_blocks; m_pMem_blocks = b;
-    b->m_used_count = nSize;
-    b->m_size = capacity;
-    rv = b->m_data;
-  }
-  if (zero) memset(rv, 0, nSize);
-  return rv;
-}
-
-void jpeg_decoder::word_clear(void *p, uint16 c, uint n)
-{
-  uint8 *pD = (uint8*)p;
-  const uint8 l = c & 0xFF, h = (c >> 8) & 0xFF;
-  while (n)
-  {
-    pD[0] = l; pD[1] = h; pD += 2;
-    n--;
-  }
-}
-
-// Refill the input buffer.
-// This method will sit in a loop until (A) the buffer is full or (B)
-// the stream's read() method reports and end of file condition.
-void jpeg_decoder::prep_in_buffer()
-{
-  m_in_buf_left = 0;
-  m_pIn_buf_ofs = m_in_buf;
-
-  if (m_eof_flag)
-    return;
-
-  do
-  {
-    int bytes_read = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
-    if (bytes_read == -1)
-      stop_decoding(JPGD_STREAM_READ);
-
-    m_in_buf_left += bytes_read;
-  } while ((m_in_buf_left < JPGD_IN_BUF_SIZE) && (!m_eof_flag));
-
-  m_total_bytes_read += m_in_buf_left;
-
-  // Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid).
-  // (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
-  word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);
-}
-
-// Read a Huffman code table.
-void jpeg_decoder::read_dht_marker()
-{
-  int i, index, count;
-  uint8 huff_num[17];
-  uint8 huff_val[256];
-
-  uint num_left = get_bits(16);
-
-  if (num_left < 2)
-    stop_decoding(JPGD_BAD_DHT_MARKER);
-
-  num_left -= 2;
-
-  while (num_left)
-  {
-    index = get_bits(8);
-
-    huff_num[0] = 0;
-
-    count = 0;
-
-    for (i = 1; i <= 16; i++)
-    {
-      huff_num[i] = static_cast<uint8>(get_bits(8));
-      count += huff_num[i];
-    }
-
-    if (count > 255)
-      stop_decoding(JPGD_BAD_DHT_COUNTS);
-
-    for (i = 0; i < count; i++)
-      huff_val[i] = static_cast<uint8>(get_bits(8));
+	// Releases every block on the m_pMem_blocks list and clears the input stream pointer.
+	void jpeg_decoder::free_all_blocks()
+	{
+		m_pStream = nullptr;
+		// Pop blocks off the head until the list is empty.
+		while (m_pMem_blocks)
+		{
+			mem_block* doomed = m_pMem_blocks;
+			m_pMem_blocks = doomed->m_pNext;	// unlink first: the link cannot be read once the block is freed
+			jpgd_free(doomed);
+		}
+	}
+
+	// This method handles all errors: it stores status in m_error_code, releases every block via free_all_blocks(), then longjmp()s to m_jmp_state with status as the jump value. It will never return.
+	// It could easily be changed to use C++ exceptions. NOTE(review): longjmp does not unwind C++ objects, so callers presumably hold no automatic objects that need destruction — confirm.
+	JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status)
+	{
+		m_error_code = status;
+		free_all_blocks();	// also sets m_pStream to nullptr
+		longjmp(m_jmp_state, status);
+	}
+		
+	// Bump allocator: returns nSize bytes (rounded up to a multiple of 4, minimum 4) taken from the first block on m_pMem_blocks with enough room, or from a newly malloc'd block. The memory is zero-filled when 'zero' is set. Calls stop_decoding(JPGD_NOTENOUGHMEM), which never returns, if the system allocation fails.
+	void* jpeg_decoder::alloc(size_t nSize, bool zero)
+	{
+		nSize = (JPGD_MAX(nSize, 1) + 3) & ~3;
+		char* rv = nullptr;
+		for (mem_block* b = m_pMem_blocks; b; b = b->m_pNext)
+		{
+			if ((b->m_used_count + nSize) <= b->m_size)
+			{
+				rv = b->m_data + b->m_used_count;
+				b->m_used_count += nSize;
+				break;
+			}
+		}
+		if (!rv)
+		{
+			// Keep the capacity in size_t: holding it in an int could truncate a very large request and under-allocate the block.
+			const size_t capacity = JPGD_MAX((size_t)(32768 - 256), (nSize + 2047) & ~(size_t)2047);
+			mem_block* b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity);
+			if (!b)
+				stop_decoding(JPGD_NOTENOUGHMEM);
+
+			b->m_pNext = m_pMem_blocks;	// push the new block on the front of the list
+			m_pMem_blocks = b;
+			b->m_used_count = nSize;
+			b->m_size = capacity;
+			rv = b->m_data;
+		}
+		if (zero) memset(rv, 0, nSize);
+		return rv;
+	}
+
+	void* jpeg_decoder::alloc_aligned(size_t nSize, uint32_t align, bool zero)
+	{
+		// Over-allocates by align - 1 bytes, then rounds the address up to the next multiple of align (asserted to be a power of two).
+		assert((align >= 1U) && ((align & (align - 1U)) == 0U));
+		const uintptr_t mask = (uintptr_t)(align - 1U);
+		return (void *)(((uintptr_t)alloc(nSize + align - 1U, zero) + mask) & ~mask);
+	}
+
+	// Stores the 16-bit value c, low byte first, n times starting at p (2 * n bytes are written in total).
+	void jpeg_decoder::word_clear(void* p, uint16 c, uint n)
+	{
+		uint8* out = (uint8*)p;
+		const uint8 lo_byte = c & 0xFF;
+		const uint8 hi_byte = (c >> 8) & 0xFF;
+		for (uint k = 0; k < n; k++, out += 2)
+		{
+			out[0] = lo_byte;
+			out[1] = hi_byte;
+		}
+	}
+
+	// Refill the input buffer.
+	// This method will sit in a loop until (A) the buffer is full or (B)
+	// the stream's read() method reports an end of file condition. A read() result of -1 aborts decoding with JPGD_STREAM_READ.
+	void jpeg_decoder::prep_in_buffer()
+	{
+		m_in_buf_left = 0;
+		m_pIn_buf_ofs = m_in_buf;
+
+		if (m_eof_flag)	// stream already ended: leave the buffer empty
+			return;
+
+		do
+		{
+			int bytes_read = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
+			if (bytes_read == -1)
+				stop_decoding(JPGD_STREAM_READ);
+
+			m_in_buf_left += bytes_read;
+		} while ((m_in_buf_left < JPGD_IN_BUF_SIZE) && (!m_eof_flag));
+
+		m_total_bytes_read += m_in_buf_left;
+
+		// Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid).
+		// (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
+		word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);	// 64 words = 128 bytes written after the data just read; presumably m_in_buf is declared with that much slack — TODO confirm in jpgd.h
+	}
+
+	// Read a Huffman code table (DHT) segment, which may define several tables. For each table: reads the class/id byte, the 16 per-length code counts and the symbol values, rejects totals above 255, duplicate symbols, segment-length mismatches and invalid class/id values, then copies the counts and values into m_huff_num[]/m_huff_val[] (allocated on first use). All failures go through stop_decoding(), which never returns.
+	void jpeg_decoder::read_dht_marker()
+	{
+		int i, index, count;
+		uint8 huff_num[17];
+		uint8 huff_val[256];
+
+		uint num_left = get_bits(16);
+
+		if (num_left < 2)
+			stop_decoding(JPGD_BAD_DHT_MARKER);
+
+		num_left -= 2;
+
+		while (num_left)
+		{
+			index = get_bits(8);
+
+			huff_num[0] = 0;
+
+			count = 0;
+
+			for (i = 1; i <= 16; i++)
+			{
+				huff_num[i] = static_cast<uint8>(get_bits(8));
+				count += huff_num[i];
+			}
+
+			if (count > 255)
+				stop_decoding(JPGD_BAD_DHT_COUNTS);
+
+			bool symbol_present[256];
+			memset(symbol_present, 0, sizeof(symbol_present));
+
+			for (i = 0; i < count; i++)
+			{
+				const int s = get_bits(8);
 
-    i = 1 + 16 + count;
+				// Check for obviously bogus tables.
+				if (symbol_present[s])
+					stop_decoding(JPGD_BAD_DHT_COUNTS);
 
-    if (num_left < (uint)i)
-      stop_decoding(JPGD_BAD_DHT_MARKER);
+				huff_val[i] = static_cast<uint8_t>(s);
+				symbol_present[s] = true;
+			}
 
-    num_left -= i;
+			i = 1 + 16 + count;	// bytes this table occupied: class/id byte + 16 counts + symbols
 
-    if ((index & 0x10) > 0x10)
-      stop_decoding(JPGD_BAD_DHT_INDEX);
+			if (num_left < (uint)i)
+				stop_decoding(JPGD_BAD_DHT_MARKER);
 
-    index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);
+			num_left -= i;
 
-    if (index >= JPGD_MAX_HUFF_TABLES)
-      stop_decoding(JPGD_BAD_DHT_INDEX);
+			if ((index & 0xF0) > 0x10)	// table class (high nibble) may only be 0 or 1; masking with 0x10 alone could never exceed 0x10
+				stop_decoding(JPGD_BAD_DHT_INDEX);
 
-    if (!m_huff_num[index])
-      m_huff_num[index] = (uint8 *)alloc(17);
+			index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);	// class-1 tables are stored in the upper half of the table arrays
 
-    if (!m_huff_val[index])
-      m_huff_val[index] = (uint8 *)alloc(256);
+			if (index >= JPGD_MAX_HUFF_TABLES)
+				stop_decoding(JPGD_BAD_DHT_INDEX);
 
-    m_huff_ac[index] = (index & 0x10) != 0;
-    memcpy(m_huff_num[index], huff_num, 17);
-    memcpy(m_huff_val[index], huff_val, 256);
-  }
-}
+			if (!m_huff_num[index])
+				m_huff_num[index] = (uint8*)alloc(17);
 
-// Read a quantization table.
-void jpeg_decoder::read_dqt_marker()
-{
-  int n, i, prec;
-  uint num_left;
-  uint temp;
+			if (!m_huff_val[index])
+				m_huff_val[index] = (uint8*)alloc(256);
 
-  num_left = get_bits(16);
+			m_huff_ac[index] = (index & 0x10) != 0;	// NOTE(review): index was remapped above, so bit 0x10 here is no longer the class bit — confirm this is intended
+			memcpy(m_huff_num[index], huff_num, 17);
+			memcpy(m_huff_val[index], huff_val, 256);
+		}
+	}
 
-  if (num_left < 2)
-    stop_decoding(JPGD_BAD_DQT_MARKER);
+	// Read a quantization table (DQT) segment, which may define several tables. Each table starts with a byte holding the precision flag (high nibble) and table number (low nibble), followed by 64 entries of one byte each, or two bytes each when the precision flag is non-zero. Invalid table numbers or lengths end decoding via stop_decoding().
+	void jpeg_decoder::read_dqt_marker()
+	{
+		int n, i, prec;
+		uint num_left;
+		uint temp;
 
-  num_left -= 2;
+		num_left = get_bits(16);
 
-  while (num_left)
-  {
-    n = get_bits(8);
-    prec = n >> 4;
-    n &= 0x0F;
+		if (num_left < 2)
+			stop_decoding(JPGD_BAD_DQT_MARKER);
 
-    if (n >= JPGD_MAX_QUANT_TABLES)
-      stop_decoding(JPGD_BAD_DQT_TABLE);
+		num_left -= 2;
 
-    if (!m_quant[n])
-      m_quant[n] = (jpgd_quant_t *)alloc(64 * sizeof(jpgd_quant_t));
+		while (num_left)
+		{
+			n = get_bits(8);
+			prec = n >> 4;	// non-zero selects two-byte entries
+			n &= 0x0F;	// destination table number
 
-    // read quantization entries, in zag order
-    for (i = 0; i < 64; i++)
-    {
-      temp = get_bits(8);
+			if (n >= JPGD_MAX_QUANT_TABLES)
+				stop_decoding(JPGD_BAD_DQT_TABLE);
 
-      if (prec)
-        temp = (temp << 8) + get_bits(8);
+			if (!m_quant[n])
+				m_quant[n] = (jpgd_quant_t*)alloc(64 * sizeof(jpgd_quant_t));
 
-			m_quant[n][i] = static_cast<jpgd_quant_t>(temp);
-    }
+			// read quantization entries, in zag order
+			for (i = 0; i < 64; i++)
+			{
+				temp = get_bits(8);
 
-    i = 64 + 1;
+				if (prec)
+					temp = (temp << 8) + get_bits(8);	// first byte read is the high byte
 
-    if (prec)
-      i += 64;
+				m_quant[n][i] = static_cast<jpgd_quant_t>(temp);
+			}
 
-    if (num_left < (uint)i)
-      stop_decoding(JPGD_BAD_DQT_LENGTH);
+			i = 64 + 1;	// bytes this table occupied: 64 entries + the precision/number byte
 
-    num_left -= i;
-  }
-}
+			if (prec)
+				i += 64;
 
-// Read the start of frame (SOF) marker.
-void jpeg_decoder::read_sof_marker()
-{
-  int i;
-  uint num_left;
+			if (num_left < (uint)i)
+				stop_decoding(JPGD_BAD_DQT_LENGTH);
 
-  num_left = get_bits(16);
+			num_left -= i;
+		}
+	}
 
-  if (get_bits(8) != 8)   /* precision: sorry, only 8-bit precision is supported right now */
-    stop_decoding(JPGD_BAD_PRECISION);
+	// Read the start of frame (SOF) marker: checks for 8-bit precision, validates image height and width against JPGD_MAX_HEIGHT/JPGD_MAX_WIDTH, the component count and the segment length (8 + 3 bytes per component), then records each component's id, sampling factors (each must be 1 or 2) and quantization table number. Any violation ends decoding via stop_decoding().
+	void jpeg_decoder::read_sof_marker()
+	{
+		int i;
+		uint num_left;
 
-  m_image_y_size = get_bits(16);
+		num_left = get_bits(16);
 
-  if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
-    stop_decoding(JPGD_BAD_HEIGHT);
+		/* precision: sorry, only 8-bit precision is supported */
+		if (get_bits(8) != 8)
+			stop_decoding(JPGD_BAD_PRECISION);
 
-  m_image_x_size = get_bits(16);
+		m_image_y_size = get_bits(16);
 
-  if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
-    stop_decoding(JPGD_BAD_WIDTH);
+		if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
+			stop_decoding(JPGD_BAD_HEIGHT);
 
-  m_comps_in_frame = get_bits(8);
+		m_image_x_size = get_bits(16);
 
-  if (m_comps_in_frame > JPGD_MAX_COMPONENTS)
-    stop_decoding(JPGD_TOO_MANY_COMPONENTS);
+		if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
+			stop_decoding(JPGD_BAD_WIDTH);
 
-  if (num_left != (uint)(m_comps_in_frame * 3 + 8))
-    stop_decoding(JPGD_BAD_SOF_LENGTH);
+		m_comps_in_frame = get_bits(8);
 
-  for (i = 0; i < m_comps_in_frame; i++)
-  {
-    m_comp_ident[i]  = get_bits(8);
-    m_comp_h_samp[i] = get_bits(4);
-    m_comp_v_samp[i] = get_bits(4);
-    m_comp_quant[i]  = get_bits(8);
-  }
-}
+		if (m_comps_in_frame > JPGD_MAX_COMPONENTS)	// NOTE(review): a count of 0 passes this check — confirm it is rejected elsewhere
+			stop_decoding(JPGD_TOO_MANY_COMPONENTS);
 
-// Used to skip unrecognized markers.
-void jpeg_decoder::skip_variable_marker()
-{
-  uint num_left;
+		if (num_left != (uint)(m_comps_in_frame * 3 + 8))
+			stop_decoding(JPGD_BAD_SOF_LENGTH);
 
-  num_left = get_bits(16);
+		for (i = 0; i < m_comps_in_frame; i++)
+		{
+			m_comp_ident[i] = get_bits(8);
+			m_comp_h_samp[i] = get_bits(4);
+			m_comp_v_samp[i] = get_bits(4);
 
-  if (num_left < 2)
-    stop_decoding(JPGD_BAD_VARIABLE_MARKER);
+			if (!m_comp_h_samp[i] || !m_comp_v_samp[i] || (m_comp_h_samp[i] > 2) || (m_comp_v_samp[i] > 2))	// only sampling factors 1 and 2 are accepted
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
 
-  num_left -= 2;
+			m_comp_quant[i] = get_bits(8);
+			if (m_comp_quant[i] >= JPGD_MAX_QUANT_TABLES)
+				stop_decoding(JPGD_DECODE_ERROR);
+		}
+	}
 
-  while (num_left)
-  {
-    get_bits(8);
-    num_left--;
-  }
-}
+	// Used to skip unrecognized markers: reads the 16-bit segment length, then discards the remaining payload one byte at a time.
+	void jpeg_decoder::skip_variable_marker()
+	{
+		uint num_left;
 
-// Read a define restart interval (DRI) marker.
-void jpeg_decoder::read_dri_marker()
-{
-  if (get_bits(16) != 4)
-    stop_decoding(JPGD_BAD_DRI_LENGTH);
+		num_left = get_bits(16); // 16-bit length field of the segment
 
-  m_restart_interval = get_bits(16);
-}
+		if (num_left < 2) // a length below 2 is rejected (2 is subtracted from it just below)
+			stop_decoding(JPGD_BAD_VARIABLE_MARKER);
 
-// Read a start of scan (SOS) marker.
-void jpeg_decoder::read_sos_marker()
-{
-  uint num_left;
-  int i, ci, n, c, cc;
+		num_left -= 2; // account for the two length bytes already consumed
 
-  num_left = get_bits(16);
+		while (num_left) // discard the rest of the segment
+		{
+			get_bits(8);
+			num_left--;
+		}
+	}
 
-  n = get_bits(8);
-
-  m_comps_in_scan = n;
-
-  num_left -= 3;
-
-  if ( (num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN) )
-    stop_decoding(JPGD_BAD_SOS_LENGTH);
-
-  for (i = 0; i < n; i++)
-  {
-    cc = get_bits(8);
-    c = get_bits(8);
-    num_left -= 2;
-
-    for (ci = 0; ci < m_comps_in_frame; ci++)
-      if (cc == m_comp_ident[ci])
-        break;
-
-    if (ci >= m_comps_in_frame)
-      stop_decoding(JPGD_BAD_SOS_COMP_ID);
-
-    m_comp_list[i]    = ci;
-    m_comp_dc_tab[ci] = (c >> 4) & 15;
-    m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);
-  }
-
-  m_spectral_start  = get_bits(8);
-  m_spectral_end    = get_bits(8);
-  m_successive_high = get_bits(4);
-  m_successive_low  = get_bits(4);
-
-  if (!m_progressive_flag)
-  {
-    m_spectral_start = 0;
-    m_spectral_end = 63;
-  }
-
-  num_left -= 3;
-
-  while (num_left)                  /* read past whatever is num_left */
-  {
-    get_bits(8);
-    num_left--;
-  }
-}
-
-// Finds the next marker.
-int jpeg_decoder::next_marker()
-{
-  uint c, bytes;
-
-  bytes = 0;
-
-  do
-  {
-    do
-    {
-      bytes++;
-      c = get_bits(8);
-    } while (c != 0xFF);
-
-    do
-    {
-      c = get_bits(8);
-    } while (c == 0xFF);
-
-  } while (c == 0);
-
-  // If bytes > 0 here, there where extra bytes before the marker (not good).
-
-  return c;
-}
-
-// Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is
-// encountered.
-int jpeg_decoder::process_markers()
-{
-  int c;
-
-  for ( ; ; )
-  {
-    c = next_marker();
-
-    switch (c)
-    {
-      case M_SOF0:
-      case M_SOF1:
-      case M_SOF2:
-      case M_SOF3:
-      case M_SOF5:
-      case M_SOF6:
-      case M_SOF7:
-//      case M_JPG:
-      case M_SOF9:
-      case M_SOF10:
-      case M_SOF11:
-      case M_SOF13:
-      case M_SOF14:
-      case M_SOF15:
-      case M_SOI:
-      case M_EOI:
-      case M_SOS:
-      {
-        return c;
-      }
-      case M_DHT:
-      {
-        read_dht_marker();
-        break;
-      }
-      // No arithmitic support - dumb patents!
-      case M_DAC:
-      {
-        stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
-        break;
-      }
-      case M_DQT:
-      {
-        read_dqt_marker();
-        break;
-      }
-      case M_DRI:
-      {
-        read_dri_marker();
-        break;
-      }
-      //case M_APP0:  /* no need to read the JFIF marker */
-
-      case M_JPG:
-      case M_RST0:    /* no parameters */
-      case M_RST1:
-      case M_RST2:
-      case M_RST3:
-      case M_RST4:
-      case M_RST5:
-      case M_RST6:
-      case M_RST7:
-      case M_TEM:
-      {
-        stop_decoding(JPGD_UNEXPECTED_MARKER);
-        break;
-      }
-      default:    /* must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn or APP0 */
-      {
-        skip_variable_marker();
-        break;
-      }
-    }
-  }
-}
-
-// Finds the start of image (SOI) marker.
-// This code is rather defensive: it only checks the first 512 bytes to avoid
-// false positives.
-void jpeg_decoder::locate_soi_marker()
-{
-  uint lastchar, thischar;
-  uint bytesleft;
-
-  lastchar = get_bits(8);
-
-  thischar = get_bits(8);
-
-  /* ok if it's a normal JPEG file without a special header */
-
-  if ((lastchar == 0xFF) && (thischar == M_SOI))
-    return;
-
-  bytesleft = 4096; //512;
-
-  for ( ; ; )
-  {
-    if (--bytesleft == 0)
-      stop_decoding(JPGD_NOT_JPEG);
-
-    lastchar = thischar;
-
-    thischar = get_bits(8);
-
-    if (lastchar == 0xFF)
-    {
-      if (thischar == M_SOI)
-        break;
-      else if (thischar == M_EOI) // get_bits will keep returning M_EOI if we read past the end
-        stop_decoding(JPGD_NOT_JPEG);
-    }
-  }
-
-  // Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
-  thischar = (m_bit_buf >> 24) & 0xFF;
-
-  if (thischar != 0xFF)
-    stop_decoding(JPGD_NOT_JPEG);
-}
-
-// Find a start of frame (SOF) marker.
-void jpeg_decoder::locate_sof_marker()
-{
-  locate_soi_marker();
-
-  int c = process_markers();
-
-  switch (c)
-  {
-    case M_SOF2:
-      m_progressive_flag = JPGD_TRUE;
-    case M_SOF0:  /* baseline DCT */
-    case M_SOF1:  /* extended sequential DCT */
-    {
-      read_sof_marker();
-      break;
-    }
-    case M_SOF9:  /* Arithmitic coding */
-    {
-      stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
-      break;
-    }
-    default:
-    {
-      stop_decoding(JPGD_UNSUPPORTED_MARKER);
-      break;
-    }
-  }
-}
-
-// Find a start of scan (SOS) marker.
-int jpeg_decoder::locate_sos_marker()
-{
-  int c;
-
-  c = process_markers();
-
-  if (c == M_EOI)
-    return JPGD_FALSE;
-  else if (c != M_SOS)
-    stop_decoding(JPGD_UNEXPECTED_MARKER);
-
-  read_sos_marker();
-
-  return JPGD_TRUE;
-}
-
-// Reset everything to default/uninitialized state.
-void jpeg_decoder::init(jpeg_decoder_stream *pStream)
-{
-  m_pMem_blocks = NULL;
-  m_error_code = JPGD_SUCCESS;
-  m_ready_flag = false;
-  m_image_x_size = m_image_y_size = 0;
-  m_pStream = pStream;
-  m_progressive_flag = JPGD_FALSE;
-
-  memset(m_huff_ac, 0, sizeof(m_huff_ac));
-  memset(m_huff_num, 0, sizeof(m_huff_num));
-  memset(m_huff_val, 0, sizeof(m_huff_val));
-  memset(m_quant, 0, sizeof(m_quant));
-
-  m_scan_type = 0;
-  m_comps_in_frame = 0;
-
-  memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp));
-  memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
-  memset(m_comp_quant, 0, sizeof(m_comp_quant));
-  memset(m_comp_ident, 0, sizeof(m_comp_ident));
-  memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
-  memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
-
-  m_comps_in_scan = 0;
-  memset(m_comp_list, 0, sizeof(m_comp_list));
-  memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
-  memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
-
-  m_spectral_start = 0;
-  m_spectral_end = 0;
-  m_successive_low = 0;
-  m_successive_high = 0;
-  m_max_mcu_x_size = 0;
-  m_max_mcu_y_size = 0;
-  m_blocks_per_mcu = 0;
-  m_max_blocks_per_row = 0;
-  m_mcus_per_row = 0;
-  m_mcus_per_col = 0;
-  m_expanded_blocks_per_component = 0;
-  m_expanded_blocks_per_mcu = 0;
-  m_expanded_blocks_per_row = 0;
-  m_freq_domain_chroma_upsample = false;
-
-  memset(m_mcu_org, 0, sizeof(m_mcu_org));
-
-  m_total_lines_left = 0;
-  m_mcu_lines_left = 0;
-  m_real_dest_bytes_per_scan_line = 0;
-  m_dest_bytes_per_scan_line = 0;
-  m_dest_bytes_per_pixel = 0;
-
-  memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
-
-  memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
-  memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
-  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
-
-  m_eob_run = 0;
-
-  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
-
-  m_pIn_buf_ofs = m_in_buf;
-  m_in_buf_left = 0;
-  m_eof_flag = false;
-  m_tem_flag = 0;
-
-  memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
-  memset(m_in_buf, 0, sizeof(m_in_buf));
-  memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
-
-  m_restart_interval = 0;
-  m_restarts_left    = 0;
-  m_next_restart_num = 0;
-
-  m_max_mcus_per_row = 0;
-  m_max_blocks_per_mcu = 0;
-  m_max_mcus_per_col = 0;
-
-  memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
-  m_pMCU_coefficients = NULL;
-  m_pSample_buf = NULL;
-
-  m_total_bytes_read = 0;
-
-  m_pScan_line_0 = NULL;
-  m_pScan_line_1 = NULL;
-
-  // Ready the input buffer.
-  prep_in_buffer();
-
-  // Prime the bit buffer.
-  m_bits_left = 16;
-  m_bit_buf = 0;
-
-  get_bits(16);
-  get_bits(16);
-
-  for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++)
-    m_mcu_block_max_zag[i] = 64;
-}
+	// Read a define restart interval (DRI) marker: a 16-bit length that must be 4, followed by the 16-bit restart interval.
+	void jpeg_decoder::read_dri_marker()
+	{
+		if (get_bits(16) != 4) // segment length field; any other value is rejected
+			stop_decoding(JPGD_BAD_DRI_LENGTH);
+
+		m_restart_interval = get_bits(16); // restart interval value, kept in m_restart_interval
+	}
+
+	// Read a start of scan (SOS) marker: the scan's component list with per-component Huffman table selectors, then the spectral and successive-approximation fields.
+	void jpeg_decoder::read_sos_marker()
+	{
+		uint num_left;
+		int i, ci, n, c, cc;
+
+		num_left = get_bits(16); // 16-bit segment length
+
+		n = get_bits(8); // number of components in this scan
+
+		m_comps_in_scan = n;
+
+		num_left -= 3; // the 2 length bytes and 1 count byte read so far
+
+		// What remains must be exactly 2 bytes per component plus 3 trailing bytes, and n must be within 1..JPGD_MAX_COMPS_IN_SCAN.
+		if ((num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN))
+			stop_decoding(JPGD_BAD_SOS_LENGTH);
+
+		for (i = 0; i < n; i++)
+		{
+			cc = get_bits(8); // component identifier
+			c = get_bits(8); // packed table selectors: high nibble is used for DC, low nibble for AC (see below)
+			num_left -= 2;
+
+			for (ci = 0; ci < m_comps_in_frame; ci++) // find cc among the frame's component identifiers
+				if (cc == m_comp_ident[ci])
+					break;
+
+			if (ci >= m_comps_in_frame) // no frame component carries this identifier
+				stop_decoding(JPGD_BAD_SOS_COMP_ID);
+
+			if (ci >= JPGD_MAX_COMPONENTS) // additional bounds guard before ci is used as an array index
+				stop_decoding(JPGD_DECODE_ERROR);
+
+			m_comp_list[i] = ci;
+
+			m_comp_dc_tab[ci] = (c >> 4) & 15;
+			m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1); // AC indices are offset by half of JPGD_MAX_HUFF_TABLES
+
+			if (m_comp_dc_tab[ci] >= JPGD_MAX_HUFF_TABLES) // reject selectors outside the Huffman table range
+				stop_decoding(JPGD_DECODE_ERROR);
+
+			if (m_comp_ac_tab[ci] >= JPGD_MAX_HUFF_TABLES)
+				stop_decoding(JPGD_DECODE_ERROR);
+		}
+
+		m_spectral_start = get_bits(8);
+		m_spectral_end = get_bits(8);
+		m_successive_high = get_bits(4);
+		m_successive_low = get_bits(4);
+
+		if (!m_progressive_flag) // when the progressive flag is clear, the spectral range just read is overridden with 0..63
+		{
+			m_spectral_start = 0;
+			m_spectral_end = 63;
+		}
+
+		num_left -= 3; // the 8 + 8 + 4 + 4 bits (3 bytes) read above
+
+		/* discard whatever is still counted in num_left */
+		while (num_left)
+		{
+			get_bits(8);
+			num_left--;
+		}
+	}
+
+	// Finds the next marker: skips bytes up to an 0xFF, then any run of 0xFF bytes, and returns the byte after them; starts over if that byte is 0.
+	int jpeg_decoder::next_marker()
+	{
+		uint c, bytes;
+
+		bytes = 0; // counts bytes consumed while searching for an 0xFF; only written, never read
+
+		do
+		{
+			do // consume bytes until an 0xFF is seen
+			{
+				bytes++;
+				c = get_bits(8);
+			} while (c != 0xFF);
+
+			do // skip any further 0xFF bytes; c ends up as the first non-0xFF byte
+			{
+				c = get_bits(8);
+			} while (c == 0xFF);
+
+		} while (c == 0); // a 0 byte after 0xFF is not accepted as a marker: keep searching
+
+		// bytes is always >= 1 here; if it is > 1, there were extra bytes before the marker (not good).
+
+		return c;
+	}
+
+	// Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is
+	// encountered; DHT, DQT and DRI segments are read along the way and unknown segments are skipped.
+	int jpeg_decoder::process_markers()
+	{
+		int c;
+
+		for (; ; )
+		{
+			c = next_marker();
+
+			switch (c)
+			{
+			case M_SOF0: // these markers are not handled here: they are returned to the caller
+			case M_SOF1:
+			case M_SOF2:
+			case M_SOF3:
+			case M_SOF5:
+			case M_SOF6:
+			case M_SOF7:
+				//      case M_JPG:
+			case M_SOF9:
+			case M_SOF10:
+			case M_SOF11:
+			case M_SOF13:
+			case M_SOF14:
+			case M_SOF15:
+			case M_SOI:
+			case M_EOI:
+			case M_SOS:
+			{
+				return c;
+			}
+			case M_DHT: // Huffman table definition
+			{
+				read_dht_marker();
+				break;
+			}
+			// No arithmetic coding support - dumb patents!
+			case M_DAC:
+			{
+				stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+				break;
+			}
+			case M_DQT: // quantization table definition
+			{
+				read_dqt_marker();
+				break;
+			}
+			case M_DRI: // restart interval definition
+			{
+				read_dri_marker();
+				break;
+			}
+			//case M_APP0:  /* no need to read the JFIF marker */
+			case M_JPG: // the markers in this group are treated as errors at this point
+			case M_RST0:    /* no parameters */
+			case M_RST1:
+			case M_RST2:
+			case M_RST3:
+			case M_RST4:
+			case M_RST5:
+			case M_RST6:
+			case M_RST7:
+			case M_TEM:
+			{
+				stop_decoding(JPGD_UNEXPECTED_MARKER);
+				break;
+			}
+			default:    /* must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn or APP0 */
+			{
+				skip_variable_marker(); // skipped using the segment's own length field
+				break;
+			}
+			}
+		}
+	}
+
+	// Finds the start of image (SOI) marker: accepts it as the first two bytes, otherwise scans ahead a bounded distance for 0xFF followed by M_SOI.
+	void jpeg_decoder::locate_soi_marker()
+	{
+		uint lastchar, thischar;
+		uint bytesleft;
+
+		lastchar = get_bits(8);
+
+		thischar = get_bits(8);
+
+		/* ok if it's a normal JPEG file without a special header */
+
+		if ((lastchar == 0xFF) && (thischar == M_SOI))
+			return;
+
+		bytesleft = 4096; // scan budget: the loop below fails with JPGD_NOT_JPEG once this counts down to 0
+
+		for (; ; )
+		{
+			if (--bytesleft == 0)
+				stop_decoding(JPGD_NOT_JPEG);
+
+			lastchar = thischar; // slide the two-byte window forward by one byte
+
+			thischar = get_bits(8);
+
+			if (lastchar == 0xFF)
+			{
+				if (thischar == M_SOI)
+					break;
+				else if (thischar == M_EOI) // get_bits will keep returning M_EOI if we read past the end
+					stop_decoding(JPGD_NOT_JPEG);
+			}
+		}
+
+		// Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
+		thischar = (m_bit_buf >> 24) & 0xFF; // read straight from bits 24..31 of m_bit_buf (no get_bits call), so nothing is consumed
+
+		if (thischar != 0xFF)
+			stop_decoding(JPGD_NOT_JPEG);
+	}
+
+	// Find a start of frame (SOF) marker: locates SOI, processes markers until one is returned, and reads the frame header if it is SOF0, SOF1 or SOF2.
+	void jpeg_decoder::locate_sof_marker()
+	{
+		locate_soi_marker();
+
+		int c = process_markers();
+
+		switch (c)
+		{
+		case M_SOF2: // sets the progressive flag, then reads the frame header exactly like SOF0/SOF1
+		{
+			m_progressive_flag = JPGD_TRUE;
+			read_sof_marker();
+			break;
+		}
+		case M_SOF0:  /* baseline DCT */
+		case M_SOF1:  /* extended sequential DCT */
+		{
+			read_sof_marker();
+			break;
+		}
+		case M_SOF9:  /* Arithmetic coding */
+		{
+			stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+			break;
+		}
+		default: // any other marker returned by process_markers() is rejected here
+		{
+			stop_decoding(JPGD_UNSUPPORTED_MARKER);
+			break;
+		}
+		}
+	}
+
+	// Find a start of scan (SOS) marker. Returns JPGD_FALSE if EOI comes first; otherwise reads the SOS header and returns JPGD_TRUE.
+	int jpeg_decoder::locate_sos_marker()
+	{
+		int c;
+
+		c = process_markers();
+
+		if (c == M_EOI) // end of image reached before another scan
+			return JPGD_FALSE;
+		else if (c != M_SOS) // anything other than EOI or SOS is an error at this point
+			stop_decoding(JPGD_UNEXPECTED_MARKER);
+
+		read_sos_marker();
+
+		return JPGD_TRUE;
+	}
+
+	// Reset everything to default/uninitialized state, then prime the input buffer and bit buffer from pStream and set the SSE2 flag.
+	void jpeg_decoder::init(jpeg_decoder_stream* pStream, uint32_t flags)
+	{
+		m_flags = flags; // stored for later checks (transform_mcu tests cFlagDisableSIMD)
+		m_pMem_blocks = nullptr;
+		m_error_code = JPGD_SUCCESS;
+		m_ready_flag = false;
+		m_image_x_size = m_image_y_size = 0;
+		m_pStream = pStream;
+		m_progressive_flag = JPGD_FALSE;
+				
+		memset(m_huff_ac, 0, sizeof(m_huff_ac)); // Huffman and quantization table state
+		memset(m_huff_num, 0, sizeof(m_huff_num));
+		memset(m_huff_val, 0, sizeof(m_huff_val));
+		memset(m_quant, 0, sizeof(m_quant));
+
+		m_scan_type = 0;
+		m_comps_in_frame = 0;
+
+		memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp)); // per-component frame fields
+		memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
+		memset(m_comp_quant, 0, sizeof(m_comp_quant));
+		memset(m_comp_ident, 0, sizeof(m_comp_ident));
+		memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
+		memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
+
+		m_comps_in_scan = 0; // per-scan fields (filled in by read_sos_marker)
+		memset(m_comp_list, 0, sizeof(m_comp_list));
+		memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
+		memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
+
+		m_spectral_start = 0;
+		m_spectral_end = 0;
+		m_successive_low = 0;
+		m_successive_high = 0;
+		m_max_mcu_x_size = 0;
+		m_max_mcu_y_size = 0;
+		m_blocks_per_mcu = 0;
+		m_max_blocks_per_row = 0;
+		m_mcus_per_row = 0;
+		m_mcus_per_col = 0;
+
+		memset(m_mcu_org, 0, sizeof(m_mcu_org));
+
+		m_total_lines_left = 0;
+		m_mcu_lines_left = 0;
+		m_num_buffered_scanlines = 0;
+		m_real_dest_bytes_per_scan_line = 0;
+		m_dest_bytes_per_scan_line = 0;
+		m_dest_bytes_per_pixel = 0;
+
+		memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
+
+		memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
+		memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
+		memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+
+		m_eob_run = 0;
+
+		m_pIn_buf_ofs = m_in_buf; // input read position starts at the beginning of m_in_buf, with nothing buffered yet
+		m_in_buf_left = 0;
+		m_eof_flag = false;
+		m_tem_flag = 0;
+
+		memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
+		memset(m_in_buf, 0, sizeof(m_in_buf));
+		memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
+
+		m_restart_interval = 0;
+		m_restarts_left = 0;
+		m_next_restart_num = 0;
+
+		m_max_mcus_per_row = 0;
+		m_max_blocks_per_mcu = 0;
+		m_max_mcus_per_col = 0;
+
+		memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
+		m_pMCU_coefficients = nullptr;
+		m_pSample_buf = nullptr;
+		m_pSample_buf_prev = nullptr;
+		m_sample_buf_prev_valid = false;
+
+		m_total_bytes_read = 0;
+
+		m_pScan_line_0 = nullptr;
+		m_pScan_line_1 = nullptr;
+
+		// Ready the input buffer.
+		prep_in_buffer();
+
+		// Prime the bit buffer (two 16-bit reads starting from an empty m_bit_buf).
+		m_bits_left = 16;
+		m_bit_buf = 0;
+
+		get_bits(16);
+		get_bits(16);
+
+		for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++) // every block starts with the maximum zag count of 64
+			m_mcu_block_max_zag[i] = 64;
+
+		m_has_sse2 = false; // stays false unless JPGD_USE_SSE2 is enabled below
+
+#if JPGD_USE_SSE2
+#ifdef _MSC_VER
+		int cpu_info[4];
+		__cpuid(cpu_info, 1); // MSVC: query CPUID function 1
+		const int cpu_info3 = cpu_info[3];
+		m_has_sse2 = ((cpu_info3 >> 26U) & 1U) != 0U; // bit 26 of cpu_info[3] is tested as the SSE2 feature flag
+#else
+		m_has_sse2 = true; // non-MSVC builds with JPGD_USE_SSE2 assume SSE2 is present; no runtime check is made
+#endif
+#endif
+	}
 
 #define SCALEBITS 16
 #define ONE_HALF  ((int) 1 << (SCALEBITS-1))
 #define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5f))
 
-// Create a few tables that allow us to quickly convert YCbCr to RGB.
-void jpeg_decoder::create_look_ups()
-{
-  for (int i = 0; i <= 255; i++)
-  {
-    int k = i - 128;
-    m_crr[i] = ( FIX(1.40200f)  * k + ONE_HALF) >> SCALEBITS;
-    m_cbb[i] = ( FIX(1.77200f)  * k + ONE_HALF) >> SCALEBITS;
-    m_crg[i] = (-FIX(0.71414f)) * k;
-    m_cbg[i] = (-FIX(0.34414f)) * k + ONE_HALF;
-  }
-}
-
-// This method throws back into the stream any bytes that where read
-// into the bit buffer during initial marker scanning.
-void jpeg_decoder::fix_in_buffer()
-{
-  // In case any 0xFF's where pulled into the buffer during marker scanning.
-  JPGD_ASSERT((m_bits_left & 7) == 0);
-
-  if (m_bits_left == 16)
-    stuff_char( (uint8)(m_bit_buf & 0xFF));
-
-  if (m_bits_left >= 8)
-    stuff_char( (uint8)((m_bit_buf >> 8) & 0xFF));
-
-  stuff_char((uint8)((m_bit_buf >> 16) & 0xFF));
-  stuff_char((uint8)((m_bit_buf >> 24) & 0xFF));
-
-  m_bits_left = 16;
-  get_bits_no_markers(16);
-  get_bits_no_markers(16);
-}
-
-void jpeg_decoder::transform_mcu(int mcu_row)
-{
-  jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
-  if (m_freq_domain_chroma_upsample) {
-     JPGD_ASSERT(mcu_row * m_blocks_per_mcu < m_expanded_blocks_per_row);
-  }
-  else {
-     JPGD_ASSERT(mcu_row * m_blocks_per_mcu < m_max_blocks_per_row);
-  }
-  uint8* pDst_ptr = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64;
-
-  for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
-  {
-    idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
-    pSrc_ptr += 64;
-    pDst_ptr += 64;
-  }
-}
-
-static const uint8 s_max_rc[64] =
-{
-  17, 18, 34, 50, 50, 51, 52, 52, 52, 68, 84, 84, 84, 84, 85, 86, 86, 86, 86, 86,
-  102, 118, 118, 118, 118, 118, 118, 119, 120, 120, 120, 120, 120, 120, 120, 136,
-  136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
-  136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136
-};
-
-void jpeg_decoder::transform_mcu_expand(int mcu_row)
-{
-  jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
-  uint8* pDst_ptr = m_pSample_buf + mcu_row * m_expanded_blocks_per_mcu * 64;
-
-  // Y IDCT
-	int mcu_block;
-  for (mcu_block = 0; mcu_block < m_expanded_blocks_per_component; mcu_block++)
-  {
-    idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
-    pSrc_ptr += 64;
-    pDst_ptr += 64;
-  }
-
-  // Chroma IDCT, with upsampling
-	jpgd_block_t temp_block[64];
-
-  for (int i = 0; i < 2; i++)
-  {
-    DCT_Upsample::Matrix44 P, Q, R, S;
-
-    JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] >= 1);
-    JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] <= 64);
-
-    int max_zag = m_mcu_block_max_zag[mcu_block++] - 1; 
-    if (max_zag <= 0) max_zag = 0; // should never happen, only here to shut up static analysis
-    switch (s_max_rc[max_zag])
-    {
-    case 1*16+1:
-      DCT_Upsample::P_Q<1, 1>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<1, 1>::calc(R, S, pSrc_ptr);
-      break;
-    case 1*16+2:
-      DCT_Upsample::P_Q<1, 2>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<1, 2>::calc(R, S, pSrc_ptr);
-      break;
-    case 2*16+2:
-      DCT_Upsample::P_Q<2, 2>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<2, 2>::calc(R, S, pSrc_ptr);
-      break;
-    case 3*16+2:
-      DCT_Upsample::P_Q<3, 2>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<3, 2>::calc(R, S, pSrc_ptr);
-      break;
-    case 3*16+3:
-      DCT_Upsample::P_Q<3, 3>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<3, 3>::calc(R, S, pSrc_ptr);
-      break;
-    case 3*16+4:
-      DCT_Upsample::P_Q<3, 4>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<3, 4>::calc(R, S, pSrc_ptr);
-      break;
-    case 4*16+4:
-      DCT_Upsample::P_Q<4, 4>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<4, 4>::calc(R, S, pSrc_ptr);
-      break;
-    case 5*16+4:
-      DCT_Upsample::P_Q<5, 4>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<5, 4>::calc(R, S, pSrc_ptr);
-      break;
-    case 5*16+5:
-      DCT_Upsample::P_Q<5, 5>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<5, 5>::calc(R, S, pSrc_ptr);
-      break;
-    case 5*16+6:
-      DCT_Upsample::P_Q<5, 6>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<5, 6>::calc(R, S, pSrc_ptr);
-      break;
-    case 6*16+6:
-      DCT_Upsample::P_Q<6, 6>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<6, 6>::calc(R, S, pSrc_ptr);
-      break;
-    case 7*16+6:
-      DCT_Upsample::P_Q<7, 6>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<7, 6>::calc(R, S, pSrc_ptr);
-      break;
-    case 7*16+7:
-      DCT_Upsample::P_Q<7, 7>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<7, 7>::calc(R, S, pSrc_ptr);
-      break;
-    case 7*16+8:
-      DCT_Upsample::P_Q<7, 8>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<7, 8>::calc(R, S, pSrc_ptr);
-      break;
-    case 8*16+8:
-      DCT_Upsample::P_Q<8, 8>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<8, 8>::calc(R, S, pSrc_ptr);
-      break;
-    default:
-      JPGD_ASSERT(false);
-    }
-
-    DCT_Upsample::Matrix44 a(P + Q); P -= Q;
-    DCT_Upsample::Matrix44& b = P;
-    DCT_Upsample::Matrix44 c(R + S); R -= S;
-    DCT_Upsample::Matrix44& d = R;
-
-    DCT_Upsample::Matrix44::add_and_store(temp_block, a, c);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    DCT_Upsample::Matrix44::sub_and_store(temp_block, a, c);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    DCT_Upsample::Matrix44::add_and_store(temp_block, b, d);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    DCT_Upsample::Matrix44::sub_and_store(temp_block, b, d);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    pSrc_ptr += 64;
-  }
-}
-
-// Loads and dequantizes the next row of (already decoded) coefficients.
-// Progressive images only.
-void jpeg_decoder::load_next_row()
-{
-  int i;
-  jpgd_block_t *p;
-  jpgd_quant_t *q;
-  int mcu_row, mcu_block, row_block = 0;
-  int component_num, component_id;
-  int block_x_mcu[JPGD_MAX_COMPONENTS];
-
-  memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int));
-
-  for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
-  {
-    int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
-
-    for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
-    {
-      component_id = m_mcu_org[mcu_block];
-      JPGD_ASSERT(m_comp_quant[component_id] < JPGD_MAX_QUANT_TABLES);
-      q = m_quant[m_comp_quant[component_id]];
-
-      p = m_pMCU_coefficients + 64 * mcu_block;
-
-      jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
-      jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
-      p[0] = pDC[0];
-      memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_t));
-
-      for (i = 63; i > 0; i--)
-        if (p[g_ZAG[i]])
-          break;
-
-      m_mcu_block_max_zag[mcu_block] = i + 1;
-
-      for ( ; i >= 0; i--)
-				if (p[g_ZAG[i]])
-					p[g_ZAG[i]] = static_cast<jpgd_block_t>(p[g_ZAG[i]] * q[i]);
-
-      row_block++;
-
-      if (m_comps_in_scan == 1)
-        block_x_mcu[component_id]++;
-      else
-      {
-        if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
-        {
-          block_x_mcu_ofs = 0;
-
-          if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
-          {
-            block_y_mcu_ofs = 0;
-
-            block_x_mcu[component_id] += m_comp_h_samp[component_id];
-          }
-        }
-      }
-    }
-
-    if (m_freq_domain_chroma_upsample)
-      transform_mcu_expand(mcu_row);
-    else
-      transform_mcu(mcu_row);
-  }
-
-  if (m_comps_in_scan == 1)
-    m_block_y_mcu[m_comp_list[0]]++;
-  else
-  {
-    for (component_num = 0; component_num < m_comps_in_scan; component_num++)
-    {
-      component_id = m_comp_list[component_num];
-
-      m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
-    }
-  }
-}
-
-// Restart interval processing.
-void jpeg_decoder::process_restart()
-{
-  int i;
-  int c = 0;
-
-  // Align to a byte boundry
-  // FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
-  //get_bits_no_markers(m_bits_left & 7);
-
-  // Let's scan a little bit to find the marker, but not _too_ far.
-  // 1536 is a "fudge factor" that determines how much to scan.
-  for (i = 1536; i > 0; i--)
-    if (get_char() == 0xFF)
-      break;
-
-  if (i == 0)
-    stop_decoding(JPGD_BAD_RESTART_MARKER);
-
-  for ( ; i > 0; i--)
-    if ((c = get_char()) != 0xFF)
-      break;
-
-  if (i == 0)
-    stop_decoding(JPGD_BAD_RESTART_MARKER);
-
-  // Is it the expected marker? If not, something bad happened.
-  if (c != (m_next_restart_num + M_RST0))
-    stop_decoding(JPGD_BAD_RESTART_MARKER);
-
-  // Reset each component's DC prediction values.
-  memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
-
-  m_eob_run = 0;
-
-  m_restarts_left = m_restart_interval;
-
-  m_next_restart_num = (m_next_restart_num + 1) & 7;
-
-  // Get the bit buffer going again...
-
-  m_bits_left = 16;
-  get_bits_no_markers(16);
-  get_bits_no_markers(16);
-}
-
-static inline int dequantize_ac(int c, int q) {	c *= q;	return c; }
-
-// Decodes and dequantizes the next row of coefficients.
-void jpeg_decoder::decode_next_row()
-{
-  int row_block = 0;
-
-  for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
-  {
-    if ((m_restart_interval) && (m_restarts_left == 0))
-      process_restart();
-
-    jpgd_block_t* p = m_pMCU_coefficients;
-    for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
-    {
-      int component_id = m_mcu_org[mcu_block];
-      JPGD_ASSERT(m_comp_quant[component_id] < JPGD_MAX_QUANT_TABLES);
-      jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
-
-      int r, s;
-      s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r);
-      s = JPGD_HUFF_EXTEND(r, s);
-
-      m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]);
-
-      p[0] = static_cast<jpgd_block_t>(s * q[0]);
-
-      int prev_num_set = m_mcu_block_max_zag[mcu_block];
-
-      huff_tables *pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
-
-      int k;
-      for (k = 1; k < 64; k++)
-      {
-        int extra_bits;
-        s = huff_decode(pH, extra_bits);
-
-        r = s >> 4;
-        s &= 15;
-
-        if (s)
-        {
-          if (r)
-          {
-            if ((k + r) > 63)
-              stop_decoding(JPGD_DECODE_ERROR);
-
-            if (k < prev_num_set)
-            {
-              int n = JPGD_MIN(r, prev_num_set - k);
-              int kt = k;
-              while (n--)
-                p[g_ZAG[kt++]] = 0;
-            }
-
-            k += r;
-          }
-          
-          s = JPGD_HUFF_EXTEND(extra_bits, s);
-
-          JPGD_ASSERT(k < 64);
-
-          p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k];
-        }
-        else
-        {
-          if (r == 15)
-          {
-            if ((k + 16) > 64)
-              stop_decoding(JPGD_DECODE_ERROR);
-
-            if (k < prev_num_set)
-            {
-              int n = JPGD_MIN(16, prev_num_set - k);
-              int kt = k;
-              while (n--)
-              {
-                JPGD_ASSERT(kt <= 63);
-                p[g_ZAG[kt++]] = 0;
-              }
-            }
-
-            k += 16 - 1; // - 1 because the loop counter is k
-            JPGD_ASSERT(p[g_ZAG[k]] == 0);
-          }
-          else
-            break;
-        }
-      }
-
-      if (k < prev_num_set)
-      {
-        int kt = k;
-        while (kt < prev_num_set)
-          p[g_ZAG[kt++]] = 0;
-      }
-
-      m_mcu_block_max_zag[mcu_block] = k;
-
-      row_block++;
-    }
-
-    if (m_freq_domain_chroma_upsample)
-      transform_mcu_expand(mcu_row);
-    else
-      transform_mcu(mcu_row);
-
-    m_restarts_left--;
-  }
-}
-
-// YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB
-void jpeg_decoder::H1V1Convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d = m_pScan_line_0;
-  uint8 *s = m_pSample_buf + row * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int j = 0; j < 8; j++)
-    {
-      int y = s[j];
-      int cb = s[64+j];
-      int cr = s[128+j];
-
-      d[0] = clamp(y + m_crr[cr]);
-      d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
-      d[2] = clamp(y + m_cbb[cb]);
-      d[3] = 255;
-
-      d += 4;
-    }
-
-    s += 64*3;
-  }
-}
-
-// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB
-void jpeg_decoder::H2V1Convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d0 = m_pScan_line_0;
-  uint8 *y = m_pSample_buf + row * 8;
-  uint8 *c = m_pSample_buf + 2*64 + row * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int l = 0; l < 2; l++)
-    {
-      for (int j = 0; j < 4; j++)
-      {
-        int cb = c[0];
-        int cr = c[64];
-
-        int rc = m_crr[cr];
-        int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
-        int bc = m_cbb[cb];
-
-        int yy = y[j<<1];
-        d0[0] = clamp(yy+rc);
-        d0[1] = clamp(yy+gc);
-        d0[2] = clamp(yy+bc);
-        d0[3] = 255;
-
-        yy = y[(j<<1)+1];
-        d0[4] = clamp(yy+rc);
-        d0[5] = clamp(yy+gc);
-        d0[6] = clamp(yy+bc);
-        d0[7] = 255;
-
-        d0 += 8;
-
-        c++;
-      }
-      y += 64;
-    }
-
-    y += 64*4 - 64*2;
-    c += 64*4 - 8;
-  }
-}
-
-// YCbCr H2V1 (1x2:1:1, 4 m_blocks per MCU) to RGB
-void jpeg_decoder::H1V2Convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d0 = m_pScan_line_0;
-  uint8 *d1 = m_pScan_line_1;
-  uint8 *y;
-  uint8 *c;
-
-  if (row < 8)
-    y = m_pSample_buf + row * 8;
-  else
-    y = m_pSample_buf + 64*1 + (row & 7) * 8;
-
-  c = m_pSample_buf + 64*2 + (row >> 1) * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int j = 0; j < 8; j++)
-    {
-      int cb = c[0+j];
-      int cr = c[64+j];
-
-      int rc = m_crr[cr];
-      int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
-      int bc = m_cbb[cb];
-
-      int yy = y[j];
-      d0[0] = clamp(yy+rc);
-      d0[1] = clamp(yy+gc);
-      d0[2] = clamp(yy+bc);
-      d0[3] = 255;
-
-      yy = y[8+j];
-      d1[0] = clamp(yy+rc);
-      d1[1] = clamp(yy+gc);
-      d1[2] = clamp(yy+bc);
-      d1[3] = 255;
-
-      d0 += 4;
-      d1 += 4;
-    }
-
-    y += 64*4;
-    c += 64*4;
-  }
-}
-
-// YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB
-void jpeg_decoder::H2V2Convert()
-{
-	int row = m_max_mcu_y_size - m_mcu_lines_left;
-	uint8 *d0 = m_pScan_line_0;
-	uint8 *d1 = m_pScan_line_1;
-	uint8 *y;
-	uint8 *c;
-
-	if (row < 8)
-		y = m_pSample_buf + row * 8;
-	else
-		y = m_pSample_buf + 64*2 + (row & 7) * 8;
-
-	c = m_pSample_buf + 64*4 + (row >> 1) * 8;
-
-	for (int i = m_max_mcus_per_row; i > 0; i--)
+	// Create a few tables that allow us to quickly convert YCbCr to RGB: one entry per 8-bit chroma value, in SCALEBITS fixed point.
+	void jpeg_decoder::create_look_ups()
 	{
-		for (int l = 0; l < 2; l++)
+		for (int i = 0; i <= 255; i++)
 		{
-			for (int j = 0; j < 8; j += 2)
+			int k = i - 128; // re-center the 0..255 index to -128..127
+			m_crr[i] = (FIX(1.40200f) * k + ONE_HALF) >> SCALEBITS; // rounded and shifted back down to an integer
+			m_cbb[i] = (FIX(1.77200f) * k + ONE_HALF) >> SCALEBITS;
+			m_crg[i] = (-FIX(0.71414f)) * k; // left unshifted in fixed point, with no rounding term
+			m_cbg[i] = (-FIX(0.34414f)) * k + ONE_HALF; // also unshifted; carries the ONE_HALF rounding term (presumably the two are summed, then shifted by the user of these tables - confirm)
+		}
+	}
+
+	// This method throws back into the stream any bytes that were read
+	// into the bit buffer during initial marker scanning.
+	void jpeg_decoder::fix_in_buffer()
+	{
+		// In case any 0xFF's were pulled into the buffer during marker scanning.
+		assert((m_bits_left & 7) == 0); // m_bits_left is expected to be a whole number of bytes
+
+		if (m_bits_left == 16) // bits 0..7 of m_bit_buf are pushed back only when 16 bits are left
+			stuff_char((uint8)(m_bit_buf & 0xFF));
+
+		if (m_bits_left >= 8) // bits 8..15 are pushed back when at least 8 bits are left
+			stuff_char((uint8)((m_bit_buf >> 8) & 0xFF));
+
+		stuff_char((uint8)((m_bit_buf >> 16) & 0xFF)); // bits 16..23 and 24..31 are always pushed back
+		stuff_char((uint8)((m_bit_buf >> 24) & 0xFF));
+
+		m_bits_left = 16; // refill the bit buffer with two 16-bit reads through get_bits_no_markers
+		get_bits_no_markers(16);
+		get_bits_no_markers(16);
+	}
+
+	void jpeg_decoder::transform_mcu(int mcu_row) // Runs idct() on every block held in m_pMCU_coefficients; mcu_row (the MCU index the callers iterate up to m_mcus_per_row) selects the destination slot in m_pSample_buf.
+	{
+		jpgd_block_coeff_t* pSrc_ptr = m_pMCU_coefficients;
+		if (mcu_row * m_blocks_per_mcu >= m_max_blocks_per_row) // the first destination block would lie at or beyond m_max_blocks_per_row
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		uint8* pDst_ptr = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64; // destination advances 64 bytes per block
+
+		for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+		{
+			idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block], ((m_flags & cFlagDisableSIMD) == 0) && m_has_sse2); // last argument is true only when cFlagDisableSIMD is clear and m_has_sse2 is set
+			pSrc_ptr += 64;
+			pDst_ptr += 64;
+		}
+	}
+
+	// Loads and dequantizes the next row of (already decoded) coefficients, then runs transform_mcu() on each MCU.
+	// Progressive images only.
+	void jpeg_decoder::load_next_row()
+	{
+		int i;
+		jpgd_block_coeff_t* p;
+		jpgd_quant_t* q;
+		int mcu_row, mcu_block, row_block = 0;
+		int component_num, component_id;
+		int block_x_mcu[JPGD_MAX_COMPONENTS]; // per-component horizontal block position within this row
+
+		memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int));
+
+		for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+		{
+			int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0; // block offsets inside the current MCU
+
+			for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
 			{
-				int cb = c[0];
-				int cr = c[64];
+				component_id = m_mcu_org[mcu_block];
+				if (m_comp_quant[component_id] >= JPGD_MAX_QUANT_TABLES) // guard the m_quant[] index below
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				q = m_quant[m_comp_quant[component_id]];
+
+				p = m_pMCU_coefficients + 64 * mcu_block; // 64 coefficients per block in the MCU scratch buffer
+
+				jpgd_block_coeff_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+				jpgd_block_coeff_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+				p[0] = pDC[0]; // DC comes from the DC buffer, the other 63 entries from the AC buffer
+				memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_coeff_t));
+
+				for (i = 63; i > 0; i--) // scan backwards through g_ZAG order for the last non-zero coefficient
+					if (p[g_ZAG[i]])
+						break;
+
+				m_mcu_block_max_zag[mcu_block] = i + 1; // passed to idct() by transform_mcu()
+
+				for (; i >= 0; i--) // dequantize the non-zero coefficients from that position down to index 0
+					if (p[g_ZAG[i]])
+						p[g_ZAG[i]] = static_cast<jpgd_block_coeff_t>(p[g_ZAG[i]] * q[i]);
+
+				row_block++; // incremented but never read in this function
+
+				if (m_comps_in_scan == 1)
+					block_x_mcu[component_id]++;
+				else
+				{
+					if (++block_x_mcu_ofs == m_comp_h_samp[component_id]) // finished one row of this component's blocks inside the MCU
+					{
+						block_x_mcu_ofs = 0;
+
+						if (++block_y_mcu_ofs == m_comp_v_samp[component_id]) // finished all of this component's blocks inside the MCU
+						{
+							block_y_mcu_ofs = 0;
+
+							block_x_mcu[component_id] += m_comp_h_samp[component_id];
+						}
+					}
+				}
+			}
+
+			transform_mcu(mcu_row);
+		}
+
+		if (m_comps_in_scan == 1) // advance the per-component vertical block position for the next call
+			m_block_y_mcu[m_comp_list[0]]++;
+		else
+		{
+			for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+			{
+				component_id = m_comp_list[component_num];
+
+				m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+			}
+		}
+	}
+
+	// Restart interval processing: finds the expected RSTn marker, resets the DC predictors and EOB run, and re-primes the bit buffer.
+	void jpeg_decoder::process_restart()
+	{
+		int i;
+		int c = 0;
+
+		// Align to a byte boundary
+		// FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
+		//get_bits_no_markers(m_bits_left & 7);
+
+		// Let's scan a little bit to find the marker, but not _too_ far.
+		// 1536 is a "fudge factor" that determines how much to scan.
+		for (i = 1536; i > 0; i--) // look for the first 0xFF byte
+			if (get_char() == 0xFF)
+				break;
+
+		if (i == 0)
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		for (; i > 0; i--) // skip further 0xFF bytes within the remaining budget; c ends up as the first non-0xFF byte
+			if ((c = get_char()) != 0xFF)
+				break;
+
+		if (i == 0)
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Is it the expected marker? If not, something bad happened.
+		if (c != (m_next_restart_num + M_RST0))
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Reset each component's DC prediction values.
+		memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+
+		m_eob_run = 0;
+
+		m_restarts_left = m_restart_interval;
+
+		m_next_restart_num = (m_next_restart_num + 1) & 7; // expected restart number cycles through 0..7
+
+		// Get the bit buffer going again...
+
+		m_bits_left = 16;
+		get_bits_no_markers(16);
+		get_bits_no_markers(16);
+	}
+
+	static inline int dequantize_ac(int c, int q) { c *= q; return c; } // Returns c * q; used by decode_next_row() to dequantize one AC coefficient.
+
+	// Decodes and dequantizes the next row of coefficients, running transform_mcu() after each MCU.
+	void jpeg_decoder::decode_next_row()
+	{
+		int row_block = 0;
+
+		for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+		{
+			if ((m_restart_interval) && (m_restarts_left == 0)) // restart interval used up: resynchronize on the next restart marker
+				process_restart();
+
+			jpgd_block_coeff_t* p = m_pMCU_coefficients; // the same scratch buffer is reused for every MCU
+			for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
+			{
+				int component_id = m_mcu_org[mcu_block];
+				if (m_comp_quant[component_id] >= JPGD_MAX_QUANT_TABLES) // guard the m_quant[] index below
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
+
+				int r, s;
+				s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r); // r presumably receives the extra bits; it is fed to JPGD_HUFF_EXTEND below
+				if (s >= 16) // reject a DC symbol of 16 or more
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				s = JPGD_HUFF_EXTEND(r, s);
+
+				m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]); // DC is a difference from the component's previous DC value
+
+				p[0] = static_cast<jpgd_block_coeff_t>(s * q[0]);
+
+				int prev_num_set = m_mcu_block_max_zag[mcu_block]; // value stored for this block slot last time; positions below it may hold stale coefficients
+
+				huff_tables* pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
+
+				int k;
+				for (k = 1; k < 64; k++)
+				{
+					int extra_bits;
+					s = huff_decode(pH, extra_bits);
+
+					r = s >> 4; // high nibble: number of positions to skip
+					s &= 15; // low nibble: passed with extra_bits to JPGD_HUFF_EXTEND
+
+					if (s)
+					{
+						if (r)
+						{
+							if ((k + r) > 63)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							if (k < prev_num_set) // clear stale coefficients in the skipped positions
+							{
+								int n = JPGD_MIN(r, prev_num_set - k);
+								int kt = k;
+								while (n--)
+									p[g_ZAG[kt++]] = 0;
+							}
+
+							k += r;
+						}
+
+						s = JPGD_HUFF_EXTEND(extra_bits, s);
+
+						if (k >= 64)
+							stop_decoding(JPGD_DECODE_ERROR);
+
+						p[g_ZAG[k]] = static_cast<jpgd_block_coeff_t>(dequantize_ac(s, q[k])); //s * q[k];
+					}
+					else
+					{
+						if (r == 15) // s == 0 with r == 15: skip 16 positions
+						{
+							if ((k + 16) > 64)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							if (k < prev_num_set) // clear stale coefficients in the skipped positions
+							{
+								int n = JPGD_MIN(16, prev_num_set - k);
+								int kt = k;
+								while (n--)
+								{
+									if (kt > 63)
+										stop_decoding(JPGD_DECODE_ERROR);
+									p[g_ZAG[kt++]] = 0;
+								}
+							}
+
+							k += 16 - 1; // - 1 because the loop counter is k
+
+							if (p[g_ZAG[k & 63]] != 0)
+								stop_decoding(JPGD_DECODE_ERROR);
+						}
+						else // s == 0 with any other r: stop decoding this block
+							break;
+					}
+				}
+
+				if (k < prev_num_set) // clear whatever stale coefficients remain up to prev_num_set
+				{
+					int kt = k;
+					while (kt < prev_num_set)
+						p[g_ZAG[kt++]] = 0;
+				}
+
+				m_mcu_block_max_zag[mcu_block] = k; // passed to idct() by transform_mcu() and read back as prev_num_set next time
+
+				row_block++; // incremented but never read in this function
+			}
+
+			transform_mcu(mcu_row);
+
+			m_restarts_left--;
+		}
+	}
+
+	// YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB; writes one scan line of 4-byte pixels into m_pScan_line_0.
+	void jpeg_decoder::H1V1Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d = m_pScan_line_0;
+		uint8* s = m_pSample_buf + row * 8; // 8 samples per block line
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int j = 0; j < 8; j++)
+			{
+				int y = s[j];
+				int cb = s[64 + j]; // Cb block follows the Y block
+				int cr = s[128 + j]; // Cr block follows the Cb block
+
+				d[0] = clamp(y + m_crr[cr]);
+				d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
+				d[2] = clamp(y + m_cbb[cb]);
+				d[3] = 255; // fourth byte is always 255 (presumably opaque alpha)
+
+				d += 4;
+			}
+
+			s += 64 * 3; // next MCU: skip this MCU's 3 blocks
+		}
+	}
+
+	// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB; each chroma sample is reused for two horizontally adjacent pixels.
+	void jpeg_decoder::H2V1Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+		uint8* y = m_pSample_buf + row * 8;
+		uint8* c = m_pSample_buf + 2 * 64 + row * 8; // chroma starts after the two luma blocks
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int l = 0; l < 2; l++) // one pass per luma block
+			{
+				for (int j = 0; j < 4; j++)
+				{
+					int cb = c[0];
+					int cr = c[64];
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					int yy = y[j << 1];
+					d0[0] = clamp(yy + rc);
+					d0[1] = clamp(yy + gc);
+					d0[2] = clamp(yy + bc);
+					d0[3] = 255;
+
+					yy = y[(j << 1) + 1]; // second pixel of the pair shares the same chroma terms
+					d0[4] = clamp(yy + rc);
+					d0[5] = clamp(yy + gc);
+					d0[6] = clamp(yy + bc);
+					d0[7] = 255;
+
+					d0 += 8;
+
+					c++;
+				}
+				y += 64;
+			}
+
+			y += 64 * 4 - 64 * 2; // y already moved two blocks in the l loop; this completes the step to the next MCU
+			c += 64 * 4 - 8; // c already moved 8 samples in the loops above
+		}
+	}
+
+	// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB, blending the two horizontally nearest chroma samples with 3:1 / 1:3 weights.
+	void jpeg_decoder::H2V1ConvertFiltered()
+	{
+		const uint BLOCKS_PER_MCU = 4;
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+
+		const int half_image_x_size = (m_image_x_size >> 1) - 1; // upper clamp for the right-hand chroma index
+		const int row_x8 = row * 8;
+
+		for (int x = 0; x < m_image_x_size; x++)
+		{
+			int y = m_pSample_buf[check_sample_buf_ofs((x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7) + row_x8)]; // MCU x >> 4; second luma block when x & 8 is set
+
+			int c_x0 = (x - 1) >> 1; // left chroma index; computed from -1 when x == 0 and clamped to 0 below
+			int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size);
+			c_x0 = JPGD_MAX(c_x0, 0);
+
+			int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7) + row_x8 + 128; // + 128 skips the two luma blocks
+			int cb0 = m_pSample_buf[check_sample_buf_ofs(a)];
+			int cr0 = m_pSample_buf[check_sample_buf_ofs(a + 64)];
+
+			int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7) + row_x8 + 128;
+			int cb1 = m_pSample_buf[check_sample_buf_ofs(b)];
+			int cr1 = m_pSample_buf[check_sample_buf_ofs(b + 64)];
+
+			int w0 = (x & 1) ? 3 : 1; // weights always sum to 4
+			int w1 = (x & 1) ? 1 : 3;
+
+			int cb = (cb0 * w0 + cb1 * w1 + 2) >> 2; // weighted average with rounding
+			int cr = (cr0 * w0 + cr1 * w1 + 2) >> 2;
+
+			int rc = m_crr[cr];
+			int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+			int bc = m_cbb[cb];
+
+			d0[0] = clamp(y + rc);
+			d0[1] = clamp(y + gc);
+			d0[2] = clamp(y + bc);
+			d0[3] = 255;
+
+			d0 += 4;
+		}
+	}
+
+	// YCbCr H1V2 (1x2:1:1, 4 m_blocks per MCU) to RGB; fills m_pScan_line_0 and m_pScan_line_1 from two luma lines sharing one chroma line.
+	void jpeg_decoder::H1V2Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+		uint8* d1 = m_pScan_line_1;
+		uint8* y;
+		uint8* c;
+
+		if (row < 8)
+			y = m_pSample_buf + row * 8; // rows 0..7 come from the first luma block
+		else
+			y = m_pSample_buf + 64 * 1 + (row & 7) * 8; // later rows come from the second luma block
+
+		c = m_pSample_buf + 64 * 2 + (row >> 1) * 8; // one chroma line per two luma lines; chroma follows the two luma blocks
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int j = 0; j < 8; j++)
+			{
+				int cb = c[0 + j];
+				int cr = c[64 + j];
 
 				int rc = m_crr[cr];
 				int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
 				int bc = m_cbb[cb];
 
 				int yy = y[j];
-				d0[0] = clamp(yy+rc);
-				d0[1] = clamp(yy+gc);
-				d0[2] = clamp(yy+bc);
+				d0[0] = clamp(yy + rc);
+				d0[1] = clamp(yy + gc);
+				d0[2] = clamp(yy + bc);
 				d0[3] = 255;
 
-				yy = y[j+1];
-				d0[4] = clamp(yy+rc);
-				d0[5] = clamp(yy+gc);
-				d0[6] = clamp(yy+bc);
-				d0[7] = 255;
-
-				yy = y[j+8];
-				d1[0] = clamp(yy+rc);
-				d1[1] = clamp(yy+gc);
-				d1[2] = clamp(yy+bc);
+				yy = y[8 + j]; // luma sample one block line further down goes to the second scan line
+				d1[0] = clamp(yy + rc);
+				d1[1] = clamp(yy + gc);
+				d1[2] = clamp(yy + bc);
 				d1[3] = 255;
 
-				yy = y[j+8+1];
-				d1[4] = clamp(yy+rc);
-				d1[5] = clamp(yy+gc);
-				d1[6] = clamp(yy+bc);
-				d1[7] = 255;
-
-				d0 += 8;
-				d1 += 8;
-
-				c++;
+				d0 += 4;
+				d1 += 4;
 			}
-			y += 64;
+
+			y += 64 * 4; // next MCU: 4 blocks further on
+			c += 64 * 4;
+		}
+	}
+
+	// YCbCr H1V2 (1x2:1:1, 4 m_blocks per MCU) to RGB, blending the two vertically nearest chroma lines with 3:1 / 1:3 weights.
+	void jpeg_decoder::H1V2ConvertFiltered()
+	{
+		const uint BLOCKS_PER_MCU = 4;
+		int y = m_image_y_size - m_total_lines_left; // image line being produced
+		int row = y & 15; // line within the 16-line MCU row
+
+		const int half_image_y_size = (m_image_y_size >> 1) - 1; // upper clamp for the lower chroma line index
+
+		uint8* d0 = m_pScan_line_0;
+
+		const int w0 = (row & 1) ? 3 : 1; // weights always sum to 4
+		const int w1 = (row & 1) ? 1 : 3;
+
+		int c_y0 = (y - 1) >> 1; // upper chroma line; computed from -1 when y == 0
+		int c_y1 = JPGD_MIN(c_y0 + 1, half_image_y_size);
+
+		const uint8_t* p_YSamples = m_pSample_buf;
+		const uint8_t* p_C0Samples = m_pSample_buf;
+		if ((c_y0 >= 0) && (((row & 15) == 0) || ((row & 15) == 15)) && (m_total_lines_left > 1)) // on rows 0 and 15 the upper chroma line is read from m_pSample_buf_prev
+		{
+			assert(y > 0);
+			assert(m_sample_buf_prev_valid);
+
+			if ((row & 15) == 15) // on row 15 the luma is read from m_pSample_buf_prev as well
+				p_YSamples = m_pSample_buf_prev;
+
+			p_C0Samples = m_pSample_buf_prev;
 		}
 
-		y += 64*6 - 64*2;
-		c += 64*6 - 8;
+		const int y_sample_base_ofs = ((row & 8) ? 64 : 0) + (row & 7) * 8; // second luma block when row & 8 is set
+		const int y0_base = (c_y0 & 7) * 8 + 128; // + 128 skips the two luma blocks
+		const int y1_base = (c_y1 & 7) * 8 + 128;
+
+		for (int x = 0; x < m_image_x_size; x++)
+		{
+			const int base_ofs = (x >> 3) * BLOCKS_PER_MCU * 64 + (x & 7); // MCU x >> 3, column x & 7
+
+			int y_sample = p_YSamples[check_sample_buf_ofs(base_ofs + y_sample_base_ofs)];
+
+			int a = base_ofs + y0_base;
+			int cb0_sample = p_C0Samples[check_sample_buf_ofs(a)];
+			int cr0_sample = p_C0Samples[check_sample_buf_ofs(a + 64)];
+
+			int b = base_ofs + y1_base;
+			int cb1_sample = m_pSample_buf[check_sample_buf_ofs(b)]; // lower chroma line is always read from m_pSample_buf
+			int cr1_sample = m_pSample_buf[check_sample_buf_ofs(b + 64)];
+
+			int cb = (cb0_sample * w0 + cb1_sample * w1 + 2) >> 2; // weighted average with rounding
+			int cr = (cr0_sample * w0 + cr1_sample * w1 + 2) >> 2;
+
+			int rc = m_crr[cr];
+			int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+			int bc = m_cbb[cb];
+
+			d0[0] = clamp(y_sample + rc);
+			d0[1] = clamp(y_sample + gc);
+			d0[2] = clamp(y_sample + bc);
+			d0[3] = 255;
+
+			d0 += 4;
+		}
 	}
-}
-
-// Y (1 block per MCU) to 8-bit grayscale
-void jpeg_decoder::gray_convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d = m_pScan_line_0;
-  uint8 *s = m_pSample_buf + row * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    *(uint *)d = *(uint *)s;
-    *(uint *)(&d[4]) = *(uint *)(&s[4]);
-
-    s += 64;
-    d += 8;
-  }
-}
-
-void jpeg_decoder::expanded_convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-
-  uint8* Py = m_pSample_buf + (row / 8) * 64 * m_comp_h_samp[0] + (row & 7) * 8;
-
-  uint8* d = m_pScan_line_0;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int k = 0; k < m_max_mcu_x_size; k += 8)
-    {
-      const int Y_ofs = k * 8;
-      const int Cb_ofs = Y_ofs + 64 * m_expanded_blocks_per_component;
-      const int Cr_ofs = Y_ofs + 64 * m_expanded_blocks_per_component * 2;
-      for (int j = 0; j < 8; j++)
-      {
-        int y = Py[Y_ofs + j];
-        int cb = Py[Cb_ofs + j];
-        int cr = Py[Cr_ofs + j];
-
-        d[0] = clamp(y + m_crr[cr]);
-        d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
-        d[2] = clamp(y + m_cbb[cb]);
-        d[3] = 255;
-
-        d += 4;
-      }
-    }
-
-    Py += 64 * m_expanded_blocks_per_mcu;
-  }
-}
-
-// Find end of image (EOI) marker, so we can return to the user the exact size of the input stream.
-void jpeg_decoder::find_eoi()
-{
-  if (!m_progressive_flag)
-  {
-    // Attempt to read the EOI marker.
-    //get_bits_no_markers(m_bits_left & 7);
-
-    // Prime the bit buffer
-    m_bits_left = 16;
-    get_bits(16);
-    get_bits(16);
-
-    // The next marker _should_ be EOI
-    process_markers();
-  }
-
-  m_total_bytes_read -= m_in_buf_left;
-}
-
-int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len)
-{
-  if ((m_error_code) || (!m_ready_flag))
-    return JPGD_FAILED;
-
-  if (m_total_lines_left == 0)
-    return JPGD_DONE;
-
-  if (m_mcu_lines_left == 0)
-  {
-    if (setjmp(m_jmp_state))
-      return JPGD_FAILED;
-
-    if (m_progressive_flag)
-      load_next_row();
-    else
-      decode_next_row();
-
-    // Find the EOI marker if that was the last row.
-    if (m_total_lines_left <= m_max_mcu_y_size)
-      find_eoi();
-
-    m_mcu_lines_left = m_max_mcu_y_size;
-  }
-
-  if (m_freq_domain_chroma_upsample)
-  {
-    expanded_convert();
-    *pScan_line = m_pScan_line_0;
-  }
-  else
-  {
-    switch (m_scan_type)
-    {
-      case JPGD_YH2V2:
-      {
-        if ((m_mcu_lines_left & 1) == 0)
-        {
-          H2V2Convert();
-          *pScan_line = m_pScan_line_0;
-        }
-        else
-          *pScan_line = m_pScan_line_1;
-
-        break;
-      }
-      case JPGD_YH2V1:
-      {
-        H2V1Convert();
-        *pScan_line = m_pScan_line_0;
-        break;
-      }
-      case JPGD_YH1V2:
-      {
-        if ((m_mcu_lines_left & 1) == 0)
-        {
-          H1V2Convert();
-          *pScan_line = m_pScan_line_0;
-        }
-        else
-          *pScan_line = m_pScan_line_1;
-
-        break;
-      }
-      case JPGD_YH1V1:
-      {
-        H1V1Convert();
-        *pScan_line = m_pScan_line_0;
-        break;
-      }
-      case JPGD_GRAYSCALE:
-      {
-        gray_convert();
-        *pScan_line = m_pScan_line_0;
-
-        break;
-      }
-    }
-  }
-
-  *pScan_line_len = m_real_dest_bytes_per_scan_line;
-
-  m_mcu_lines_left--;
-  m_total_lines_left--;
-
-  return JPGD_SUCCESS;
-}
-
-// Creates the tables needed for efficient Huffman decoding.
-void jpeg_decoder::make_huff_table(int index, huff_tables *pH)
-{
-  int p, i, l, si;
-  uint8 huffsize[257];
-  uint huffcode[257];
-  uint code;
-  uint subtree;
-  int code_size;
-  int lastp;
-  int nextfreeentry;
-  int currententry;
-
-  pH->ac_table = m_huff_ac[index] != 0;
-
-  p = 0;
-
-  for (l = 1; l <= 16; l++)
-  {
-    for (i = 1; i <= m_huff_num[index][l]; i++)
-    {
-      JPGD_ASSERT(p < 257);
-      huffsize[p++] = static_cast<uint8>(l);
-    }
-  }
-
-  huffsize[p] = 0;
-
-  lastp = p;
-
-  code = 0;
-  si = huffsize[0];
-  p = 0;
-
-  while (huffsize[p])
-  {
-    while (huffsize[p] == si)
-    {
-      JPGD_ASSERT(p < 257);
-      huffcode[p++] = code;
-      code++;
-    }
-
-    code <<= 1;
-    si++;
-  }
-
-  memset(pH->look_up, 0, sizeof(pH->look_up));
-  memset(pH->look_up2, 0, sizeof(pH->look_up2));
-  memset(pH->tree, 0, sizeof(pH->tree));
-  memset(pH->code_size, 0, sizeof(pH->code_size));
-
-  nextfreeentry = -1;
-
-  p = 0;
-
-  while (p < lastp)
-  {
-    i = m_huff_val[index][p];
-    code = huffcode[p];
-    code_size = huffsize[p];
-
-    pH->code_size[i] = static_cast<uint8>(code_size);
-
-    if (code_size <= 8)
-    {
-      code <<= (8 - code_size);
-
-      for (l = 1 << (8 - code_size); l > 0; l--)
-      {
-        JPGD_ASSERT(i < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
-        JPGD_ASSERT(code < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
-
-        pH->look_up[code] = i;
-
-        bool has_extrabits = false;
-				int extra_bits = 0;
-        int num_extra_bits = i & 15;
-
-        int bits_to_fetch = code_size;
-        if (num_extra_bits)
-        {
-          int total_codesize = code_size + num_extra_bits;
-          if (total_codesize <= 8)
-          {
-            has_extrabits = true;
-            extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
-            JPGD_ASSERT(extra_bits <= 0x7FFF);
-            bits_to_fetch += num_extra_bits;
-          }
-        }
-
-        if (!has_extrabits)
-          pH->look_up2[code] = i | (bits_to_fetch << 8);
-        else
-          pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
-
-        code++;
-      }
-    }
-    else
-    {
-      subtree = (code >> (code_size - 8)) & 0xFF;
-
-      currententry = pH->look_up[subtree];
-
-      if (currententry == 0)
-      {
-        pH->look_up[subtree] = currententry = nextfreeentry;
-        pH->look_up2[subtree] = currententry = nextfreeentry;
-
-        nextfreeentry -= 2;
-      }
-
-      code <<= (16 - (code_size - 8));
-
-      for (l = code_size; l > 9; l--)
-      {
-        if ((code & 0x8000) == 0)
-          currententry--;
-
-        unsigned int idx = -currententry - 1;
-        JPGD_ASSERT(idx < JPGD_HUFF_TREE_MAX_LENGTH);
-        if (pH->tree[idx] == 0)
-        {
-          pH->tree[idx] = nextfreeentry;
-
-          currententry = nextfreeentry;
-
-          nextfreeentry -= 2;
-        }
-        else {
-          currententry = pH->tree[idx];
-        }
-
-        code <<= 1;
-      }
-
-      if ((code & 0x8000) == 0)
-        currententry--;
-
-      pH->tree[-currententry - 1] = i;
-    }
-
-    p++;
-  }
-}
-
-// Verifies the quantization tables needed for this scan are available.
-void jpeg_decoder::check_quant_tables()
-{
-  for (int i = 0; i < m_comps_in_scan; i++)
-    if (m_quant[m_comp_quant[m_comp_list[i]]] == NULL)
-      stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
-}
-
-// Verifies that all the Huffman tables needed for this scan are available.
-void jpeg_decoder::check_huff_tables()
-{
-  for (int i = 0; i < m_comps_in_scan; i++)
-  {
-    if ((m_spectral_start == 0) && (m_huff_num[m_comp_dc_tab[m_comp_list[i]]] == NULL))
-      stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
-
-    if ((m_spectral_end > 0) && (m_huff_num[m_comp_ac_tab[m_comp_list[i]]] == NULL))
-      stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
-  }
-
-  for (int i = 0; i < JPGD_MAX_HUFF_TABLES; i++)
-    if (m_huff_num[i])
-    {
-      if (!m_pHuff_tabs[i])
-        m_pHuff_tabs[i] = (huff_tables *)alloc(sizeof(huff_tables));
-
-      make_huff_table(i, m_pHuff_tabs[i]);
-    }
-}
-
-// Determines the component order inside each MCU.
-// Also calcs how many MCU's are on each row, etc.
-void jpeg_decoder::calc_mcu_block_order()
-{
-  int component_num, component_id;
-  int max_h_samp = 0, max_v_samp = 0;
-
-  for (component_id = 0; component_id < m_comps_in_frame; component_id++)
-  {
-    if (m_comp_h_samp[component_id] > max_h_samp)
-      max_h_samp = m_comp_h_samp[component_id];
-
-    if (m_comp_v_samp[component_id] > max_v_samp)
-      max_v_samp = m_comp_v_samp[component_id];
-  }
-
-  for (component_id = 0; component_id < m_comps_in_frame; component_id++)
-  {
-    m_comp_h_blocks[component_id] = ((((m_image_x_size * m_comp_h_samp[component_id]) + (max_h_samp - 1)) / max_h_samp) + 7) / 8;
-    m_comp_v_blocks[component_id] = ((((m_image_y_size * m_comp_v_samp[component_id]) + (max_v_samp - 1)) / max_v_samp) + 7) / 8;
-  }
-
-  if (m_comps_in_scan == 1)
-  {
-    m_mcus_per_row = m_comp_h_blocks[m_comp_list[0]];
-    m_mcus_per_col = m_comp_v_blocks[m_comp_list[0]];
-  }
-  else
-  {
-    m_mcus_per_row = (((m_image_x_size + 7) / 8) + (max_h_samp - 1)) / max_h_samp;
-    m_mcus_per_col = (((m_image_y_size + 7) / 8) + (max_v_samp - 1)) / max_v_samp;
-  }
-
-  if (m_comps_in_scan == 1)
-  {
-    m_mcu_org[0] = m_comp_list[0];
-
-    m_blocks_per_mcu = 1;
-  }
-  else
-  {
-    m_blocks_per_mcu = 0;
-
-    for (component_num = 0; component_num < m_comps_in_scan; component_num++)
-    {
-      int num_blocks;
-
-      component_id = m_comp_list[component_num];
-
-      num_blocks = m_comp_h_samp[component_id] * m_comp_v_samp[component_id];
-
-      while (num_blocks--)
-        m_mcu_org[m_blocks_per_mcu++] = component_id;
-    }
-  }
-}
-
-// Starts a new scan.
-int jpeg_decoder::init_scan()
-{
-  if (!locate_sos_marker())
-    return JPGD_FALSE;
-
-  calc_mcu_block_order();
-
-  check_huff_tables();
-
-  check_quant_tables();
-
-  memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
-
-  m_eob_run = 0;
-
-  if (m_restart_interval)
-  {
-    m_restarts_left = m_restart_interval;
-    m_next_restart_num = 0;
-  }
-
-  fix_in_buffer();
-
-  return JPGD_TRUE;
-}
-
-// Starts a frame. Determines if the number of components or sampling factors
-// are supported.
-void jpeg_decoder::init_frame()
-{
-  int i;
-
-  if (m_comps_in_frame == 1)
-  {
-    if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1))
-      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
-
-    m_scan_type = JPGD_GRAYSCALE;
-    m_max_blocks_per_mcu = 1;
-    m_max_mcu_x_size = 8;
-    m_max_mcu_y_size = 8;
-  }
-  else if (m_comps_in_frame == 3)
-  {
-    if ( ((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) ||
-         ((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)) )
-      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
-
-    if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1))
-    {
-      m_scan_type = JPGD_YH1V1;
-
-      m_max_blocks_per_mcu = 3;
-      m_max_mcu_x_size = 8;
-      m_max_mcu_y_size = 8;
-    }
-    else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1))
-    {
-      m_scan_type = JPGD_YH2V1;
-      m_max_blocks_per_mcu = 4;
-      m_max_mcu_x_size = 16;
-      m_max_mcu_y_size = 8;
-    }
-    else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2))
-    {
-      m_scan_type = JPGD_YH1V2;
-      m_max_blocks_per_mcu = 4;
-      m_max_mcu_x_size = 8;
-      m_max_mcu_y_size = 16;
-    }
-    else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2))
-    {
-      m_scan_type = JPGD_YH2V2;
-      m_max_blocks_per_mcu = 6;
-      m_max_mcu_x_size = 16;
-      m_max_mcu_y_size = 16;
-    }
-    else
-      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
-  }
-  else
-    stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
-
-  m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
-  m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
-
-  // These values are for the *destination* pixels: after conversion.
-  if (m_scan_type == JPGD_GRAYSCALE)
-    m_dest_bytes_per_pixel = 1;
-  else
-    m_dest_bytes_per_pixel = 4;
-
-  m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;
-
-  m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
-
-  // Initialize two scan line buffers.
-  m_pScan_line_0 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
-  if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2))
-    m_pScan_line_1 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
-
-  m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
-
-  // Should never happen
-  if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
-    stop_decoding(JPGD_ASSERTION_ERROR);
-
-  // Allocate the coefficient buffer, enough for one MCU
-  m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t));
-
-  for (i = 0; i < m_max_blocks_per_mcu; i++)
-    m_mcu_block_max_zag[i] = 64;
-
-  m_expanded_blocks_per_component = m_comp_h_samp[0] * m_comp_v_samp[0];
-  m_expanded_blocks_per_mcu = m_expanded_blocks_per_component * m_comps_in_frame;
-  m_expanded_blocks_per_row = m_max_mcus_per_row * m_expanded_blocks_per_mcu;
-	// Freq. domain chroma upsampling is only supported for H2V2 subsampling factor (the most common one I've seen).
-  m_freq_domain_chroma_upsample = false;
-#if JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING
-  m_freq_domain_chroma_upsample = (m_expanded_blocks_per_mcu == 4*3);
-#endif
-
-  if (m_freq_domain_chroma_upsample)
-    m_pSample_buf = (uint8 *)alloc(m_expanded_blocks_per_row * 64);
-  else
-    m_pSample_buf = (uint8 *)alloc(m_max_blocks_per_row * 64);
-
-  m_total_lines_left = m_image_y_size;
-
-  m_mcu_lines_left = 0;
-
-  create_look_ups();
-}
-
-// The coeff_buf series of methods originally stored the coefficients
-// into a "virtual" file which was located in EMS, XMS, or a disk file. A cache
-// was used to make this process more efficient. Now, we can store the entire
-// thing in RAM.
-jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
-{
-  coeff_buf* cb = (coeff_buf*)alloc(sizeof(coeff_buf));
-
-  cb->block_num_x = block_num_x;
-  cb->block_num_y = block_num_y;
-  cb->block_len_x = block_len_x;
-  cb->block_len_y = block_len_y;
-  cb->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t);
-  cb->pData = (uint8 *)alloc(cb->block_size * block_num_x * block_num_y, true);
-  return cb;
-}
-
-inline jpgd_block_t *jpeg_decoder::coeff_buf_getp(coeff_buf *cb, int block_x, int block_y)
-{
-  JPGD_ASSERT((block_x < cb->block_num_x) && (block_y < cb->block_num_y));
-  return (jpgd_block_t *)(cb->pData + block_x * cb->block_size + block_y * (cb->block_size * cb->block_num_x));
-}
-
-// The following methods decode the various types of m_blocks encountered
-// in progressively encoded images.
-void jpeg_decoder::decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  int s, r;
-  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
-
-  if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0)
-  {
-    r = pD->get_bits_no_markers(s);
-    s = JPGD_HUFF_EXTEND(r, s);
-  }
-
-  pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
-
-  p[0] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
-}
-
-void jpeg_decoder::decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  if (pD->get_bits_no_markers(1))
-  {
-    jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
-
-    p[0] |= (1 << pD->m_successive_low);
-  }
-}
-
-void jpeg_decoder::decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  int k, s, r;
-
-  if (pD->m_eob_run)
-  {
-    pD->m_eob_run--;
-    return;
-  }
-
-  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
-
-  for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++)
-  {
-    unsigned int idx = pD->m_comp_ac_tab[component_id];
-    JPGD_ASSERT(idx < JPGD_MAX_HUFF_TABLES);
-    s = pD->huff_decode(pD->m_pHuff_tabs[idx]);
-
-    r = s >> 4;
-    s &= 15;
-
-    if (s)
-    {
-      if ((k += r) > 63)
-        pD->stop_decoding(JPGD_DECODE_ERROR);
-
-      r = pD->get_bits_no_markers(s);
-      s = JPGD_HUFF_EXTEND(r, s);
-
-      p[g_ZAG[k]] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
-    }
-    else
-    {
-      if (r == 15)
-      {
-        if ((k += 15) > 63)
-          pD->stop_decoding(JPGD_DECODE_ERROR);
-      }
-      else
-      {
-        pD->m_eob_run = 1 << r;
-
-        if (r)
-          pD->m_eob_run += pD->get_bits_no_markers(r);
-
-        pD->m_eob_run--;
-
-        break;
-      }
-    }
-  }
-}
-
-void jpeg_decoder::decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  int s, k, r;
-  int p1 = 1 << pD->m_successive_low;
-  int m1 = (-1) << pD->m_successive_low;
-  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
-  JPGD_ASSERT(pD->m_spectral_end <= 63);
-  
-  k = pD->m_spectral_start;
-  
-  if (pD->m_eob_run == 0)
-  {
-    for ( ; k <= pD->m_spectral_end; k++)
-    {
-      unsigned int idx = pD->m_comp_ac_tab[component_id];
-      JPGD_ASSERT(idx < JPGD_MAX_HUFF_TABLES);
-      s = pD->huff_decode(pD->m_pHuff_tabs[idx]);
-
-      r = s >> 4;
-      s &= 15;
-
-      if (s)
-      {
-        if (s != 1)
-          pD->stop_decoding(JPGD_DECODE_ERROR);
-
-        if (pD->get_bits_no_markers(1))
-          s = p1;
-        else
-          s = m1;
-      }
-      else
-      {
-        if (r != 15)
-        {
-          pD->m_eob_run = 1 << r;
-
-          if (r)
-            pD->m_eob_run += pD->get_bits_no_markers(r);
-
-          break;
-        }
-      }
-
-      do
-      {
-        jpgd_block_t *this_coef = p + g_ZAG[k & 63];
-
-        if (*this_coef != 0)
-        {
-          if (pD->get_bits_no_markers(1))
-          {
-            if ((*this_coef & p1) == 0)
-            {
-              if (*this_coef >= 0)
-                *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
-              else
-                *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
-            }
-          }
-        }
-        else
-        {
-          if (--r < 0)
-            break;
-        }
-
-        k++;
-
-      } while (k <= pD->m_spectral_end);
-
-      if ((s) && (k < 64))
-      {
-        p[g_ZAG[k]] = static_cast<jpgd_block_t>(s);
-      }
-    }
-  }
-
-  if (pD->m_eob_run > 0)
-  {
-    for ( ; k <= pD->m_spectral_end; k++)
-    {
-      jpgd_block_t *this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
-
-      if (*this_coef != 0)
-      {
-        if (pD->get_bits_no_markers(1))
-        {
-          if ((*this_coef & p1) == 0)
-          {
-            if (*this_coef >= 0)
-              *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
-            else
-              *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
-          }
-        }
-      }
-    }
-
-    pD->m_eob_run--;
-  }
-}
-
-// Decode a scan in a progressively encoded image.
-void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
-{
-  int mcu_row, mcu_col, mcu_block;
-  int block_x_mcu[JPGD_MAX_COMPONENTS], m_block_y_mcu[JPGD_MAX_COMPONENTS];
-
-  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
-
-  for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++)
-  {
-    int component_num, component_id;
-
-    memset(block_x_mcu, 0, sizeof(block_x_mcu));
-
-    for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
-    {
-      int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
-
-      if ((m_restart_interval) && (m_restarts_left == 0))
-        process_restart();
-
-      for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
-      {
-        component_id = m_mcu_org[mcu_block];
-
-        decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
-
-        if (m_comps_in_scan == 1)
-          block_x_mcu[component_id]++;
-        else
-        {
-          if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
-          {
-            block_x_mcu_ofs = 0;
-
-            if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
-            {
-              block_y_mcu_ofs = 0;
-              block_x_mcu[component_id] += m_comp_h_samp[component_id];
-            }
-          }
-        }
-      }
-
-      m_restarts_left--;
-    }
-
-    if (m_comps_in_scan == 1)
-      m_block_y_mcu[m_comp_list[0]]++;
-    else
-    {
-      for (component_num = 0; component_num < m_comps_in_scan; component_num++)
-      {
-        component_id = m_comp_list[component_num];
-        m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
-      }
-    }
-  }
-}
-
-// Decode a progressively encoded image.
-void jpeg_decoder::init_progressive()
-{
-  int i;
-
-  if (m_comps_in_frame == 4)
-    stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
-
-  // Allocate the coefficient buffers.
-  for (i = 0; i < m_comps_in_frame; i++)
-  {
-    m_dc_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 1, 1);
-    m_ac_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 8, 8);
-  }
-
-  for ( ; ; )
-  {
-    int dc_only_scan, refinement_scan;
-    pDecode_block_func decode_block_func;
-
-    if (!init_scan())
-      break;
-
-    dc_only_scan = (m_spectral_start == 0);
-    refinement_scan = (m_successive_high != 0);
-
-    if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63))
-      stop_decoding(JPGD_BAD_SOS_SPECTRAL);
-
-    if (dc_only_scan)
-    {
-      if (m_spectral_end)
-        stop_decoding(JPGD_BAD_SOS_SPECTRAL);
-    }
-    else if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
-      stop_decoding(JPGD_BAD_SOS_SPECTRAL);
-
-    if ((refinement_scan) && (m_successive_low != m_successive_high - 1))
-      stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
-
-    if (dc_only_scan)
-    {
-      if (refinement_scan)
-        decode_block_func = decode_block_dc_refine;
-      else
-        decode_block_func = decode_block_dc_first;
-    }
-    else
-    {
-      if (refinement_scan)
-        decode_block_func = decode_block_ac_refine;
-      else
-        decode_block_func = decode_block_ac_first;
-    }
-
-    decode_scan(decode_block_func);
-
-    m_bits_left = 16;
-    get_bits(16);
-    get_bits(16);
-  }
-
-  m_comps_in_scan = m_comps_in_frame;
-
-  for (i = 0; i < m_comps_in_frame; i++)
-    m_comp_list[i] = i;
-
-  calc_mcu_block_order();
-}
-
-void jpeg_decoder::init_sequential()
-{
-  if (!init_scan())
-    stop_decoding(JPGD_UNEXPECTED_MARKER);
-}
-
-void jpeg_decoder::decode_start()
-{
-  init_frame();
-
-  if (m_progressive_flag)
-    init_progressive();
-  else
-    init_sequential();
-}
-
-void jpeg_decoder::decode_init(jpeg_decoder_stream *pStream)
-{
-  init(pStream);
-  locate_sof_marker();
-}
-
-jpeg_decoder::jpeg_decoder(jpeg_decoder_stream *pStream)
-{
-  if (setjmp(m_jmp_state))
-    return;
-  decode_init(pStream);
-}
-
-int jpeg_decoder::begin_decoding()
-{
-  if (m_ready_flag)
-    return JPGD_SUCCESS;
-
-  if (m_error_code)
-    return JPGD_FAILED;
-
-  if (setjmp(m_jmp_state))
-    return JPGD_FAILED;
-
-  decode_start();
-
-  m_ready_flag = true;
-
-  return JPGD_SUCCESS;
-}
-
-jpeg_decoder::~jpeg_decoder()
-{
-  free_all_blocks();
-}
-
-jpeg_decoder_file_stream::jpeg_decoder_file_stream()
-{
-  m_pFile = NULL;
-  m_eof_flag = false;
-  m_error_flag = false;
-}
-
-void jpeg_decoder_file_stream::close()
-{
-  if (m_pFile)
-  {
-    fclose(m_pFile);
-    m_pFile = NULL;
-  }
-
-  m_eof_flag = false;
-  m_error_flag = false;
-}
-
-jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
-{
-  close();
-}
-
-bool jpeg_decoder_file_stream::open(const char *Pfilename)
-{
-  close();
-
-  m_eof_flag = false;
-  m_error_flag = false;
+
+	// YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB. Box chroma path: fills m_pScan_line_0 and m_pScan_line_1 from two sample rows, reusing each Cb/Cr sample for a 2x2 group of Y samples; 4 bytes per pixel, 4th byte always 255.
+	void jpeg_decoder::H2V2Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+		uint8* d1 = m_pScan_line_1;
+		uint8* y;
+		uint8* c;
+		// Rows below 8 are read starting at the first block of the MCU; later rows start two blocks (64 * 2 samples) further on.
+		if (row < 8)
+			y = m_pSample_buf + row * 8;
+		else
+			y = m_pSample_buf + 64 * 2 + (row & 7) * 8;
+		// Cb is read from the fifth block (offset 64 * 4), Cr 64 samples after Cb; one chroma row serves two output rows.
+		c = m_pSample_buf + 64 * 4 + (row >> 1) * 8;
+		// One iteration per MCU: 2 blocks x 4 chroma samples x 2 pixels = 16 pixels on each output line.
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int l = 0; l < 2; l++)
+			{
+				for (int j = 0; j < 8; j += 2)
+				{
+					int cb = c[0];
+					int cr = c[64];
+					// Chroma contributions (table lookups) shared by all four pixels of this 2x2 group.
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+					// Top-left pixel.
+					int yy = y[j];
+					d0[0] = clamp(yy + rc);
+					d0[1] = clamp(yy + gc);
+					d0[2] = clamp(yy + bc);
+					d0[3] = 255;
+					// Top-right pixel.
+					yy = y[j + 1];
+					d0[4] = clamp(yy + rc);
+					d0[5] = clamp(yy + gc);
+					d0[6] = clamp(yy + bc);
+					d0[7] = 255;
+					// Bottom-left pixel: the next sample row is 8 samples further on in the same block.
+					yy = y[j + 8];
+					d1[0] = clamp(yy + rc);
+					d1[1] = clamp(yy + gc);
+					d1[2] = clamp(yy + bc);
+					d1[3] = 255;
+					// Bottom-right pixel.
+					yy = y[j + 8 + 1];
+					d1[4] = clamp(yy + rc);
+					d1[5] = clamp(yy + gc);
+					d1[6] = clamp(yy + bc);
+					d1[7] = 255;
+
+					d0 += 8;
+					d1 += 8;
+
+					c++;
+				}
+				y += 64; // continue with the next block of the same MCU
+			}
+			// Both pointers have moved within this MCU (y by 2 blocks, c by 8 samples); step to the same position in the next MCU (6 blocks per MCU).
+			y += 64 * 6 - 64 * 2;
+			c += 64 * 6 - 8;
+		}
+	}
+	// YCbCr H2V2 to RGBA with filtered chroma: each pixel blends four neighbouring Cb/Cr samples using the s_muls weights. Returns the number of scanlines written: 2 (m_pScan_line_0 and m_pScan_line_1) or 1 (m_pScan_line_0 only).
+	uint32_t jpeg_decoder::H2V2ConvertFiltered()
+	{
+		const uint BLOCKS_PER_MCU = 6;
+		int y = m_image_y_size - m_total_lines_left; // index of the scanline being produced
+		int row = y & 15; // position within the current group of 16 scanlines
+
+		const int half_image_y_size = (m_image_y_size >> 1) - 1; // upper clamp for the chroma row index
+
+		uint8* d0 = m_pScan_line_0;
+		// The two chroma rows blended for this scanline; c_y0 is negative only for y == 0.
+		int c_y0 = (y - 1) >> 1;
+		int c_y1 = JPGD_MIN(c_y0 + 1, half_image_y_size);
+		// On the first/last row of a 16-row group the c_y0 chroma row is taken from m_pSample_buf_prev; on row 15 the luma is too (decode() has by then already fetched the next MCU row, which swaps the buffers).
+		const uint8_t* p_YSamples = m_pSample_buf;
+		const uint8_t* p_C0Samples = m_pSample_buf;
+		if ((c_y0 >= 0) && (((row & 15) == 0) || ((row & 15) == 15)) && (m_total_lines_left > 1))
+		{
+			assert(y > 0);
+			assert(m_sample_buf_prev_valid);
+
+			if ((row & 15) == 15)
+				p_YSamples = m_pSample_buf_prev;
+
+			p_C0Samples = m_pSample_buf_prev;
+		}
+		// Sample-buffer offsets: luma rows 8-15 sit 128 samples further on; Cb rows start at offset 256 and the matching Cr sample is read 64 after Cb.
+		const int y_sample_base_ofs = ((row & 8) ? 128 : 0) + (row & 7) * 8;
+		const int y0_base = (c_y0 & 7) * 8 + 256;
+		const int y1_base = (c_y1 & 7) * 8 + 256;
+
+		const int half_image_x_size = (m_image_x_size >> 1) - 1; // upper clamp for the chroma column index
+		// Blend weights in 1/16ths, indexed [row & 1][x & 1] and applied in the order (c00, c01, c10, c11); each set sums to 16.
+		static const uint8_t s_muls[2][2][4] =
+		{
+			{ { 1, 3, 3, 9 }, { 3, 9, 1, 3 }, },
+			{ { 3, 1, 9, 3 }, { 9, 3, 3, 1 } }
+		};
+		// Rows 1..14: emit this scanline and the next together, since both use the same pair of chroma rows.
+		if (((row & 15) >= 1) && ((row & 15) <= 14))
+		{
+			assert((row & 1) == 1);
+			assert(((y + 1 - 1) >> 1) == c_y0);
+
+			assert(p_YSamples == m_pSample_buf);
+			assert(p_C0Samples == m_pSample_buf);
+
+			uint8* d1 = m_pScan_line_1;
+			const int y_sample_base_ofs1 = (((row + 1) & 8) ? 128 : 0) + ((row + 1) & 7) * 8;
+
+			for (int x = 0; x < m_image_x_size; x++)
+			{
+				int k = (x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7); // luma offset: MCU, then left/right Y block, then column
+				int y_sample0 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs)];
+				int y_sample1 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs1)];
+				// The two chroma columns blended for this pixel, clamped to [0, half_image_x_size].
+				int c_x0 = (x - 1) >> 1;
+				int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size);
+				c_x0 = JPGD_MAX(c_x0, 0);
+				// Column c_x0: row c_y0 via p_C0Samples, row c_y1 from the current buffer.
+				int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7);
+				int cb00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base)];
+				int cr00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base + 64)];
+
+				int cb01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base)];
+				int cr01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base + 64)];
+				// Column c_x1, same two rows.
+				int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7);
+				int cb10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base)];
+				int cr10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base + 64)];
+
+				int cb11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base)];
+				int cr11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base + 64)];
+				// Pixel x on the first output line; + 8 rounds before the >> 4 division by 16.
+				{
+					const uint8_t* pMuls = &s_muls[row & 1][x & 1][0];
+					int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+					int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					d0[0] = clamp(y_sample0 + rc);
+					d0[1] = clamp(y_sample0 + gc);
+					d0[2] = clamp(y_sample0 + bc);
+					d0[3] = 255;
+
+					d0 += 4;
+				}
+				// Pixel x on the second output line: same chroma samples, weights for row + 1.
+				{
+					const uint8_t* pMuls = &s_muls[(row + 1) & 1][x & 1][0];
+					int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+					int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					d1[0] = clamp(y_sample1 + rc);
+					d1[1] = clamp(y_sample1 + gc);
+					d1[2] = clamp(y_sample1 + bc);
+					d1[3] = 255;
+
+					d1 += 4;
+				}
+				// For odd x the following pixel reuses the same four chroma samples, so it is converted here as well and then skipped by the extra ++x.
+				if (((x & 1) == 1) && (x < m_image_x_size - 1))
+				{
+					const int nx = x + 1;
+					assert(c_x0 == (nx - 1) >> 1);
+
+					k = (nx >> 4) * BLOCKS_PER_MCU * 64 + ((nx & 8) ? 64 : 0) + (nx & 7);
+					y_sample0 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs)];
+					y_sample1 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs1)];
+
+					{
+						const uint8_t* pMuls = &s_muls[row & 1][nx & 1][0];
+						int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+						int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+						int rc = m_crr[cr];
+						int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+						int bc = m_cbb[cb];
+
+						d0[0] = clamp(y_sample0 + rc);
+						d0[1] = clamp(y_sample0 + gc);
+						d0[2] = clamp(y_sample0 + bc);
+						d0[3] = 255;
+
+						d0 += 4;
+					}
+
+					{
+						const uint8_t* pMuls = &s_muls[(row + 1) & 1][nx & 1][0];
+						int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+						int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+						int rc = m_crr[cr];
+						int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+						int bc = m_cbb[cb];
+
+						d1[0] = clamp(y_sample1 + rc);
+						d1[1] = clamp(y_sample1 + gc);
+						d1[2] = clamp(y_sample1 + bc);
+						d1[3] = 255;
+
+						d1 += 4;
+					}
+
+					++x;
+				}
+			}
+
+			return 2;
+		}
+		else // rows 0 and 15 of a group: emit a single scanline into m_pScan_line_0
+		{
+			for (int x = 0; x < m_image_x_size; x++)
+			{
+				int y_sample = p_YSamples[check_sample_buf_ofs((x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7) + y_sample_base_ofs)];
+				// Chroma column pair and the four samples, gathered exactly as in the two-line path.
+				int c_x0 = (x - 1) >> 1;
+				int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size);
+				c_x0 = JPGD_MAX(c_x0, 0);
+
+				int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7);
+				int cb00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base)];
+				int cr00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base + 64)];
+
+				int cb01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base)];
+				int cr01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base + 64)];
+
+				int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7);
+				int cb10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base)];
+				int cr10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base + 64)];
+
+				int cb11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base)];
+				int cr11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base + 64)];
+
+				const uint8_t* pMuls = &s_muls[row & 1][x & 1][0];
+				int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+				int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+				int rc = m_crr[cr];
+				int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+				int bc = m_cbb[cb];
+
+				d0[0] = clamp(y_sample + rc);
+				d0[1] = clamp(y_sample + gc);
+				d0[2] = clamp(y_sample + bc);
+				d0[3] = 255;
+
+				d0 += 4;
+			}
+
+			return 1;
+		}
+	}
+
+	// Y (1 block per MCU) to 8-bit grayscale: copies sample row (m_max_mcu_y_size - m_mcu_lines_left) of every MCU's block into m_pScan_line_0, 8 bytes per MCU.
+	void jpeg_decoder::gray_convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d = m_pScan_line_0;
+		uint8* s = m_pSample_buf + row * 8;
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			// memcpy instead of two *(uint*) loads/stores: copies the same 8 bytes without type-punning uint8 storage through uint* (strict-aliasing UB).
+			memcpy(d, s, 8);
+
+			s += 64; // next block (64 samples per block)
+			d += 8;
+		}
+	}
+
+	// Find end of image (EOI) marker, so we can return to the user the exact size of the input stream. Called by decode_next_mcu_row() after the last MCU row has been fetched.
+	void jpeg_decoder::find_eoi()
+	{
+		if (!m_progressive_flag) // the marker search is only done for non-progressive images
+		{
+			// Attempt to read the EOI marker.
+			//get_bits_no_markers(m_bits_left & 7);
+
+			// Prime the bit buffer
+			m_bits_left = 16;
+			get_bits(16);
+			get_bits(16);
+
+			// The next marker _should_ be EOI
+			process_markers();
+		}
+		// Take the m_in_buf_left bytes still sitting in the input buffer back out of the running total.
+		m_total_bytes_read -= m_in_buf_left;
+	}
+	// Fetches the next MCU row into m_pSample_buf and resets m_mcu_lines_left to m_max_mcu_y_size. Returns 0 on success, JPGD_FAILED if control comes back through m_jmp_state.
+	int jpeg_decoder::decode_next_mcu_row()
+	{
+		if (setjmp(m_jmp_state)) // NOTE(review): presumably stop_decoding() longjmps to this point - confirm
+			return JPGD_FAILED;
+		// With vertically filtered chroma (H2V2/H1V2 and box filtering off) the row decoded last time is kept as m_pSample_buf_prev.
+		const bool chroma_y_filtering = ((m_flags & cFlagBoxChromaFiltering) == 0) && ((m_scan_type == JPGD_YH2V2) || (m_scan_type == JPGD_YH1V2));
+		if (chroma_y_filtering)
+		{
+			std::swap(m_pSample_buf, m_pSample_buf_prev);
+
+			m_sample_buf_prev_valid = true;
+		}
+
+		if (m_progressive_flag)
+			load_next_row();
+		else
+			decode_next_row();
+
+		// Find the EOI marker if that was the last row.
+		if (m_total_lines_left <= m_max_mcu_y_size)
+			find_eoi();
+
+		m_mcu_lines_left = m_max_mcu_y_size;
+		return 0;
+	}
+	// Produces the next scanline: on JPGD_SUCCESS *pScan_line points at an internal scanline buffer and *pScan_line_len is m_real_dest_bytes_per_scan_line. Returns JPGD_DONE when no lines are left and JPGD_FAILED on error or if decoding was not started.
+	int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len)
+	{
+		if ((m_error_code) || (!m_ready_flag))
+			return JPGD_FAILED;
+
+		if (m_total_lines_left == 0)
+			return JPGD_DONE;
+
+		const bool chroma_y_filtering = ((m_flags & cFlagBoxChromaFiltering) == 0) && ((m_scan_type == JPGD_YH2V2) || (m_scan_type == JPGD_YH1V2));
+		// With vertical chroma filtering the next MCU row is fetched one scanline early (while the last line of the current row is still pending), and on the very first call.
+		bool get_another_mcu_row = false;
+		bool got_mcu_early = false;
+		if (chroma_y_filtering)
+		{
+			if (m_total_lines_left == m_image_y_size)
+				get_another_mcu_row = true;
+			else if ((m_mcu_lines_left == 1) && (m_total_lines_left > 1))
+			{
+				get_another_mcu_row = true;
+				got_mcu_early = true;
+			}
+		}
+		else
+		{
+			get_another_mcu_row = (m_mcu_lines_left == 0);
+		}
+
+		if (get_another_mcu_row)
+		{
+			int status = decode_next_mcu_row();
+			if (status != 0)
+				return status;
+		}
+		// Convert the current sample row(s) to output pixels according to the scan type.
+		switch (m_scan_type)
+		{
+		case JPGD_YH2V2:
+		{
+			if ((m_flags & cFlagBoxChromaFiltering) == 0)
+			{
+				if (m_num_buffered_scanlines == 1) // second line of a pair produced by the previous call
+				{
+					*pScan_line = m_pScan_line_1;
+				}
+				else if (m_num_buffered_scanlines == 0)
+				{
+					m_num_buffered_scanlines = H2V2ConvertFiltered();
+					*pScan_line = m_pScan_line_0;
+				}
+
+				m_num_buffered_scanlines--;
+			}
+			else
+			{
+				if ((m_mcu_lines_left & 1) == 0) // H2V2Convert() fills both scanline buffers; the odd call hands out the second
+				{
+					H2V2Convert();
+					*pScan_line = m_pScan_line_0;
+				}
+				else
+					*pScan_line = m_pScan_line_1;
+			}
+
+			break;
+		}
+		case JPGD_YH2V1:
+		{
+			if ((m_flags & cFlagBoxChromaFiltering) == 0)
+				H2V1ConvertFiltered();
+			else
+				H2V1Convert();
+			*pScan_line = m_pScan_line_0;
+			break;
+		}
+		case JPGD_YH1V2:
+		{
+			if (chroma_y_filtering)
+			{
+				H1V2ConvertFiltered();
+				*pScan_line = m_pScan_line_0;
+			}
+			else
+			{
+				if ((m_mcu_lines_left & 1) == 0)
+				{
+					H1V2Convert();
+					*pScan_line = m_pScan_line_0;
+				}
+				else
+					*pScan_line = m_pScan_line_1;
+			}
+
+			break;
+		}
+		case JPGD_YH1V1:
+		{
+			H1V1Convert();
+			*pScan_line = m_pScan_line_0;
+			break;
+		}
+		case JPGD_GRAYSCALE:
+		{
+			gray_convert();
+			*pScan_line = m_pScan_line_0;
+
+			break;
+		}
+		}
+
+		*pScan_line_len = m_real_dest_bytes_per_scan_line;
+		// After an early fetch decode_next_mcu_row() has just reset m_mcu_lines_left for the new row, so it is not decremented for this line.
+		if (!got_mcu_early)
+		{
+			m_mcu_lines_left--;
+		}
+
+		m_total_lines_left--;
+
+		return JPGD_SUCCESS;
+	}
+
+	// Creates the tables needed for efficient Huffman decoding: fills pH->look_up/look_up2 (indexed by 8 code bits), pH->tree (codes longer than 8 bits) and pH->code_size from m_huff_num[index] (codes per length) and m_huff_val[index] (symbols). Calls stop_decoding(JPGD_DECODE_ERROR) on inconsistent tables.
+	void jpeg_decoder::make_huff_table(int index, huff_tables* pH)
+	{
+		int p, i, l, si;
+		uint8 huffsize[258];
+		uint huffcode[258];
+		uint code;
+		uint subtree;
+		int code_size;
+		int lastp;
+		int nextfreeentry;
+		int currententry;
+
+		pH->ac_table = m_huff_ac[index] != 0;
+		// Step 1: expand the per-length counts into one code size per symbol, terminated by a 0 entry.
+		p = 0;
+
+		for (l = 1; l <= 16; l++)
+		{
+			for (i = 1; i <= m_huff_num[index][l]; i++)
+			{
+				if (p >= 257)
+					stop_decoding(JPGD_DECODE_ERROR);
+				huffsize[p++] = static_cast<uint8>(l);
+			}
+		}
+
+		assert(p < 258);
+		huffsize[p] = 0;
+
+		lastp = p;
+		// Step 2: give symbols of equal size consecutive code values; the running code is doubled each time the size grows by one bit.
+		code = 0;
+		si = huffsize[0];
+		p = 0;
+
+		while (huffsize[p])
+		{
+			while (huffsize[p] == si)
+			{
+				if (p >= 257)
+					stop_decoding(JPGD_DECODE_ERROR);
+				huffcode[p++] = code;
+				code++;
+			}
+
+			code <<= 1;
+			si++;
+		}
+
+		memset(pH->look_up, 0, sizeof(pH->look_up));
+		memset(pH->look_up2, 0, sizeof(pH->look_up2));
+		memset(pH->tree, 0, sizeof(pH->tree));
+		memset(pH->code_size, 0, sizeof(pH->code_size));
+		// Step 3: fill the lookup structures. Negative entries are links: entry e refers to slot tree[-e - 1], and slots are handed out in pairs.
+		nextfreeentry = -1;
+
+		p = 0;
+
+		while (p < lastp)
+		{
+			i = m_huff_val[index][p];
+
+			code = huffcode[p];
+			code_size = huffsize[p];
+
+			assert(i < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
+			pH->code_size[i] = static_cast<uint8>(code_size);
+			// Codes of up to 8 bits: every 8-bit value that starts with this code maps directly to the symbol.
+			if (code_size <= 8)
+			{
+				code <<= (8 - code_size);
+
+				for (l = 1 << (8 - code_size); l > 0; l--)
+				{
+					if (code >= 256)
+						stop_decoding(JPGD_DECODE_ERROR);
+
+					pH->look_up[code] = i;
+					// look_up2 also packs the bit count (from bit 8) and, when the code plus its extra bits fit in 8 bits, the flag 0x8000 and those extra bits (from bit 16).
+					bool has_extrabits = false;
+					int extra_bits = 0;
+					int num_extra_bits = i & 15;
+
+					int bits_to_fetch = code_size;
+					if (num_extra_bits)
+					{
+						int total_codesize = code_size + num_extra_bits;
+						if (total_codesize <= 8)
+						{
+							has_extrabits = true;
+							extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
+
+							if (extra_bits > 0x7FFF)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							bits_to_fetch += num_extra_bits;
+						}
+					}
+
+					if (!has_extrabits)
+						pH->look_up2[code] = i | (bits_to_fetch << 8);
+					else
+						pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
+
+					code++;
+				}
+			}
+			else // longer codes: the first 8 bits select a subtree, the remaining bits walk tree[]
+			{
+				subtree = (code >> (code_size - 8)) & 0xFF;
+
+				currententry = pH->look_up[subtree];
+				// 0 means this 8-bit prefix has no subtree yet: link it to a fresh pair of tree slots.
+				if (currententry == 0)
+				{
+					pH->look_up[subtree] = currententry = nextfreeentry;
+					pH->look_up2[subtree] = currententry = nextfreeentry;
+
+					nextfreeentry -= 2;
+				}
+				// Move the bits after the 8-bit prefix up so that the next one to test is always bit 15 (0x8000).
+				code <<= (16 - (code_size - 8));
+
+				for (l = code_size; l > 9; l--)
+				{
+					if ((code & 0x8000) == 0) // a 0 bit selects the second slot of the pair
+						currententry--;
+
+					unsigned int idx = -currententry - 1;
+
+					if (idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+						stop_decoding(JPGD_DECODE_ERROR);
+
+					if (pH->tree[idx] == 0)
+					{
+						pH->tree[idx] = nextfreeentry;
+
+						currententry = nextfreeentry;
+
+						nextfreeentry -= 2;
+					}
+					else
+					{
+						currententry = pH->tree[idx];
+					}
+
+					code <<= 1;
+				}
+
+				if ((code & 0x8000) == 0)
+					currententry--;
+				// Unsigned compare, as in the loop above: a non-negative currententry (a slot that already holds a symbol) wraps to a huge index and is rejected instead of writing tree[] at a negative index.
+				if (static_cast<unsigned int>(-currententry - 1) >= JPGD_HUFF_TREE_MAX_LENGTH)
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				pH->tree[-currententry - 1] = i;
+			}
+
+			p++;
+		}
+	}
+
+	// Verifies the quantization tables needed for this scan are available; calls stop_decoding(JPGD_UNDEFINED_QUANT_TABLE) if one is missing.
+	void jpeg_decoder::check_quant_tables()
+	{
+		for (int i = 0; i < m_comps_in_scan; i++)
+			if (m_quant[m_comp_quant[m_comp_list[i]]] == nullptr) // scan slot -> component id -> table selector -> table pointer
+				stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
+	}
+
+	// Verifies that all the Huffman tables needed for this scan are available (stop_decoding(JPGD_UNDEFINED_HUFF_TABLE) otherwise), then (re)builds the decoding tables for every defined Huffman table.
+	void jpeg_decoder::check_huff_tables()
+	{
+		for (int i = 0; i < m_comps_in_scan; i++)
+		{
+			if ((m_spectral_start == 0) && (m_huff_num[m_comp_dc_tab[m_comp_list[i]]] == nullptr)) // the DC table is required when the scan starts at coefficient 0
+				stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+
+			if ((m_spectral_end > 0) && (m_huff_num[m_comp_ac_tab[m_comp_list[i]]] == nullptr)) // the AC table is required when the scan extends past coefficient 0
+				stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+		}
+		// A huff_tables object is allocated the first time a table slot is seen and reused afterwards.
+		for (int i = 0; i < JPGD_MAX_HUFF_TABLES; i++)
+			if (m_huff_num[i])
+			{
+				if (!m_pHuff_tabs[i])
+					m_pHuff_tabs[i] = (huff_tables*)alloc(sizeof(huff_tables));
+
+				make_huff_table(i, m_pHuff_tabs[i]);
+			}
+	}
+
+	// Determines the component order inside each MCU.
+	// Also calcs how many MCU's are on each row, etc. Returns false if the scan needs more blocks per MCU than m_max_blocks_per_mcu, or if m_mcu_org holds a component id >= JPGD_MAX_QUANT_TABLES.
+	bool jpeg_decoder::calc_mcu_block_order()
+	{
+		int component_num, component_id;
+		int max_h_samp = 0, max_v_samp = 0;
+		// Largest horizontal/vertical sampling factors over all components of the frame.
+		for (component_id = 0; component_id < m_comps_in_frame; component_id++)
+		{
+			if (m_comp_h_samp[component_id] > max_h_samp)
+				max_h_samp = m_comp_h_samp[component_id];
+
+			if (m_comp_v_samp[component_id] > max_v_samp)
+				max_v_samp = m_comp_v_samp[component_id];
+		}
+		// Per component: image size scaled by samp / max_samp (rounded up), then divided into 8-sample blocks (rounded up).
+		for (component_id = 0; component_id < m_comps_in_frame; component_id++)
+		{
+			m_comp_h_blocks[component_id] = ((((m_image_x_size * m_comp_h_samp[component_id]) + (max_h_samp - 1)) / max_h_samp) + 7) / 8;
+			m_comp_v_blocks[component_id] = ((((m_image_y_size * m_comp_v_samp[component_id]) + (max_v_samp - 1)) / max_v_samp) + 7) / 8;
+		}
+		// A single-component scan has one block per MCU, so its MCU counts equal that component's block counts.
+		if (m_comps_in_scan == 1)
+		{
+			m_mcus_per_row = m_comp_h_blocks[m_comp_list[0]];
+			m_mcus_per_col = m_comp_v_blocks[m_comp_list[0]];
+		}
+		else
+		{
+			m_mcus_per_row = (((m_image_x_size + 7) / 8) + (max_h_samp - 1)) / max_h_samp;
+			m_mcus_per_col = (((m_image_y_size + 7) / 8) + (max_v_samp - 1)) / max_v_samp;
+		}
+		// m_mcu_org lists, block by block, which component each block of an MCU belongs to.
+		if (m_comps_in_scan == 1)
+		{
+			m_mcu_org[0] = m_comp_list[0];
+
+			m_blocks_per_mcu = 1;
+		}
+		else
+		{
+			m_blocks_per_mcu = 0;
+
+			for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+			{
+				int num_blocks;
+
+				component_id = m_comp_list[component_num];
+				// Each component contributes h_samp * v_samp consecutive blocks.
+				num_blocks = m_comp_h_samp[component_id] * m_comp_v_samp[component_id];
+
+				while (num_blocks--)
+					m_mcu_org[m_blocks_per_mcu++] = component_id;
+			}
+		}
+
+		if (m_blocks_per_mcu > m_max_blocks_per_mcu)
+			return false;
+
+		for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+		{
+			int comp_id = m_mcu_org[mcu_block];
+			if (comp_id >= JPGD_MAX_QUANT_TABLES)
+				return false;
+		}
+
+		return true;
+	}
+
+	// Starts a new scan. Returns JPGD_FALSE if locate_sos_marker() or calc_mcu_block_order() fails, JPGD_TRUE once the scan state is set up.
+	int jpeg_decoder::init_scan()
+	{
+		if (!locate_sos_marker())
+			return JPGD_FALSE;
+
+		if (!calc_mcu_block_order())
+			return JPGD_FALSE;
+
+		check_huff_tables();
+
+		check_quant_tables();
+		// Reset the per-component DC values and the end-of-band run for the new scan.
+		memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+
+		m_eob_run = 0;
+
+		if (m_restart_interval)
+		{
+			m_restarts_left = m_restart_interval;
+			m_next_restart_num = 0;
+		}
+
+		fix_in_buffer();
+
+		return JPGD_TRUE;
+	}
+
+	// Starts a frame. Determines if the number of components or sampling factors
+	// are supported, derives the scan type and MCU geometry, and allocates the scanline, coefficient and sample buffers.
+	void jpeg_decoder::init_frame()
+	{
+		int i;
+		// One component: grayscale, only 1x1 sampling accepted.
+		if (m_comps_in_frame == 1)
+		{
+			if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1))
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+
+			m_scan_type = JPGD_GRAYSCALE;
+			m_max_blocks_per_mcu = 1;
+			m_max_mcu_x_size = 8;
+			m_max_mcu_y_size = 8;
+		}
+		else if (m_comps_in_frame == 3)
+		{
+			if (((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) ||
+				((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)))
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+			// Components 1 and 2 must be 1x1; the sampling factors of component 0 select the scan type.
+			if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1))
+			{
+				m_scan_type = JPGD_YH1V1;
+
+				m_max_blocks_per_mcu = 3;
+				m_max_mcu_x_size = 8;
+				m_max_mcu_y_size = 8;
+			}
+			else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1))
+			{
+				m_scan_type = JPGD_YH2V1;
+				m_max_blocks_per_mcu = 4;
+				m_max_mcu_x_size = 16;
+				m_max_mcu_y_size = 8;
+			}
+			else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2))
+			{
+				m_scan_type = JPGD_YH1V2;
+				m_max_blocks_per_mcu = 4;
+				m_max_mcu_x_size = 8;
+				m_max_mcu_y_size = 16;
+			}
+			else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2))
+			{
+				m_scan_type = JPGD_YH2V2;
+				m_max_blocks_per_mcu = 6;
+				m_max_mcu_x_size = 16;
+				m_max_mcu_y_size = 16;
+			}
+			else
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+		}
+		else
+			stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+		// Number of MCUs needed to cover the image in each direction (rounded up).
+		m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
+		m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
+
+		// These values are for the *destination* pixels: after conversion.
+		if (m_scan_type == JPGD_GRAYSCALE)
+			m_dest_bytes_per_pixel = 1;
+		else
+			m_dest_bytes_per_pixel = 4;
+		// Buffer stride uses the width rounded up to a multiple of 16 pixels; the "real" length reported to callers uses the exact width.
+		m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;
+
+		m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
+
+		// Initialize two scan line buffers.
+		m_pScan_line_0 = (uint8*)alloc_aligned(m_dest_bytes_per_scan_line, true);
+		if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2))
+			m_pScan_line_1 = (uint8*)alloc_aligned(m_dest_bytes_per_scan_line, true);
+
+		m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
+
+		// Should never happen
+		if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		// Allocate the coefficient buffer, enough for one MCU
+		m_pMCU_coefficients = (jpgd_block_coeff_t *)alloc_aligned(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_coeff_t));
+				
+		for (i = 0; i < m_max_blocks_per_mcu; i++)
+			m_mcu_block_max_zag[i] = 64;
+		// Two sample buffers of one MCU row each (64 samples per block); decode_next_mcu_row() swaps them when chroma is filtered vertically.
+		m_pSample_buf = (uint8*)alloc_aligned(m_max_blocks_per_row * 64);
+		m_pSample_buf_prev = (uint8*)alloc_aligned(m_max_blocks_per_row * 64);
+
+		m_total_lines_left = m_image_y_size;
+
+		m_mcu_lines_left = 0;
+
+		create_look_ups();
+	}
+
+	// The coeff_buf series of methods originally stored the coefficients
+	// into a "virtual" file which was located in EMS, XMS, or a disk file. A cache
+	// was used to make this process more efficient. Now, we can store the entire
+	// thing in RAM. coeff_buf_open() allocates a grid of block_num_x * block_num_y blocks, each holding block_len_x * block_len_y coefficients.
+	jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
+	{
+		coeff_buf* cb = (coeff_buf*)alloc(sizeof(coeff_buf));
+
+		cb->block_num_x = block_num_x;
+		cb->block_num_y = block_num_y;
+		cb->block_len_x = block_len_x;
+		cb->block_len_y = block_len_y;
+		cb->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_coeff_t); // size of one block in bytes
+		cb->pData = (uint8*)alloc(cb->block_size * block_num_x * block_num_y, true); // NOTE(review): the second argument presumably requests zeroed memory - confirm against alloc()
+		return cb;
+	}
+	// Returns a pointer to the coefficients of block (block_x, block_y) in cb; calls stop_decoding(JPGD_DECODE_ERROR) if either index is at or beyond the grid size.
+	inline jpgd_block_coeff_t* jpeg_decoder::coeff_buf_getp(coeff_buf* cb, int block_x, int block_y)
+	{
+		if ((block_x >= cb->block_num_x) || (block_y >= cb->block_num_y))
+			stop_decoding(JPGD_DECODE_ERROR);
+		// Blocks are stored row by row: block_num_x blocks of block_size bytes per row.
+		return (jpgd_block_coeff_t*)(cb->pData + block_x * cb->block_size + block_y * (cb->block_size * cb->block_num_x));
+	}
+
+	// The following methods decode the various types of m_blocks encountered
+	// in progressively encoded images. decode_block_dc_first() handles the first DC scan: decodes a DC difference, adds it to the component's running DC value and stores the result shifted left by m_successive_low.
+	void jpeg_decoder::decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		int s, r;
+		jpgd_block_coeff_t* p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+		// The decoded symbol s is the number of extra bits to read for the difference; 0 means a zero difference.
+		if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0)
+		{
+			if (s >= 16)
+				pD->stop_decoding(JPGD_DECODE_ERROR);
+
+			r = pD->get_bits_no_markers(s);
+			s = JPGD_HUFF_EXTEND(r, s);
+		}
+		// Accumulate the difference onto the previous DC value of this component.
+		pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
+
+		p[0] = static_cast<jpgd_block_coeff_t>(s << pD->m_successive_low);
+	}
+	// DC refinement scan: reads one bit per block and, if it is set, ORs bit m_successive_low into the stored DC coefficient.
+	void jpeg_decoder::decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		if (pD->get_bits_no_markers(1))
+		{
+			jpgd_block_coeff_t* p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+
+			p[0] |= (1 << pD->m_successive_low);
+		}
+	}
+	// First AC scan for one block: decodes (run, size) symbols for coefficients m_spectral_start..m_spectral_end and stores each value shifted left by m_successive_low at its g_ZAG position.
+	void jpeg_decoder::decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		int k, s, r;
+		// Inside an end-of-band run this block has no coefficients in the band: just count it off.
+		if (pD->m_eob_run)
+		{
+			pD->m_eob_run--;
+			return;
+		}
+
+		jpgd_block_coeff_t* p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+		for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++)
+		{
+			unsigned int idx = pD->m_comp_ac_tab[component_id];
+			if (idx >= JPGD_MAX_HUFF_TABLES)
+				pD->stop_decoding(JPGD_DECODE_ERROR);
+
+			s = pD->huff_decode(pD->m_pHuff_tabs[idx]);
+			// High nibble r = run length, low nibble s = number of value bits.
+			r = s >> 4;
+			s &= 15;
+
+			if (s)
+			{
+				if ((k += r) > 63) // skip r coefficients, then store a value
+					pD->stop_decoding(JPGD_DECODE_ERROR);
+
+				r = pD->get_bits_no_markers(s);
+				s = JPGD_HUFF_EXTEND(r, s);
+
+				p[g_ZAG[k]] = static_cast<jpgd_block_coeff_t>(s << pD->m_successive_low);
+			}
+			else
+			{
+				if (r == 15)
+				{
+					if ((k += 15) > 63) // with the loop's k++ this skips 16 coefficients
+						pD->stop_decoding(JPGD_DECODE_ERROR);
+				}
+				else
+				{
+					pD->m_eob_run = 1 << r;
+					// Run length is 2^r plus r extra bits; the run includes the current block, hence the decrement below.
+					if (r)
+						pD->m_eob_run += pD->get_bits_no_markers(r);
+
+					pD->m_eob_run--;
+
+					break;
+				}
+			}
+		}
+	}
+	// AC refinement scan for one block: may add one more bit (bit m_successive_low) to already-nonzero coefficients in the band and inserts newly nonzero coefficients with value +p1 or m1.
+	void jpeg_decoder::decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		int s, k, r;
+		// p1 is the positive correction (the bit being refined); m1 is built from an unsigned shift to avoid left-shifting -1.
+		int p1 = 1 << pD->m_successive_low;
+
+		//int m1 = (-1) << pD->m_successive_low;
+		int m1 = static_cast<int>((UINT32_MAX << pD->m_successive_low));
+
+		jpgd_block_coeff_t* p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+		if (pD->m_spectral_end > 63)
+			pD->stop_decoding(JPGD_DECODE_ERROR);
+
+		k = pD->m_spectral_start;
+		// Not inside an end-of-band run: decode symbols for this block.
+		if (pD->m_eob_run == 0)
+		{
+			for (; k <= pD->m_spectral_end; k++)
+			{
+				unsigned int idx = pD->m_comp_ac_tab[component_id];
+				if (idx >= JPGD_MAX_HUFF_TABLES)
+					pD->stop_decoding(JPGD_DECODE_ERROR);
+
+				s = pD->huff_decode(pD->m_pHuff_tabs[idx]);
+				// High nibble r = number of zero-valued coefficients to skip, low nibble s = size (only 0 or 1 is accepted here).
+				r = s >> 4;
+				s &= 15;
+
+				if (s)
+				{
+					if (s != 1)
+						pD->stop_decoding(JPGD_DECODE_ERROR);
+					// One bit selects the value of the newly nonzero coefficient: p1 or m1.
+					if (pD->get_bits_no_markers(1))
+						s = p1;
+					else
+						s = m1;
+				}
+				else
+				{
+					if (r != 15)
+					{
+						pD->m_eob_run = 1 << r;
+
+						if (r)
+							pD->m_eob_run += pD->get_bits_no_markers(r);
+
+						break;
+					}
+				}
+				// Advance over coefficients: already-nonzero ones each consume a correction bit; zero ones count down the run r, stopping at the zero that follows r skipped zeros.
+				do
+				{
+					jpgd_block_coeff_t* this_coef = p + g_ZAG[k & 63];
+
+					if (*this_coef != 0)
+					{
+						if (pD->get_bits_no_markers(1))
+						{
+							if ((*this_coef & p1) == 0) // apply the correction only if this bit is not already set
+							{
+								if (*this_coef >= 0)
+									*this_coef = static_cast<jpgd_block_coeff_t>(*this_coef + p1);
+								else
+									*this_coef = static_cast<jpgd_block_coeff_t>(*this_coef + m1);
+							}
+						}
+					}
+					else
+					{
+						if (--r < 0)
+							break;
+					}
+
+					k++;
+
+				} while (k <= pD->m_spectral_end);
+				// Store the new coefficient at the position where the walk stopped.
+				if ((s) && (k < 64))
+				{
+					p[g_ZAG[k]] = static_cast<jpgd_block_coeff_t>(s);
+				}
+			}
+		}
+		// Inside an end-of-band run (possibly started just above): the remaining nonzero coefficients of the band still receive correction bits.
+		if (pD->m_eob_run > 0)
+		{
+			for (; k <= pD->m_spectral_end; k++)
+			{
+				jpgd_block_coeff_t* this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
+
+				if (*this_coef != 0)
+				{
+					if (pD->get_bits_no_markers(1))
+					{
+						if ((*this_coef & p1) == 0)
+						{
+							if (*this_coef >= 0)
+								*this_coef = static_cast<jpgd_block_coeff_t>(*this_coef + p1);
+							else
+								*this_coef = static_cast<jpgd_block_coeff_t>(*this_coef + m1);
+						}
+					}
+				}
+			}
+
+			pD->m_eob_run--;
+		}
+	}
+
+	// Runs one progressive scan: every block of every MCU is handed to decode_block_func.
+	void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
+	{
+		// Per-component block cursors: X restarts on each line of MCUs, Y keeps accumulating.
+		int blk_x[JPGD_MAX_COMPONENTS], blk_y[JPGD_MAX_COMPONENTS];
+
+		memset(blk_y, 0, sizeof(blk_y));
+
+		for (int mcu_line = 0; mcu_line < m_mcus_per_col; mcu_line++)
+		{
+			// Every component starts the new line of MCUs at block column 0.
+
+			memset(blk_x, 0, sizeof(blk_x));
+
+			for (int mcu_idx = 0; mcu_idx < m_mcus_per_row; mcu_idx++)
+			{
+				int ofs_x = 0, ofs_y = 0;
+
+				if ((m_restart_interval != 0) && (m_restarts_left == 0))
+					process_restart();
+
+				for (int blk = 0; blk < m_blocks_per_mcu; blk++)
+				{
+					const int comp = m_mcu_org[blk];
+
+					decode_block_func(this, comp, blk_x[comp] + ofs_x, blk_y[comp] + ofs_y);
+
+					if (m_comps_in_scan == 1)
+						blk_x[comp]++;
+					else
+					{
+						if (++ofs_x == m_comp_h_samp[comp])
+						{
+							ofs_x = 0;
+
+							if (++ofs_y == m_comp_v_samp[comp])
+							{
+								ofs_y = 0;
+								blk_x[comp] += m_comp_h_samp[comp];
+							}
+						}
+					}
+				}
+
+				m_restarts_left--;
+			}
+
+			if (m_comps_in_scan == 1)
+				blk_y[m_comp_list[0]]++;
+			else
+			{
+				for (int n = 0; n < m_comps_in_scan; n++)
+				{
+					const int comp = m_comp_list[n];
+					blk_y[comp] += m_comp_v_samp[comp];
+				}
+			}
+		}
+	}
+
+	// Opens the coefficient buffers, then decodes each scan of a progressive image in turn.
+	void jpeg_decoder::init_progressive()
+	{
+		// Four-component progressive frames are rejected up front.
+
+		if (m_comps_in_frame == 4)
+			stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+		// Open one DC and one AC coefficient buffer for every frame component.
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+		{
+			m_dc_coeffs[comp] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[comp], m_max_mcus_per_col * m_comp_v_samp[comp], 1, 1);
+			m_ac_coeffs[comp] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[comp], m_max_mcus_per_col * m_comp_v_samp[comp], 8, 8);
+		}
+
+		// See https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf
+		const uint32_t MAX_SCANS_TO_PROCESS = 1000;
+		uint32_t scans_done = 0;
+
+		for (;;)
+		{
+			// A false return from init_scan() ends the loop.
+			pDecode_block_func block_fn;
+
+			if (!init_scan())
+				break;
+
+			const bool dc_scan = (m_spectral_start == 0);
+			const bool refining = (m_successive_high != 0);
+
+			if ((m_spectral_end > 63) || (m_spectral_start > m_spectral_end))
+				stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+			if (!dc_scan)
+			{
+				if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
+					stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+			}
+			else if (m_spectral_end != 0)
+				stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+			if (refining && (m_successive_low != m_successive_high - 1))
+				stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
+
+			if (!dc_scan)
+			{
+				if (!refining)
+					block_fn = decode_block_ac_first;
+				else
+					block_fn = decode_block_ac_refine;
+			}
+			else
+			{
+				if (!refining)
+					block_fn = decode_block_dc_first;
+				else
+					block_fn = decode_block_dc_refine;
+			}
+
+			decode_scan(block_fn);
+			// NOTE(review): presumably re-primes the bit buffer before the next scan is set up - confirm.
+			m_bits_left = 16;
+			get_bits(16);
+			get_bits(16);
+
+			// Give up once more than MAX_SCANS_TO_PROCESS scans have been decoded.
+			if (++scans_done > MAX_SCANS_TO_PROCESS)
+				stop_decoding(JPGD_TOO_MANY_SCANS);
+		}
+
+		m_comps_in_scan = m_comps_in_frame;
+
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+			m_comp_list[comp] = comp;
+
+		if (!calc_mcu_block_order())
+			stop_decoding(JPGD_DECODE_ERROR);
+	}
+
+	void jpeg_decoder::init_sequential()
+	{
+		if (init_scan()) return; // the scan was set up; nothing else to do here
+		stop_decoding(JPGD_UNEXPECTED_MARKER); // init_scan() reported failure
+	}
+
+	void jpeg_decoder::decode_start()
+	{
+		// Frame setup comes first, then the scan setup that matches the coding mode.
+		init_frame();
+		if (!m_progressive_flag)
+			init_sequential();
+		else
+			init_progressive();
+	}
+
+	void jpeg_decoder::decode_init(jpeg_decoder_stream* pStream, uint32_t flags)
+	{
+		init(pStream, flags); // forwards the stream and the caller's flags to init()
+		locate_sof_marker(); // runs after init(); NOTE(review): presumably scans the stream for its SOF marker - confirm
+	}
+
+	jpeg_decoder::jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags)
+	{
+		if (setjmp(m_jmp_state)) // non-zero: control came back through a longjmp on m_jmp_state
+			return; // construction still completes; callers check get_error_code() afterwards
+		decode_init(pStream, flags);
+	}
+
+	int jpeg_decoder::begin_decoding()
+	{
+		// Decoding was already started successfully: nothing more to do.
+		if (m_ready_flag)
+			return JPGD_SUCCESS;
+
+		// An error code recorded earlier makes this call fail immediately.
+		if (m_error_code != JPGD_SUCCESS)
+			return JPGD_FAILED;
+
+		// A non-zero setjmp() result means decode_start() was abandoned through a longjmp.
+		if (setjmp(m_jmp_state) != 0)
+			return JPGD_FAILED;
+		decode_start();
+		m_ready_flag = true;
+		return JPGD_SUCCESS;
+	}
+
+	jpeg_decoder::~jpeg_decoder()
+	{
+		free_all_blocks(); // the destructor's only action; presumably releases the m_pMem_blocks list - confirm
+	}
+
+	jpeg_decoder_file_stream::jpeg_decoder_file_stream()
+		: m_pFile(nullptr), // no file is attached until open() is called
+		m_eof_flag(false),
+		m_error_flag(false)
+	{
+	}
+
+	void jpeg_decoder_file_stream::close()
+	{
+		// Release the FILE handle, if one is attached.
+		if (m_pFile != nullptr)
+		{
+			fclose(m_pFile);
+			m_pFile = nullptr;
+		}
+		// Clear both status flags.
+		m_eof_flag = m_error_flag = false;
+	}
+
+	jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
+	{
+		close(); // closes the file if one is still open
+	}
+
+	bool jpeg_decoder_file_stream::open(const char* Pfilename)
+	{
+		close(); // drop any file that is currently attached
+		// close() has just cleared both flags; they are cleared again here.
+		m_eof_flag = false;
+		m_error_flag = false;
 
 #if defined(_MSC_VER)
-  m_pFile = NULL;
-  fopen_s(&m_pFile, Pfilename, "rb");
+		m_pFile = nullptr;
+		fopen_s(&m_pFile, Pfilename, "rb"); // return value is ignored; success is judged from m_pFile below
 #else
-  m_pFile = fopen(Pfilename, "rb");
+		m_pFile = fopen(Pfilename, "rb"); // binary read mode
 #endif
-  return m_pFile != NULL;
-}
+		return m_pFile != nullptr; // true when the file could be opened
+	}
 
-int jpeg_decoder_file_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
-{
-  if (!m_pFile)
-    return -1;
+	int jpeg_decoder_file_stream::read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag)
+	{
+		if (!m_pFile) // no file is attached
+			return -1;
 
-  if (m_eof_flag)
-  {
-    *pEOF_flag = true;
-    return 0;
-  }
+		if (m_eof_flag) // an earlier call already reached the end of the file
+		{
+			*pEOF_flag = true;
+			return 0;
+		}
 
-  if (m_error_flag)
-    return -1;
+		if (m_error_flag) // an earlier read error keeps failing every later call
+			return -1;
 
-  int bytes_read = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
-  if (bytes_read < max_bytes_to_read)
-  {
-    if (ferror(m_pFile))
-    {
-      m_error_flag = true;
-      return -1;
-    }
+		int bytes_read = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
+		if (bytes_read < max_bytes_to_read) // short read: either an I/O error or the end of the file
+		{
+			if (ferror(m_pFile))
+			{
+				m_error_flag = true;
+				return -1;
+			}
 
-    m_eof_flag = true;
-    *pEOF_flag = true;
-  }
+			m_eof_flag = true; // remembered so that later calls return 0 right away
+			*pEOF_flag = true;
+		}
 
-  return bytes_read;
-}
+		return bytes_read; // *pEOF_flag is left untouched when the whole request was satisfied
+	}
 
-bool jpeg_decoder_mem_stream::open(const uint8 *pSrc_data, uint size)
-{
-  close();
-  m_pSrc_data = pSrc_data;
-  m_ofs = 0;
-  m_size = size;
-  return true;
-}
+	bool jpeg_decoder_mem_stream::open(const uint8* pSrc_data, uint size)
+	{
+		close(); // detach whatever buffer was attached before
+		m_size = size;
+		m_ofs = 0;
+		m_pSrc_data = pSrc_data;
+		return true; // this call cannot fail
+	}
 
-int jpeg_decoder_mem_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
-{
-  *pEOF_flag = false;
+	int jpeg_decoder_mem_stream::read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag)
+	{
+		*pEOF_flag = false; // cleared up front; set below only when the request has to be clamped
 
-  if (!m_pSrc_data)
-    return -1;
+		if (!m_pSrc_data) // no buffer is attached
+			return -1;
 
-  uint bytes_remaining = m_size - m_ofs;
-  if ((uint)max_bytes_to_read > bytes_remaining)
-  {
-    max_bytes_to_read = bytes_remaining;
-    *pEOF_flag = true;
-  }
+		uint bytes_remaining = m_size - m_ofs;
+		if ((uint)max_bytes_to_read > bytes_remaining) // more was requested than is left
+		{
+			max_bytes_to_read = bytes_remaining; // clamp to what is available
+			*pEOF_flag = true;
+		}
 
-  memcpy(pBuf, m_pSrc_data + m_ofs, max_bytes_to_read);
-  m_ofs += max_bytes_to_read;
+		memcpy(pBuf, m_pSrc_data + m_ofs, max_bytes_to_read);
+		m_ofs += max_bytes_to_read; // advance the read position
 
-  return max_bytes_to_read;
-}
+		return max_bytes_to_read; // number of bytes copied into pBuf
+	}
 
-unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps)
-{
-  if (!actual_comps)
-    return NULL;
-  *actual_comps = 0;
+	unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags)
+	{ // Decodes the whole image into one jpgd_malloc'd buffer of width * height * req_comps bytes; returns nullptr on any failure.
+		if (!actual_comps)
+			return nullptr;
+		*actual_comps = 0; // stays 0 if the function fails before the header has been read
 
-  if ((!pStream) || (!width) || (!height) || (!req_comps))
-    return NULL;
+		if ((!pStream) || (!width) || (!height) || (!req_comps))
+			return nullptr;
 
-  if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4))
-    return NULL;
+		if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4)) // only gray, RGB and RGBA output
+			return nullptr;
 
-  jpeg_decoder decoder(pStream);
-  if (decoder.get_error_code() != JPGD_SUCCESS)
-    return NULL;
+		jpeg_decoder decoder(pStream, flags);
+		if (decoder.get_error_code() != JPGD_SUCCESS)
+			return nullptr;
 
-  const int image_width = decoder.get_width(), image_height = decoder.get_height();
-  *width = image_width;
-  *height = image_height;
-  *actual_comps = decoder.get_num_components();
+		const int image_width = decoder.get_width(), image_height = decoder.get_height();
+		*width = image_width; // the outputs are filled in before decoding starts
+		*height = image_height;
+		*actual_comps = decoder.get_num_components();
 
-  if (decoder.begin_decoding() != JPGD_SUCCESS)
-    return NULL;
+		if (decoder.begin_decoding() != JPGD_SUCCESS)
+			return nullptr;
 
-  const int dst_bpl = image_width * req_comps;
+		const int dst_bpl = image_width * req_comps; // bytes per output row
 
-  uint8 *pImage_data = (uint8*)jpgd_malloc(dst_bpl * image_height);
-  if (!pImage_data)
-    return NULL;
+		const uint64_t total_bytes = static_cast<uint64_t>(dst_bpl) * static_cast<uint64_t>(image_height); // 64-bit: 32768 * 4 * 32768 does not fit in int
+		uint8* pImage_data = (total_bytes == static_cast<size_t>(total_bytes)) ? (uint8*)jpgd_malloc(static_cast<size_t>(total_bytes)) : nullptr; // refuse sizes size_t cannot hold
+		if (!pImage_data) return nullptr;
 
-  for (int y = 0; y < image_height; y++)
-  {
-    const uint8* pScan_line;
-    uint scan_line_len;
-    if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS)
-    {
-      jpgd_free(pImage_data);
-      return NULL;
-    }
+		for (int y = 0; y < image_height; y++)
+		{
+			const uint8* pScan_line;
+			uint scan_line_len;
+			if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS)
+			{
+				jpgd_free(pImage_data); // do not leak the partly filled image
+				return nullptr;
+			}
 
-    uint8 *pDst = pImage_data + y * dst_bpl;
+			uint8* pDst = pImage_data + static_cast<size_t>(y) * static_cast<size_t>(dst_bpl); // size_t offset: y * dst_bpl can exceed INT_MAX
 
-    if (((req_comps == 1) && (decoder.get_num_components() == 1)) || ((req_comps == 4) && (decoder.get_num_components() == 3)))
-      memcpy(pDst, pScan_line, dst_bpl);
-    else if (decoder.get_num_components() == 1)
-    {
-      if (req_comps == 3)
-      {
-        for (int x = 0; x < image_width; x++)
-        {
-          uint8 luma = pScan_line[x];
-          pDst[0] = luma;
-          pDst[1] = luma;
-          pDst[2] = luma;
-          pDst += 3;
-        }
-      }
-      else
-      {
-        for (int x = 0; x < image_width; x++)
-        {
-          uint8 luma = pScan_line[x];
-          pDst[0] = luma;
-          pDst[1] = luma;
-          pDst[2] = luma;
-          pDst[3] = 255;
-          pDst += 4;
-        }
-      }
-    }
-    else if (decoder.get_num_components() == 3)
-    {
-      if (req_comps == 1)
-      {
-        const int YR = 19595, YG = 38470, YB = 7471;
-        for (int x = 0; x < image_width; x++)
-        {
-          int r = pScan_line[x*4+0];
-          int g = pScan_line[x*4+1];
-          int b = pScan_line[x*4+2];
-          *pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16);
-        }
-      }
-      else
-      {
-        for (int x = 0; x < image_width; x++)
-        {
-          pDst[0] = pScan_line[x*4+0];
-          pDst[1] = pScan_line[x*4+1];
-          pDst[2] = pScan_line[x*4+2];
-          pDst += 3;
-        }
-      }
-    }
-  }
+			if (((req_comps == 1) && (decoder.get_num_components() == 1)) || ((req_comps == 4) && (decoder.get_num_components() == 3)))
+				memcpy(pDst, pScan_line, dst_bpl); // decoder output already has the requested layout
+			else if (decoder.get_num_components() == 1)
+			{
+				if (req_comps == 3)
+				{
+					for (int x = 0; x < image_width; x++) // gray -> RGB: replicate the luma value
+					{
+						uint8 luma = pScan_line[x];
+						pDst[0] = luma;
+						pDst[1] = luma;
+						pDst[2] = luma;
+						pDst += 3;
+					}
+				}
+				else
+				{
+					for (int x = 0; x < image_width; x++) // gray -> RGBA: replicate the luma value, alpha = 255
+					{
+						uint8 luma = pScan_line[x];
+						pDst[0] = luma;
+						pDst[1] = luma;
+						pDst[2] = luma;
+						pDst[3] = 255;
+						pDst += 4;
+					}
+				}
+			}
+			else if (decoder.get_num_components() == 3)
+			{
+				if (req_comps == 1)
+				{
+					const int YR = 19595, YG = 38470, YB = 7471; // 16.16 fixed-point weights; they sum to 65536
+					for (int x = 0; x < image_width; x++)
+					{
+						int r = pScan_line[x * 4 + 0];
+						int g = pScan_line[x * 4 + 1];
+						int b = pScan_line[x * 4 + 2];
+						*pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16);
+					}
+				}
+				else
+				{
+					for (int x = 0; x < image_width; x++) // 4-byte source pixels -> RGB: drop the fourth byte
+					{
+						pDst[0] = pScan_line[x * 4 + 0];
+						pDst[1] = pScan_line[x * 4 + 1];
+						pDst[2] = pScan_line[x * 4 + 2];
+						pDst += 3;
+					}
+				}
+			}
+		}
 
-  return pImage_data;
-}
+		return pImage_data;
+	}
 
-unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps)
-{
-  jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size);
-  return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps);
-}
+	unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags)
+	{
+		jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size); // the stream only keeps the pointer and size; the int size is converted to uint
+		return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps, flags); // all remaining work is delegated
+	}
 
-unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps)
-{
-  jpgd::jpeg_decoder_file_stream file_stream;
-  if (!file_stream.open(pSrc_filename))
-    return NULL;
-  return decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps);
-}
+	unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags)
+	{
+		jpgd::jpeg_decoder_file_stream file_stream; // its destructor closes the file on every return path
+		return file_stream.open(pSrc_filename)
+			? decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps, flags)
+			: nullptr;
+	}
 
-} // namespace jpgd
\ No newline at end of file
+} // namespace jpgd
diff --git a/thirdparty/jpeg-compressor/jpgd.h b/thirdparty/jpeg-compressor/jpgd.h
index 150b9a0b26..39136696ba 100644
--- a/thirdparty/jpeg-compressor/jpgd.h
+++ b/thirdparty/jpeg-compressor/jpgd.h
@@ -1,319 +1,351 @@
 // jpgd.h - C++ class for JPEG decompression.
-// Public domain, Rich Geldreich <richgel99@gmail.com>
+// Richard Geldreich <richgel99@gmail.com>
+// See jpgd.cpp for license (Public Domain or Apache 2.0).
 #ifndef JPEG_DECODER_H
 #define JPEG_DECODER_H
 
 #include <stdlib.h>
 #include <stdio.h>
 #include <setjmp.h>
+#include <assert.h>
+#include <stdint.h>
 
 #ifdef _MSC_VER
-  #define JPGD_NORETURN __declspec(noreturn) 
+#define JPGD_NORETURN __declspec(noreturn) 
 #elif defined(__GNUC__)
-  #define JPGD_NORETURN __attribute__ ((noreturn))
+#define JPGD_NORETURN __attribute__ ((noreturn))
 #else
-  #define JPGD_NORETURN
+#define JPGD_NORETURN
 #endif
 
+#define JPGD_HUFF_TREE_MAX_LENGTH 512 // element count of huff_tables::tree
+#define JPGD_HUFF_CODE_SIZE_MAX_LENGTH 256 // element count of huff_tables::code_size
+
 namespace jpgd
 {
-  typedef unsigned char  uint8;
-  typedef   signed short int16;
-  typedef unsigned short uint16;
-  typedef unsigned int   uint;
-  typedef   signed int   int32;
+	typedef unsigned char  uint8;
+	typedef   signed short int16;
+	typedef unsigned short uint16;
+	typedef unsigned int   uint;
+	typedef   signed int   int32;
 
-  // Loads a JPEG image from a memory buffer or a file.
-  // req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
-  // On return, width/height will be set to the image's dimensions, and actual_comps will be set to the either 1 (grayscale) or 3 (RGB).
-  // Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
-  // Requesting a 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
-  unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps);
-  unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps);
+	// Loads a JPEG image from a memory buffer or a file.
+	// req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
+	// On return, width/height will be set to the image's dimensions, and actual_comps will be set to either 1 (grayscale) or 3 (RGB).
+	// Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
+	// Requesting an 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
+	unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+	unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
 
-  // Success/failure error codes.
-  enum jpgd_status
-  {
-    JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
-    JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE, 
-    JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS, 
-    JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
-    JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
-    JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
-    JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
-    JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, JPGD_ASSERTION_ERROR,
-    JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM
-  };
-    
-  // Input stream interface.
-  // Derive from this class to read input data from sources other than files or memory. Set m_eof_flag to true when no more data is available.
-  // The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
-  // It the input stream contains data after the JPEG stream's EOI (end of image) marker it will probably be pulled into the internal buffer.
-  // Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
-  class jpeg_decoder_stream
-  {
-  public:
-    jpeg_decoder_stream() { }
-    virtual ~jpeg_decoder_stream() { }
+	// Success/failure error codes. The specific error codes count up from JPGD_BAD_DHT_COUNTS (-256) in declaration order.
+	enum jpgd_status
+	{
+		JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
+		JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
+		JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
+		JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
+		JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
+		JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
+		JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
+		JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER,
+		JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS
+	};
 
-    // The read() method is called when the internal input buffer is empty.
-    // Parameters:
-    // pBuf - input buffer
-    // max_bytes_to_read - maximum bytes that can be written to pBuf
-    // pEOF_flag - set this to true if at end of stream (no more bytes remaining)
-    // Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
-    // Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
-    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag) = 0;
-  };
+	// Input stream interface.
+	// Derive from this class to read input data from sources other than files or memory. Set *pEOF_flag to true in read() when no more data is available.
+	// The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
+	// If the input stream contains data after the JPEG stream's EOI (end of image) marker, it will probably be pulled into the internal buffer.
+	// Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
+	class jpeg_decoder_stream
+	{
+	public:
+		jpeg_decoder_stream() { }
+		virtual ~jpeg_decoder_stream() { }
 
-  // stdio FILE stream class.
-  class jpeg_decoder_file_stream : public jpeg_decoder_stream
-  {
-    jpeg_decoder_file_stream(const jpeg_decoder_file_stream &);
-    jpeg_decoder_file_stream &operator =(const jpeg_decoder_file_stream &);
+		// The read() method is called when the internal input buffer is empty.
+		// Parameters:
+		// pBuf - input buffer
+		// max_bytes_to_read - maximum bytes that can be written to pBuf
+		// pEOF_flag - set this to true if at end of stream (no more bytes remaining)
+		// Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
+		// Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag) = 0;
+	};
 
-    FILE *m_pFile;
-    bool m_eof_flag, m_error_flag;
+	// stdio FILE stream class.
+	class jpeg_decoder_file_stream : public jpeg_decoder_stream
+	{
+		jpeg_decoder_file_stream(const jpeg_decoder_file_stream&);
+		jpeg_decoder_file_stream& operator =(const jpeg_decoder_file_stream&);
 
-  public:
-    jpeg_decoder_file_stream();
-    virtual ~jpeg_decoder_file_stream();
-    
-    bool open(const char *Pfilename);
-    void close();
+		FILE* m_pFile;
+		bool m_eof_flag, m_error_flag;
 
-    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag);
-  };
+	public:
+		jpeg_decoder_file_stream();
+		virtual ~jpeg_decoder_file_stream();
 
-  // Memory stream class.
-  class jpeg_decoder_mem_stream : public jpeg_decoder_stream
-  {
-    const uint8 *m_pSrc_data;
-    uint m_ofs, m_size;
+		bool open(const char* Pfilename);
+		void close();
 
-  public:
-    jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
-    jpeg_decoder_mem_stream(const uint8 *pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
+	};
 
-    virtual ~jpeg_decoder_mem_stream() { }
+	// Memory stream class.
+	class jpeg_decoder_mem_stream : public jpeg_decoder_stream
+	{
+		const uint8* m_pSrc_data;
+		uint m_ofs, m_size;
 
-    bool open(const uint8 *pSrc_data, uint size);
-    void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
-    
-    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag);
-  };
+	public:
+		jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
+		jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
 
-  // Loads JPEG file from a jpeg_decoder_stream.
-  unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps);
+		virtual ~jpeg_decoder_mem_stream() { }
 
-  enum 
-  { 
-    JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4, 
-    JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 8192, JPGD_MAX_HEIGHT = 16384, JPGD_MAX_WIDTH = 16384 
-  };
-          
-  typedef int16 jpgd_quant_t;
-  typedef int16 jpgd_block_t;
+		bool open(const uint8* pSrc_data, uint size);
+		void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
 
-  class jpeg_decoder
-  {
-  public:
-    // Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
-    // methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
-    jpeg_decoder(jpeg_decoder_stream *pStream);
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
+	};
 
-    ~jpeg_decoder();
+	// Loads JPEG file from a jpeg_decoder_stream.
+	unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
 
-    // Call this method after constructing the object to begin decompression.
-    // If JPGD_SUCCESS is returned you may then call decode() on each scanline.
-    int begin_decoding();
+	enum // compile-time sizes and limits used by the decoder
+	{
+		JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
+		JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768
+	};
 
-    // Returns the next scan line.
-    // For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1). 
-    // Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
-    // Returns JPGD_SUCCESS if a scan line has been returned.
-    // Returns JPGD_DONE if all scan lines have been returned.
-    // Returns JPGD_FAILED if an error occurred. Call get_error_code() for a more info.
-    int decode(const void** pScan_line, uint* pScan_line_len);
-    
-    inline jpgd_status get_error_code() const { return m_error_code; }
+	typedef int16 jpgd_quant_t;
+	typedef int16 jpgd_block_coeff_t;
 
-    inline int get_width() const { return m_image_x_size; }
-    inline int get_height() const { return m_image_y_size; }
+	class jpeg_decoder
+	{
+	public:
+		enum
+		{
+			cFlagBoxChromaFiltering = 1,
+			cFlagDisableSIMD = 2
+		};
 
-    inline int get_num_components() const { return m_comps_in_frame; }
+		// Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
+		// methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
+		jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags = 0);
 
-    inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; }
-    inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); }
+		~jpeg_decoder();
 
-    // Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
-    inline int get_total_bytes_read() const { return m_total_bytes_read; }
-    
-  private:
-    jpeg_decoder(const jpeg_decoder &);
-    jpeg_decoder &operator =(const jpeg_decoder &);
+		// Call this method after constructing the object to begin decompression.
+		// If JPGD_SUCCESS is returned you may then call decode() on each scanline.
 
-    typedef void (*pDecode_block_func)(jpeg_decoder *, int, int, int);
+		int begin_decoding();
 
-    struct huff_tables
-    {
-      bool ac_table;
-      uint  look_up[256];
-      uint  look_up2[256];
-      uint8 code_size[256];
-      uint  tree[512];
-    };
+		// Returns the next scan line.
+		// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1). 
+		// Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
+		// Returns JPGD_SUCCESS if a scan line has been returned.
+		// Returns JPGD_DONE if all scan lines have been returned.
+		// Returns JPGD_FAILED if an error occurred. Call get_error_code() for more info.
+		int decode(const void** pScan_line, uint* pScan_line_len);
 
-    struct coeff_buf
-    {
-      uint8 *pData;
-      int block_num_x, block_num_y;
-      int block_len_x, block_len_y;
-      int block_size;
-    };
+		inline jpgd_status get_error_code() const { return m_error_code; }
 
-    struct mem_block
-    {
-      mem_block *m_pNext;
-      size_t m_used_count;
-      size_t m_size;
-      char m_data[1];
-    };
+		inline int get_width() const { return m_image_x_size; }
+		inline int get_height() const { return m_image_y_size; }
 
-    jmp_buf m_jmp_state;
-    mem_block *m_pMem_blocks;
-    int m_image_x_size;
-    int m_image_y_size;
-    jpeg_decoder_stream *m_pStream;
-    int m_progressive_flag;
-    uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
-    uint8* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
-    uint8* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
-    jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
-    int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
-    int m_comps_in_frame;                         // # of components in frame
-    int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
-    int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
-    int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
-    int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
-    int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
-    int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
-    int m_comps_in_scan;                          // # of components in scan
-    int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
-    int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
-    int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
-    int m_spectral_start;                         // spectral selection start
-    int m_spectral_end;                           // spectral selection end
-    int m_successive_low;                         // successive approximation low
-    int m_successive_high;                        // successive approximation high
-    int m_max_mcu_x_size;                         // MCU's max. X size in pixels
-    int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
-    int m_blocks_per_mcu;
-    int m_max_blocks_per_row;
-    int m_mcus_per_row, m_mcus_per_col;
-    int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
-    int m_total_lines_left;                       // total # lines left in image
-    int m_mcu_lines_left;                         // total # lines left in this MCU
-    int m_real_dest_bytes_per_scan_line;
-    int m_dest_bytes_per_scan_line;               // rounded up
-    int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
-    huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
-    coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
-    coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
-    int m_eob_run;
-    int m_block_y_mcu[JPGD_MAX_COMPONENTS];
-    uint8* m_pIn_buf_ofs;
-    int m_in_buf_left;
-    int m_tem_flag;
-    bool m_eof_flag;
-    uint8 m_in_buf_pad_start[128];
-    uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
-    uint8 m_in_buf_pad_end[128];
-    int m_bits_left;
-    uint m_bit_buf;
-    int m_restart_interval;
-    int m_restarts_left;
-    int m_next_restart_num;
-    int m_max_mcus_per_row;
-    int m_max_blocks_per_mcu;
-    int m_expanded_blocks_per_mcu;
-    int m_expanded_blocks_per_row;
-    int m_expanded_blocks_per_component;
-    bool  m_freq_domain_chroma_upsample;
-    int m_max_mcus_per_col;
-    uint m_last_dc_val[JPGD_MAX_COMPONENTS];
-    jpgd_block_t* m_pMCU_coefficients;
-    int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
-    uint8* m_pSample_buf;
-    int m_crr[256];
-    int m_cbb[256];
-    int m_crg[256];
-    int m_cbg[256];
-    uint8* m_pScan_line_0;
-    uint8* m_pScan_line_1;
-    jpgd_status m_error_code;
-    bool m_ready_flag;
-    int m_total_bytes_read;
+		int get_num_components() const { return m_comps_in_frame; } // Number of components in the frame (m_comps_in_frame).
+
+		int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; } // Destination pixel size in bytes: 4 (RGB) or 1 (Y).
+		int get_bytes_per_scan_line() const { return m_image_x_size * m_dest_bytes_per_pixel; } // Image width times bytes per pixel (not the rounded-up m_dest_bytes_per_scan_line).
+
+		// Number of bytes the decoder actually consumed; this is expected to equal the size of the JPEG file.
+		int get_total_bytes_read() const { return m_total_bytes_read; }
+
+	private:
+		jpeg_decoder(const jpeg_decoder&);
+		jpeg_decoder& operator =(const jpeg_decoder&);
+
+		typedef void (*pDecode_block_func)(jpeg_decoder*, int, int, int);
+
+		struct huff_tables // Per-table Huffman data; pointers to it are kept in m_pHuff_tabs and passed to make_huff_table() and huff_decode().
+		{
+			bool ac_table; // NOTE(review): presumably true for an AC table and false for a DC table - confirm in make_huff_table()
+			uint  look_up[256]; // 256-entry table; NOTE(review): presumably indexed by the next 8 bits of the bit stream - confirm in huff_decode()
+			uint  look_up2[256]; // second 256-entry table; exact contents are not visible here
+			uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH];
+			uint  tree[JPGD_HUFF_TREE_MAX_LENGTH];
+		};
+
+		struct coeff_buf // Coefficient buffer descriptor returned by coeff_buf_open(); one pointer per component is kept in m_dc_coeffs and m_ac_coeffs.
+		{
+			uint8* pData; // backing storage, addressed through coeff_buf_getp()
+			int block_num_x, block_num_y; // same names as the first two coeff_buf_open() parameters
+			int block_len_x, block_len_y; // same names as the last two coeff_buf_open() parameters
+			int block_size; // NOTE(review): presumably the size in bytes of one block - confirm in coeff_buf_open()
+		};
+
+		struct mem_block // Node of a chain linked through m_pNext; the decoder holds a pointer to one node in m_pMem_blocks.
+		{
+			mem_block* m_pNext; // next node in the chain
+			size_t m_used_count; // NOTE(review): presumably the number of bytes of m_data already handed out - confirm in alloc()
+			size_t m_size; // NOTE(review): presumably the capacity of m_data in bytes - confirm in alloc()
+			char m_data[1]; // NOTE(review): looks like the first byte of a payload allocated past the end of the struct - confirm in alloc()
+		};
+
+		jmp_buf m_jmp_state;
+		uint32_t m_flags;
+		mem_block* m_pMem_blocks;
+		int m_image_x_size;
+		int m_image_y_size;
+		jpeg_decoder_stream* m_pStream;
+
+		int m_progressive_flag;
+
+		uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
+		uint8* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
+		uint8* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
+		jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
+		int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
+		int m_comps_in_frame;                         // # of components in frame
+		int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
+		int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
+		int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
+		int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
+		int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
+		int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
+		int m_comps_in_scan;                          // # of components in scan
+		int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
+		int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
+		int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
+		int m_spectral_start;                         // spectral selection start
+		int m_spectral_end;                           // spectral selection end
+		int m_successive_low;                         // successive approximation low
+		int m_successive_high;                        // successive approximation high
+		int m_max_mcu_x_size;                         // MCU's max. X size in pixels
+		int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
+		int m_blocks_per_mcu;
+		int m_max_blocks_per_row;
+		int m_mcus_per_row, m_mcus_per_col;
+		int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
+		int m_total_lines_left;                       // total # lines left in image
+		int m_mcu_lines_left;                         // total # lines left in this MCU
+		int m_num_buffered_scanlines;
+		int m_real_dest_bytes_per_scan_line;
+		int m_dest_bytes_per_scan_line;               // rounded up
+		int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
+		huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
+		coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
+		coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
+		int m_eob_run;
+		int m_block_y_mcu[JPGD_MAX_COMPONENTS];
+		uint8* m_pIn_buf_ofs;
+		int m_in_buf_left;
+		int m_tem_flag;
+
+		uint8 m_in_buf_pad_start[64];
+		uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
+		uint8 m_in_buf_pad_end[64];
+
+		int m_bits_left;
+		uint m_bit_buf;
+		int m_restart_interval;
+		int m_restarts_left;
+		int m_next_restart_num;
+		int m_max_mcus_per_row;
+		int m_max_blocks_per_mcu;
+
+		int m_max_mcus_per_col;
+		uint m_last_dc_val[JPGD_MAX_COMPONENTS];
+		jpgd_block_coeff_t* m_pMCU_coefficients;
+		int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
+		uint8* m_pSample_buf;
+		uint8* m_pSample_buf_prev;
+		int m_crr[256];
+		int m_cbb[256];
+		int m_crg[256];
+		int m_cbg[256];
+		uint8* m_pScan_line_0;
+		uint8* m_pScan_line_1;
+		jpgd_status m_error_code;
+		int m_total_bytes_read;
+
+		bool m_ready_flag;
+		bool m_eof_flag;
+		bool m_sample_buf_prev_valid;
+		bool m_has_sse2;
+
+		inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; } // Asserts 0 <= ofs < m_max_blocks_per_row * 64 and returns ofs unchanged; the checks vanish when assert() is compiled out.
+		void free_all_blocks();
+		JPGD_NORETURN void stop_decoding(jpgd_status status);
+		void* alloc(size_t n, bool zero = false);
+		void* alloc_aligned(size_t nSize, uint32_t align = 16, bool zero = false);
+		void word_clear(void* p, uint16 c, uint n);
+		void prep_in_buffer();
+		void read_dht_marker();
+		void read_dqt_marker();
+		void read_sof_marker();
+		void skip_variable_marker();
+		void read_dri_marker();
+		void read_sos_marker();
+		int next_marker();
+		int process_markers();
+		void locate_soi_marker();
+		void locate_sof_marker();
+		int locate_sos_marker();
+		void init(jpeg_decoder_stream* pStream, uint32_t flags);
+		void create_look_ups();
+		void fix_in_buffer();
+		void transform_mcu(int mcu_row);
+		coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
+		inline jpgd_block_coeff_t* coeff_buf_getp(coeff_buf* cb, int block_x, int block_y);
+		void load_next_row();
+		void decode_next_row();
+		void make_huff_table(int index, huff_tables* pH);
+		void check_quant_tables();
+		void check_huff_tables();
+		bool calc_mcu_block_order();
+		int init_scan();
+		void init_frame();
+		void process_restart();
+		void decode_scan(pDecode_block_func decode_block_func);
+		void init_progressive();
+		void init_sequential();
+		void decode_start();
+		void decode_init(jpeg_decoder_stream* pStream, uint32_t flags);
+		void H2V2Convert();
+		uint32_t H2V2ConvertFiltered();
+		void H2V1Convert();
+		void H2V1ConvertFiltered();
+		void H1V2Convert();
+		void H1V2ConvertFiltered();
+		void H1V1Convert();
+		void gray_convert();
+		void find_eoi();
+		inline uint get_char();
+		inline uint get_char(bool* pPadding_flag);
+		inline void stuff_char(uint8 q);
+		inline uint8 get_octet();
+		inline uint get_bits(int num_bits);
+		inline uint get_bits_no_markers(int numbits);
+		inline int huff_decode(huff_tables* pH);
+		inline int huff_decode(huff_tables* pH, int& extrabits);
+
+		// Saturates i to the range 0..255.
+		static inline uint8 clamp(int i)
+		{
+			// In-range values pass through; otherwise the sign of i selects 0 (negative input) or 255 (input above 255).
+			const int saturated = ((~i) >> 31) & 0xFF;
+			return static_cast<uint8>((static_cast<uint>(i) > 255) ? saturated : i);
+		}
+		int decode_next_mcu_row();
+
+		static void decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+	};
 
-    void free_all_blocks();
-    JPGD_NORETURN void stop_decoding(jpgd_status status);
-    void *alloc(size_t n, bool zero = false);
-    void word_clear(void *p, uint16 c, uint n);
-    void prep_in_buffer();
-    void read_dht_marker();
-    void read_dqt_marker();
-    void read_sof_marker();
-    void skip_variable_marker();
-    void read_dri_marker();
-    void read_sos_marker();
-    int next_marker();
-    int process_markers();
-    void locate_soi_marker();
-    void locate_sof_marker();
-    int locate_sos_marker();
-    void init(jpeg_decoder_stream * pStream);
-    void create_look_ups();
-    void fix_in_buffer();
-    void transform_mcu(int mcu_row);
-    void transform_mcu_expand(int mcu_row);
-    coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
-    inline jpgd_block_t *coeff_buf_getp(coeff_buf *cb, int block_x, int block_y);
-    void load_next_row();
-    void decode_next_row();
-    void make_huff_table(int index, huff_tables *pH);
-    void check_quant_tables();
-    void check_huff_tables();
-    void calc_mcu_block_order();
-    int init_scan();
-    void init_frame();
-    void process_restart();
-    void decode_scan(pDecode_block_func decode_block_func);
-    void init_progressive();
-    void init_sequential();
-    void decode_start();
-    void decode_init(jpeg_decoder_stream * pStream);
-    void H2V2Convert();
-    void H2V1Convert();
-    void H1V2Convert();
-    void H1V1Convert();
-    void gray_convert();
-    void expanded_convert();
-    void find_eoi();
-    inline uint get_char();
-    inline uint get_char(bool *pPadding_flag);
-    inline void stuff_char(uint8 q);
-    inline uint8 get_octet();
-    inline uint get_bits(int num_bits);
-    inline uint get_bits_no_markers(int numbits);
-    inline int huff_decode(huff_tables *pH);
-    inline int huff_decode(huff_tables *pH, int& extrabits);
-    static inline uint8 clamp(int i);
-    static void decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-    static void decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-    static void decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-    static void decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-  };
-  
 } // namespace jpgd
 
 #endif // JPEG_DECODER_H
diff --git a/thirdparty/jpeg-compressor/jpgd_idct.h b/thirdparty/jpeg-compressor/jpgd_idct.h
new file mode 100644
index 0000000000..876425a959
--- /dev/null
+++ b/thirdparty/jpeg-compressor/jpgd_idct.h
@@ -0,0 +1,462 @@
+// Copyright 2009 Intel Corporation
+// All Rights Reserved
+//
+// Permission is granted to use, copy, distribute and prepare derivative works of this
+// software for any purpose and without fee, provided, that the above copyright notice
+// and this statement appear in all copies.  Intel makes no representations about the
+// suitability of this software for any purpose.  THIS SOFTWARE IS PROVIDED "AS IS."
+// INTEL SPECIFICALLY DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, AND ALL LIABILITY,
+// INCLUDING CONSEQUENTIAL AND OTHER INDIRECT DAMAGES, FOR THE USE OF THIS SOFTWARE,
+// INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PROPRIETARY RIGHTS, AND INCLUDING THE
+// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  Intel does not
+// assume any responsibility for any errors which may appear in this software nor any
+// responsibility to update it.
+//
+// From:
+// https://software.intel.com/sites/default/files/m/d/4/1/d/8/UsingIntelAVXToImplementIDCT-r1_5.pdf
+// https://software.intel.com/file/29048
+//
+// Requires SSE2 (uses 128-bit integer intrinsics such as _mm_madd_epi16 and _mm_shufflelo_epi16)
+//
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#include <immintrin.h>
+
+#ifdef _MSC_VER
+	#define JPGD_SIMD_ALIGN(type, name) __declspec(align(16)) type name // MSVC spelling: declares `name` with 16-byte alignment
+#else
+	#define JPGD_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) // other compilers: GCC-style attribute, same 16-byte alignment
+#endif
+
+#define BITS_INV_ACC 4
+#define SHIFT_INV_ROW (16 - BITS_INV_ACC) // parenthesized so the macro stays correct inside larger expressions
+#define SHIFT_INV_COL (1 + BITS_INV_ACC) // shift count passed to the _mm_srai_epi16 calls in idctSSEShortU8
+const short IRND_INV_ROW = 1024 * (6 - BITS_INV_ACC);	// equals 1 << (SHIFT_INV_ROW-1)
+const short IRND_INV_COL = 16 * (BITS_INV_ACC - 3);		// equals 1 << (SHIFT_INV_COL-1)
+const short IRND_INV_CORR = IRND_INV_COL - 1;			// correction -1.0 and round
+
+JPGD_SIMD_ALIGN(short, shortM128_one_corr[8]) = {1, 1, 1, 1, 1, 1, 1, 1}; // eight lanes of 1; added or OR-ed into intermediate sums as a one-LSB correction
+JPGD_SIMD_ALIGN(short, shortM128_round_inv_row[8]) = {IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0, IRND_INV_ROW, 0}; // added with _mm_add_epi32, i.e. IRND_INV_ROW in each 32-bit lane
+JPGD_SIMD_ALIGN(short, shortM128_round_inv_col[8]) = {IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL, IRND_INV_COL};
+JPGD_SIMD_ALIGN(short, shortM128_round_inv_corr[8])= {IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR, IRND_INV_CORR};
+JPGD_SIMD_ALIGN(short, shortM128_tg_1_16[8]) = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; // tan(1*pi/16) * (1<<16) + 0.5
+JPGD_SIMD_ALIGN(short, shortM128_tg_2_16[8]) = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; // tan(2*pi/16) * (1<<16) + 0.5
+JPGD_SIMD_ALIGN(short, shortM128_tg_3_16[8]) = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; // tan(3*pi/16) * (1<<16) + 0.5, minus 65536 so that it fits a signed short
+JPGD_SIMD_ALIGN(short, shortM128_cos_4_16[8]) = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};// cos(4*pi/16) * (1<<16) + 0.5, minus 65536 so that it fits a signed short
+
+//-----------------------------------------------------------------------------
+// Table for rows 0,4 - constants are multiplied by cos_4_16; 32 shorts, read as four 128-bit vectors at offsets 0, 8, 16 and 24
+// w15 w14 w11 w10 w07 w06 w03 w02
+// w29 w28 w25 w24 w21 w20 w17 w16
+// w31 w30 w27 w26 w23 w22 w19 w18
+//movq -> w05 w04 w01 w00
+JPGD_SIMD_ALIGN(short, shortM128_tab_i_04[]) = {
+	16384, 21407, 16384, 8867, // w05 w04 w01 w00
+	16384, -8867, 16384, -21407, // w13 w12 w09 w08
+	16384, 8867, -16384, -21407, // w07 w06 w03 w02
+	-16384, 21407, 16384, -8867, // w15 w14 w11 w10
+	22725, 19266, 19266, -4520, // w21 w20 w17 w16
+	12873, -22725, 4520, -12873, // w29 w28 w25 w24
+	12873, 4520, -22725, -12873, // w23 w22 w19 w18
+	4520, 19266, 19266, -22725}; // w31 w30 w27 w26
+
+// Table for rows 1,7 - constants are multiplied by cos_1_16; same 32-short layout as the rows 0,4 table
+//movq -> w05 w04 w01 w00
+JPGD_SIMD_ALIGN(short, shortM128_tab_i_17[]) = {
+	22725, 29692, 22725, 12299, // w05 w04 w01 w00
+	22725, -12299, 22725, -29692, // w13 w12 w09 w08
+	22725, 12299, -22725, -29692, // w07 w06 w03 w02
+	-22725, 29692, 22725, -12299, // w15 w14 w11 w10
+	31521, 26722, 26722, -6270, // w21 w20 w17 w16
+	17855, -31521, 6270, -17855, // w29 w28 w25 w24
+	17855, 6270, -31521, -17855, // w23 w22 w19 w18
+	6270, 26722, 26722, -31521}; // w31 w30 w27 w26
+
+// Table for rows 2,6 - constants are multiplied by cos_2_16; same 32-short layout as the rows 0,4 table
+//movq -> w05 w04 w01 w00
+JPGD_SIMD_ALIGN(short, shortM128_tab_i_26[]) = {
+	21407, 27969, 21407, 11585, // w05 w04 w01 w00
+	21407, -11585, 21407, -27969, // w13 w12 w09 w08
+	21407, 11585, -21407, -27969, // w07 w06 w03 w02
+	-21407, 27969, 21407, -11585, // w15 w14 w11 w10
+	29692, 25172, 25172, -5906,	// w21 w20 w17 w16
+	16819, -29692, 5906, -16819, // w29 w28 w25 w24
+	16819, 5906, -29692, -16819, // w23 w22 w19 w18
+	5906, 25172, 25172, -29692}; // w31 w30 w27 w26
+// Table for rows 3,5 - constants are multiplied by cos_3_16; same 32-short layout as the rows 0,4 table
+//movq -> w05 w04 w01 w00
+JPGD_SIMD_ALIGN(short, shortM128_tab_i_35[]) = {
+	19266, 25172, 19266, 10426, // w05 w04 w01 w00
+	19266, -10426, 19266, -25172, // w13 w12 w09 w08
+	19266, 10426, -19266, -25172, // w07 w06 w03 w02
+	-19266, 25172, 19266, -10426, // w15 w14 w11 w10
+	26722, 22654, 22654, -5315, // w21 w20 w17 w16
+	15137, -26722, 5315, -15137, // w29 w28 w25 w24
+	15137, 5315, -26722, -15137, // w23 w22 w19 w18
+	5315, 22654, 22654, -26722}; // w31 w30 w27 w26
+
+JPGD_SIMD_ALIGN(short, shortM128_128[8]) = { 128, 128, 128, 128, 128, 128, 128, 128 }; // +128 bias added to every result row in idctSSEShortU8 before packing to unsigned bytes
+// 8x8 inverse DCT with SSE2 intrinsics: reads eight rows of eight 16-bit coefficients from pInput, adds 128 to each result and writes 64 unsigned bytes (saturated to 0..255) to pOutputUB. Both buffers are accessed with aligned 128-bit loads/stores, so they must be 16-byte aligned.
+void idctSSEShortU8(const short *pInput, uint8_t * pOutputUB)
+{
+	__m128i r_xmm0, r_xmm4;
+	__m128i r_xmm1, r_xmm2, r_xmm3, r_xmm5, r_xmm6, r_xmm7;
+	__m128i row0, row1, row2, row3, row4, row5, row6, row7;
+	short * pTab_i_04 = shortM128_tab_i_04;
+	short * pTab_i_26 = shortM128_tab_i_26;
+
+	// Select the coefficient tables for the even input rows (these re-assignments repeat the initializers above)
+	pTab_i_04 = shortM128_tab_i_04;
+	pTab_i_26 = shortM128_tab_i_26;
+
+	//Row 1 and Row 3 (1-based; pInput rows 0 and 2)
+	r_xmm0 = _mm_load_si128((__m128i *) pInput);
+	r_xmm4 = _mm_load_si128((__m128i *) (&pInput[2*8]));
+
+	// *** Work on the data in xmm0
+	//low shuffle mask = 0xd8 = 11 01 10 00
+	//get short 2 and short 0 into ls 32-bits
+	r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
+
+	// copy short 2 and short 0 to all locations
+	r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
+		
+	// multiply those copies by table entries 0..7 and sum adjacent products into 32-bit lanes
+	r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
+
+	// shuffle mask = 0x55 = 01 01 01 01
+	// copy short 3 and short 1 to all locations
+	r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
+		
+	// high shuffle mask = 0xd8 = 11 01 10 00
+	// get short 6 and short 4 into bit positions 64-95
+	// get short 7 and short 5 into bit positions 96-127
+	r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
+		
+	// multiply the short 3 / short 1 copies by table entries 16..23 (pairwise multiply-add)
+	r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
+		
+	// shuffle mask = 0xaa = 10 10 10 10
+	// copy short 6 and short 4 to all locations
+	r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
+		
+	// shuffle mask = 0xff = 11 11 11 11
+	// copy short 7 and short 5 to all locations
+	r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
+		
+	// multiply the short 6 / short 4 copies by table entries 8..15 (pairwise multiply-add)
+	r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8])); 
+		
+	// *** Work on the data in xmm4
+	// high shuffle mask = 0xd8 11 01 10 00
+	// get short 6 and short 4 into bit positions 64-95
+	// get short 7 and short 5 into bit positions 96-127
+	r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
+		
+	// add the rounding constant shortM128_round_inv_row to the short 2 / short 0 products
+	r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
+	r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
+	r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
+	r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
+	r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
+	r_xmm2 = r_xmm1;
+	r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
+	r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
+	r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
+	r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
+	r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
+	r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
+	r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
+	r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
+	r_xmm6 = r_xmm5;
+	r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
+	r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
+	row0 = _mm_packs_epi32(r_xmm0, r_xmm2);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
+	r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
+	r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
+	r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
+	row2 = _mm_packs_epi32(r_xmm4, r_xmm6);
+
+	//Row 5 and row 7 (1-based; pInput rows 4 and 6), same sequence as above
+	r_xmm0 = _mm_load_si128((__m128i *) (&pInput[4*8]));
+	r_xmm4 = _mm_load_si128((__m128i *) (&pInput[6*8]));
+
+	r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
+	r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
+	r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
+	r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
+	r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
+	r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
+	r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
+	r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
+	r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8])); 
+	r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
+	r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
+	r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
+	r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
+	r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &shortM128_tab_i_26[0]));
+	r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
+	r_xmm2 = r_xmm1;
+	r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
+	r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &shortM128_tab_i_26[8])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
+	r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
+	r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
+	r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &shortM128_tab_i_26[16])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
+	r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
+	r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &shortM128_tab_i_26[24]));
+	r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
+	r_xmm6 = r_xmm5;
+	r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
+	r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
+	row4 = _mm_packs_epi32(r_xmm0, r_xmm2);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
+	r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
+	r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
+	r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
+	row6 = _mm_packs_epi32(r_xmm4, r_xmm6);
+
+	//Row 4 and row 2 (1-based; pInput rows 3 and 1) - the table pointers now select the odd-row tables
+	pTab_i_04 = shortM128_tab_i_35;
+	pTab_i_26 = shortM128_tab_i_17;
+	r_xmm0 = _mm_load_si128((__m128i *) (&pInput[3*8]));
+	r_xmm4 = _mm_load_si128((__m128i *) (&pInput[1*8]));
+
+	r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
+	r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
+	r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
+	r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
+	r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
+	r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
+	r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
+	r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
+	r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8])); 
+	r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
+	r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
+	r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
+	r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
+	r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
+	r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
+	r_xmm2 = r_xmm1;
+	r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
+	r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
+	r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
+	r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
+	r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
+	r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
+	r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
+	r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
+	r_xmm6 = r_xmm5;
+	r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
+	r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
+	row3 = _mm_packs_epi32(r_xmm0, r_xmm2);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
+	r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
+	r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
+	r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
+	row1 = _mm_packs_epi32(r_xmm4, r_xmm6);
+
+	//Row 6 and row 8 (1-based; pInput rows 5 and 7), same tables as the previous pair
+	r_xmm0 = _mm_load_si128((__m128i *) (&pInput[5*8]));
+	r_xmm4 = _mm_load_si128((__m128i *) (&pInput[7*8]));
+
+	r_xmm0 = _mm_shufflelo_epi16(r_xmm0, 0xd8);
+	r_xmm1 = _mm_shuffle_epi32(r_xmm0, 0);
+	r_xmm1 = _mm_madd_epi16(r_xmm1, *((__m128i *) pTab_i_04));
+	r_xmm3 = _mm_shuffle_epi32(r_xmm0, 0x55);
+	r_xmm0 = _mm_shufflehi_epi16(r_xmm0, 0xd8);
+	r_xmm3 = _mm_madd_epi16(r_xmm3, *((__m128i *) &pTab_i_04[16]));
+	r_xmm2 = _mm_shuffle_epi32(r_xmm0, 0xaa);
+	r_xmm0 = _mm_shuffle_epi32(r_xmm0, 0xff);
+	r_xmm2 = _mm_madd_epi16(r_xmm2, *((__m128i *) &pTab_i_04[8])); 
+	r_xmm4 = _mm_shufflehi_epi16(r_xmm4, 0xd8);
+	r_xmm1 = _mm_add_epi32(r_xmm1, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_shufflelo_epi16(r_xmm4, 0xd8);
+	r_xmm0 = _mm_madd_epi16(r_xmm0, *((__m128i *) &pTab_i_04[24]));
+	r_xmm5 = _mm_shuffle_epi32(r_xmm4, 0);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm4, 0xaa);
+	r_xmm5 = _mm_madd_epi16(r_xmm5, *((__m128i *) &pTab_i_26[0]));
+	r_xmm1 = _mm_add_epi32(r_xmm1, r_xmm2);
+	r_xmm2 = r_xmm1;
+	r_xmm7 = _mm_shuffle_epi32(r_xmm4, 0x55);
+	r_xmm6 = _mm_madd_epi16(r_xmm6, *((__m128i *) &pTab_i_26[8])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm3);
+	r_xmm4 = _mm_shuffle_epi32(r_xmm4, 0xff);
+	r_xmm2 = _mm_sub_epi32(r_xmm2, r_xmm0);
+	r_xmm7 = _mm_madd_epi16(r_xmm7, *((__m128i *) &pTab_i_26[16])); 
+	r_xmm0 = _mm_add_epi32(r_xmm0, r_xmm1);
+	r_xmm2 = _mm_srai_epi32(r_xmm2, 12);
+	r_xmm5 = _mm_add_epi32(r_xmm5, *((__m128i *) shortM128_round_inv_row));
+	r_xmm4 = _mm_madd_epi16(r_xmm4, *((__m128i *) &pTab_i_26[24]));
+	r_xmm5 = _mm_add_epi32(r_xmm5, r_xmm6);
+	r_xmm6 = r_xmm5;
+	r_xmm0 = _mm_srai_epi32(r_xmm0, 12);
+	r_xmm2 = _mm_shuffle_epi32(r_xmm2, 0x1b);
+	row5 = _mm_packs_epi32(r_xmm0, r_xmm2);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm7);
+	r_xmm6 = _mm_sub_epi32(r_xmm6, r_xmm4);
+	r_xmm4 = _mm_add_epi32(r_xmm4, r_xmm5);
+	r_xmm6 = _mm_srai_epi32(r_xmm6, 12);
+	r_xmm4 = _mm_srai_epi32(r_xmm4, 12);
+	r_xmm6 = _mm_shuffle_epi32(r_xmm6, 0x1b);
+	row7 = _mm_packs_epi32(r_xmm4, r_xmm6);
+	// Second pass: combine row0..row7 lane by lane with 16-bit saturating adds/subtracts and high-half multiplies by the tg/cos constants
+	r_xmm1 = _mm_load_si128((__m128i *) shortM128_tg_3_16);
+	r_xmm2 = row5;
+	r_xmm3 = row3;
+	r_xmm0 = _mm_mulhi_epi16(row5, r_xmm1);
+
+	r_xmm1 = _mm_mulhi_epi16(r_xmm1, r_xmm3);
+	r_xmm5 = _mm_load_si128((__m128i *) shortM128_tg_1_16);
+	r_xmm6 = row7;
+	r_xmm4 = _mm_mulhi_epi16(row7, r_xmm5);
+
+	r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm2);
+	r_xmm5 = _mm_mulhi_epi16(r_xmm5, row1);
+	r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm3);
+	r_xmm7 = row6;
+
+	r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm3);
+	r_xmm3 = _mm_load_si128((__m128i *) shortM128_tg_2_16);
+	r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm1);
+	r_xmm7 = _mm_mulhi_epi16(r_xmm7, r_xmm3);
+	r_xmm1 = r_xmm0;
+	r_xmm3 = _mm_mulhi_epi16(r_xmm3, row2);
+	r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm6);
+	r_xmm4 = _mm_adds_epi16(r_xmm4, row1);
+	r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm4);
+	r_xmm0 = _mm_adds_epi16(r_xmm0, *((__m128i *) shortM128_one_corr));
+	r_xmm4 = _mm_subs_epi16(r_xmm4, r_xmm1);
+	r_xmm6 = r_xmm5;
+	r_xmm5 = _mm_subs_epi16(r_xmm5, r_xmm2);
+	r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_one_corr));
+	r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
+
+	//Intermediate results, needed later
+	__m128i temp3, temp7;
+	temp7 = r_xmm0;
+
+	r_xmm1 = r_xmm4;
+	r_xmm0 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
+	r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm5);
+	r_xmm2 = _mm_load_si128((__m128i *) shortM128_cos_4_16);
+	r_xmm2 = _mm_mulhi_epi16(r_xmm2, r_xmm4);
+
+	//Intermediate results, needed later
+	temp3 = r_xmm6;
+
+	r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm5);
+	r_xmm7 = _mm_adds_epi16(r_xmm7, row2);
+	r_xmm3 = _mm_subs_epi16(r_xmm3, row6);
+	r_xmm6 = row0;
+	r_xmm0 = _mm_mulhi_epi16(r_xmm0, r_xmm1);
+	r_xmm5 = row4;
+	r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm6);
+	r_xmm6 = _mm_subs_epi16(r_xmm6, row4);
+	r_xmm4 = _mm_adds_epi16(r_xmm4, r_xmm2);
+
+	r_xmm4 = _mm_or_si128(r_xmm4, *((__m128i *) shortM128_one_corr));
+	r_xmm0 = _mm_adds_epi16(r_xmm0, r_xmm1);
+	r_xmm0 = _mm_or_si128(r_xmm0, *((__m128i *) shortM128_one_corr));
+
+	r_xmm2 = r_xmm5;
+	r_xmm5 = _mm_adds_epi16(r_xmm5, r_xmm7);
+	r_xmm1 = r_xmm6;
+	r_xmm5 = _mm_adds_epi16(r_xmm5, *((__m128i *) shortM128_round_inv_col));
+	r_xmm2 = _mm_subs_epi16(r_xmm2, r_xmm7);
+	r_xmm7 = temp7;
+	r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm3);
+	r_xmm6 = _mm_adds_epi16(r_xmm6, *((__m128i *) shortM128_round_inv_col));
+	r_xmm7 = _mm_adds_epi16(r_xmm7, r_xmm5);
+	r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
+	r_xmm1 = _mm_subs_epi16(r_xmm1, r_xmm3);
+	r_xmm1 = _mm_adds_epi16(r_xmm1, *((__m128i *) shortM128_round_inv_corr));
+	r_xmm3 = r_xmm6;
+	r_xmm2 = _mm_adds_epi16(r_xmm2, *((__m128i *) shortM128_round_inv_corr));
+	r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm4);
+
+	//Result for row 0 is kept in r0 (the direct store below is disabled)
+	//_mm_store_si128((__m128i *) pOutput, r_xmm7);
+	__m128i r0 = r_xmm7;
+
+	r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
+	r_xmm7 = r_xmm1;
+	r_xmm1 = _mm_adds_epi16(r_xmm1, r_xmm0);
+
+	//Result for row 1 is kept in r1
+	//_mm_store_si128((__m128i *) (&pOutput[1*8]), r_xmm6); 
+	__m128i r1 = r_xmm6;
+
+	r_xmm1 = _mm_srai_epi16(r_xmm1, SHIFT_INV_COL);
+	r_xmm6 = temp3;
+	r_xmm7 = _mm_subs_epi16(r_xmm7, r_xmm0);
+	r_xmm7 = _mm_srai_epi16(r_xmm7, SHIFT_INV_COL);
+
+	//Result for row 2 is kept in r2
+	//_mm_store_si128((__m128i *) (&pOutput[2*8]), r_xmm1); 
+	__m128i r2 = r_xmm1;
+
+	r_xmm5 = _mm_subs_epi16(r_xmm5, temp7); 
+	r_xmm5 = _mm_srai_epi16(r_xmm5, SHIFT_INV_COL);
+
+	//Result for row 7 is kept in r7
+	//_mm_store_si128((__m128i *) (&pOutput[7*8]), r_xmm5); 
+	__m128i r7 = r_xmm5;
+
+	r_xmm3 = _mm_subs_epi16(r_xmm3, r_xmm4);
+	r_xmm6 = _mm_adds_epi16(r_xmm6, r_xmm2);
+	r_xmm2 = _mm_subs_epi16(r_xmm2, temp3); 
+	r_xmm6 = _mm_srai_epi16(r_xmm6, SHIFT_INV_COL);
+	r_xmm2 = _mm_srai_epi16(r_xmm2, SHIFT_INV_COL);
+
+	//Result for row 3 is kept in r3
+	//_mm_store_si128((__m128i *) (&pOutput[3*8]), r_xmm6); 
+	__m128i r3 = r_xmm6;
+
+	r_xmm3 = _mm_srai_epi16(r_xmm3, SHIFT_INV_COL);
+
+	//Results for rows 4, 5, and 6 are kept in r4, r5 and r6
+	//_mm_store_si128((__m128i *) (&pOutput[4*8]), r_xmm2);
+	//_mm_store_si128((__m128i *) (&pOutput[5*8]), r_xmm7);
+	//_mm_store_si128((__m128i *) (&pOutput[6*8]), r_xmm3);
+
+	__m128i r4 = r_xmm2;
+	__m128i r5 = r_xmm7;
+	__m128i r6 = r_xmm3;
+	// Add 128 to every 16-bit result
+	r0 = _mm_add_epi16(*(const __m128i *)shortM128_128, r0);
+	r1 = _mm_add_epi16(*(const __m128i *)shortM128_128, r1);
+	r2 = _mm_add_epi16(*(const __m128i *)shortM128_128, r2);
+	r3 = _mm_add_epi16(*(const __m128i *)shortM128_128, r3);
+	r4 = _mm_add_epi16(*(const __m128i *)shortM128_128, r4);
+	r5 = _mm_add_epi16(*(const __m128i *)shortM128_128, r5);
+	r6 = _mm_add_epi16(*(const __m128i *)shortM128_128, r6);
+	r7 = _mm_add_epi16(*(const __m128i *)shortM128_128, r7);
+	// Pack two rows per vector with unsigned saturation (0..255) and write the 64 output bytes; pOutputUB must be 16-byte aligned
+	((__m128i *)pOutputUB)[0] = _mm_packus_epi16(r0, r1);
+	((__m128i *)pOutputUB)[1] = _mm_packus_epi16(r2, r3);
+	((__m128i *)pOutputUB)[2] = _mm_packus_epi16(r4, r5);
+	((__m128i *)pOutputUB)[3] = _mm_packus_epi16(r6, r7);
+}