New files
diff --git a/encoder/cppspmd_flow.h b/encoder/cppspmd_flow.h
new file mode 100644
index 0000000..f693047
--- /dev/null
+++ b/encoder/cppspmd_flow.h
@@ -0,0 +1,590 @@
+// Do not include this header directly.
+// Control flow functionality in common between all the headers.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _DEBUG
+CPPSPMD_FORCE_INLINE void spmd_kernel::check_masks()
+{
+	assert(!any(andnot(m_kernel_exec, m_exec)));
+}
+#endif
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_break()
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	m_exec = exec_mask::all_off();
+}
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_continue()
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	// Kill any active lanes, and remember which lanes were active so we can re-enable them at the end of the loop body.
+	m_continue_mask = m_continue_mask | m_exec;
+	m_exec = exec_mask::all_off();
+}
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_return()
+{
+	// Permanently kill all active lanes
+	m_kernel_exec = andnot(m_exec, m_kernel_exec);
+	m_exec = exec_mask::all_off();
+}
+			
+template<typename UnmaskedBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_unmasked(const UnmaskedBody& unmaskedBody)
+{
+	exec_mask orig_exec = m_exec, orig_kernel_exec = m_kernel_exec;
+
+	m_kernel_exec = exec_mask::all_on();
+	m_exec = exec_mask::all_on();
+
+	unmaskedBody();
+
+	m_kernel_exec = m_kernel_exec & orig_kernel_exec;
+	m_exec = m_exec & orig_exec;
+	
+	check_masks();
+}
+
+struct scoped_unmasked_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_kernel_exec;
+				
+	CPPSPMD_FORCE_INLINE scoped_unmasked_restorer(spmd_kernel *pKernel) : 
+		m_pKernel(pKernel), 
+		m_orig_exec(pKernel->m_exec),
+		m_orig_kernel_exec(pKernel->m_kernel_exec)
+	{
+		pKernel->m_kernel_exec = exec_mask::all_on();
+		pKernel->m_exec = exec_mask::all_on();
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_unmasked_restorer() 
+	{ 
+		m_pKernel->m_kernel_exec = m_pKernel->m_kernel_exec & m_orig_kernel_exec;
+		m_pKernel->m_exec = m_pKernel->m_exec & m_orig_exec;
+		m_pKernel->check_masks();
+	}
+};
+
+#define SPMD_UNMASKED_BEGIN { scoped_unmasked_restorer _unmasked_restorer(this); 
+#define SPMD_UNMASKED_END }
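+
+// Illustrative usage sketch (not part of the original header): inside a kernel's _call() method, an
+// unmasked region temporarily enables all lanes, e.g. to update bookkeeping state shared by the whole
+// program instance ('total_blocks' below is a hypothetical vint member):
+//
+//   SPMD_UNMASKED_BEGIN
+//      store_all(total_blocks, total_blocks + 1);   // runs on every lane, ignoring divergence
+//   SPMD_UNMASKED_END
+//
+// On exit, scoped_unmasked_restorer ANDs the saved exec/kernel masks back in, re-applying the
+// divergence that was active around the region.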
+
+#if 0
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE decltype(auto) spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	return kernel._call(std::forward<Args>(args)...);
+}
+#else
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	kernel._call(std::forward<Args>(args)...);
+}
+#endif
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if_break(const vbool& cond)
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+	
+	exec_mask cond_exec(cond);
+					
+	m_exec = andnot(m_exec & cond_exec, m_exec);
+
+	check_masks();
+}
+
+// No SPMD breaks, continues, etc. allowed
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sif(const vbool& cond, const IfBody& ifBody)
+{
+	exec_mask im = m_exec & exec_mask(cond);
+
+	if (any(im))
+	{
+		const exec_mask orig_exec = m_exec;
+		m_exec = im;
+		ifBody();
+		m_exec = orig_exec;
+	}
+}
+
+// No SPMD breaks, continues, etc. allowed
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody)
+{
+	const exec_mask orig_exec = m_exec;
+
+	exec_mask im = m_exec & exec_mask(cond);
+
+	if (any(im))
+	{
+		m_exec = im;
+		ifBody();
+	}
+
+	exec_mask em = orig_exec & exec_mask(!cond);
+
+	if (any(em))
+	{
+		m_exec = em;
+		elseBody();
+	}
+		
+	m_exec = orig_exec;
+}
+
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if(const vbool& cond, const IfBody& ifBody)
+{
+	exec_mask cond_exec(cond);
+		
+	exec_mask pre_if_exec = cond_exec & m_exec;
+
+	if (any(pre_if_exec))
+	{
+		exec_mask unexecuted_lanes = andnot(cond_exec, m_exec);
+		m_exec = pre_if_exec;
+
+		ifBody();
+
+		// Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body.
+		m_exec = m_exec | unexecuted_lanes;
+
+		check_masks();
+	}
+}
+
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody)
+{
+	bool all_flag = false;
+
+	exec_mask cond_exec(cond);
+		
+	{
+		exec_mask pre_if_exec = cond_exec & m_exec;
+
+		int mask = pre_if_exec.get_movemask();
+		if (mask != 0)
+		{
+			all_flag = ((uint32_t)mask == m_exec.get_movemask());
+
+			exec_mask unexecuted_lanes = andnot(cond_exec, m_exec);
+			m_exec = pre_if_exec;
+
+			ifBody();
+
+			// Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+
+	if (!all_flag)
+	{
+		exec_mask pre_if_exec = andnot(cond_exec, m_exec);
+
+		if (any(pre_if_exec))
+		{
+			exec_mask unexecuted_lanes = cond_exec & m_exec;
+			m_exec = pre_if_exec;
+
+			elseBody();
+
+			// Propagate any lanes that got disabled inside the else body into the exec mask outside the else body, but turn on any lanes that didn't execute inside the else body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+}
+
+struct scoped_exec_restorer
+{
+	exec_mask *m_pMask;
+	exec_mask m_prev_mask;
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer(exec_mask *pExec_mask) : m_pMask(pExec_mask), m_prev_mask(*pExec_mask) { }
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer() { *m_pMask = m_prev_mask; }
+};
+
+// Cannot use SPMD break, continue, or return inside "simple" if/else
+#define SPMD_SIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SENDIF }
+
+// Same as SPMD_SIF, except doesn't use a scoped object
+#define SPMD_SIF2(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE2(cond) m_exec = _orig_exec; } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SEND_IF2 m_exec = _orig_exec; }
+
+// Same as SPMD_SIF(), except the if/else blocks are always executed
+#define SPMD_SAIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAENDIF }
+
+// Cannot use SPMD break, continue, or return inside sselect
+#define SPMD_SSELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+#define SPMD_SCASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); _select_executed = _select_executed | m_exec;
+
+//#define SPMD_SCASE_END			if (_select_executed.get_movemask() == _orig_exec.m_prev_mask.get_movemask()) break; }
+#define SPMD_SCASE_END			if (!any(_select_executed ^ _orig_exec.m_prev_mask)) break; }
+#define SPMD_SDEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); if (any(_all_other_lanes)) { m_exec = _all_other_lanes;
+#define SPMD_SDEFAULT_END		}
+#define SPMD_SSELECT_END		} while(0);
+
+// Same as SPMD_SSELECT, except all cases are executed.
+// Cannot use SPMD break, continue, or return inside sselect
+#define SPMD_SASELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+
+#define SPMD_SACASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); { m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); \
+	_select_executed = _select_executed | m_exec;
+
+#define SPMD_SACASE_END			}
+#define SPMD_SADEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); { m_exec = _all_other_lanes;
+#define SPMD_SADEFAULT_END		}
+#define SPMD_SASELECT_END		} while(0);
+
+struct scoped_exec_restorer2
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_unexecuted_lanes;
+		
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer2(spmd_kernel *pKernel, const vbool &cond) : 
+		m_pKernel(pKernel)
+	{ 
+		exec_mask cond_exec(cond);
+		m_unexecuted_lanes = andnot(cond_exec, pKernel->m_exec);
+		pKernel->m_exec = cond_exec & pKernel->m_exec;
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer2() 
+	{ 
+		m_pKernel->m_exec = m_pKernel->m_exec | m_unexecuted_lanes;
+		m_pKernel->check_masks();
+	}
+};
+
+#define SPMD_IF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); if (any(m_exec)) {
+#define SPMD_ELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); if (any(m_exec)) {
+#define SPMD_END_IF } }
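+
+// Illustrative usage sketch (not part of the original header); 'x' and 'y' are hypothetical vfloat
+// variables inside a kernel method. Note the condition is passed again to SPMD_ELSE(), and
+// spmd_break()/spmd_continue()/spmd_return() are allowed inside these blocks (unlike the "simple"
+// SPMD_SIF/SPMD_SELSE forms above):
+//
+//   SPMD_IF(x < 0.0f)
+//   {
+//      store(y, -x);           // executes only on lanes where x < 0
+//   }
+//   SPMD_ELSE(x < 0.0f)
+//   {
+//      store(y, x * 0.5f);     // executes only on the remaining active lanes
+//   }
+//   SPMD_END_IF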
+
+// Same as SPMD_IF, except the conditional block is always executed.
+#define SPMD_AIF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); {
+#define SPMD_AELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); {
+#define SPMD_AEND_IF } }
+
+class scoped_exec_saver
+{
+	exec_mask m_exec, m_kernel_exec, m_continue_mask;
+	spmd_kernel *m_pKernel;
+#ifdef _DEBUG
+	bool m_in_loop;
+#endif
+
+public:
+	inline scoped_exec_saver(spmd_kernel *pKernel) :
+		m_exec(pKernel->m_exec), m_kernel_exec(pKernel->m_kernel_exec), m_continue_mask(pKernel->m_continue_mask),
+		m_pKernel(pKernel)
+	{ 
+#ifdef _DEBUG
+		m_in_loop = pKernel->m_in_loop;
+#endif
+	}
+		
+	inline ~scoped_exec_saver()
+	{ 
+		m_pKernel->m_exec = m_exec; 
+		m_pKernel->m_continue_mask = m_continue_mask; 
+		m_pKernel->m_kernel_exec = m_kernel_exec; 
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+#define SPMD_BEGIN_CALL scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_continue_mask = exec_mask::all_off();
+#define SPMD_BEGIN_CALL_ALL_LANES scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_exec = exec_mask::all_on(); m_continue_mask = exec_mask::all_off();
+
+template<typename ForeachBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_foreach(int begin, int end, const ForeachBody& foreachBody)
+{
+	if (begin == end)
+		return;
+	
+	if (!any(m_exec))
+		return;
+
+	// We don't support iterating backwards.
+	if (begin > end)
+		std::swap(begin, end);
+
+	exec_mask prev_continue_mask = m_continue_mask, prev_exec = m_exec;
+	
+	int total_full = (end - begin) / PROGRAM_COUNT;
+	int total_partial = (end - begin) % PROGRAM_COUNT;
+
+	lint_t loop_index = begin + program_index;
+	
+	const int total_loops = total_full + (total_partial ? 1 : 0);
+
+	m_continue_mask = exec_mask::all_off();
+
+	for (int i = 0; i < total_loops; i++)
+	{
+		int n = PROGRAM_COUNT;
+		if ((i == (total_loops - 1)) && (total_partial))
+		{
+			exec_mask partial_mask = exec_mask(vint_t(total_partial) > vint_t(program_index));
+			m_exec = m_exec & partial_mask;
+			n = total_partial;
+		}
+
+		foreachBody(loop_index, n);
+
+		m_exec = m_exec | m_continue_mask;
+		if (!any(m_exec))
+			break;
+
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+				
+		store_all(loop_index, loop_index + PROGRAM_COUNT);
+	}
+
+	m_exec = prev_exec & m_kernel_exec;
+	m_continue_mask = prev_continue_mask;
+	check_masks();
+}
+
+template<typename WhileCondBody, typename WhileBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody)
+{
+	exec_mask orig_exec = m_exec;
+
+	exec_mask orig_continue_mask = m_continue_mask;
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	const bool prev_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	while(true)
+	{
+		exec_mask cond_exec = exec_mask(whileCondBody());
+		m_exec = m_exec & cond_exec;
+
+		if (!any(m_exec))
+			break;
+
+		whileBody();
+
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+	}
+
+#ifdef _DEBUG
+	m_in_loop = prev_in_loop;
+#endif
+
+	m_exec = orig_exec & m_kernel_exec;
+	m_continue_mask = orig_continue_mask;
+	check_masks();
+}
+
+struct scoped_while_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_continue_mask;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+				
+	CPPSPMD_FORCE_INLINE scoped_while_restorer(spmd_kernel *pKernel) : 
+		m_pKernel(pKernel), 
+		m_orig_exec(pKernel->m_exec),
+		m_orig_continue_mask(pKernel->m_continue_mask)
+	{
+		pKernel->m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_while_restorer() 
+	{ 
+		m_pKernel->m_exec = m_orig_exec & m_pKernel->m_kernel_exec;
+		m_pKernel->m_continue_mask = m_orig_continue_mask;
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+#undef SPMD_WHILE
+#undef SPMD_WEND
+#define SPMD_WHILE(cond) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+
+#define SPMD_WEND m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); } }
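+
+// Illustrative usage sketch (not part of the original header): a divergent loop where each lane
+// iterates until its own value converges; 'err' is a hypothetical vfloat, 'iters' a vint.
+//
+//   vint iters = 0;
+//   SPMD_WHILE(err > 0.125f)
+//   {
+//      store(err, err * 0.5f);      // body runs only on lanes whose condition is still true
+//      store(iters, iters + 1);
+//      // spmd_break()/spmd_continue() may be used here; the continue mask is re-merged at SPMD_WEND.
+//   }
+//   SPMD_WEND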
+
+// Nesting is not supported (it will compile, but the results won't make much sense).
+#define SPMD_FOREACH(loop_var, bi, ei) if (((bi) != (ei)) && (any(m_exec))) { \
+	scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	uint32_t b = (uint32_t)(bi), e = (uint32_t)(ei); if ((b) > (e)) { std::swap(b, e); } const uint32_t total_full = ((e) - (b)) >> PROGRAM_COUNT_SHIFT, total_partial = ((e) - (b)) & (PROGRAM_COUNT - 1); \
+	lint_t loop_var = program_index + (int)b; const uint32_t total_loops = total_full + (total_partial ? 1U : 0U); \
+	for (uint32_t CPPSPMD_GLUER2(_foreach_counter, __LINE__) = 0; CPPSPMD_GLUER2(_foreach_counter, __LINE__) < total_loops; ++CPPSPMD_GLUER2(_foreach_counter, __LINE__)) { \
+		if ((CPPSPMD_GLUER2(_foreach_counter, __LINE__) == (total_loops - 1)) && (total_partial)) { exec_mask partial_mask = exec_mask(vint_t((int)total_partial) > vint_t(program_index)); m_exec = m_exec & partial_mask; }
+
+#define SPMD_FOREACH_END(loop_var) m_exec = m_exec | m_continue_mask; if (!any(m_exec)) break; m_continue_mask = exec_mask::all_off(); check_masks(); store_all(loop_var, loop_var + PROGRAM_COUNT); } }
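+
+// Illustrative usage sketch (not part of the original header): visit [0, n) in PROGRAM_COUNT-wide
+// chunks. 'i' becomes a lint holding each lane's element index, and the trailing partial chunk is
+// masked off automatically.
+//
+//   vint hits = 0;
+//   SPMD_FOREACH(i, 0, n)
+//   {
+//      // per-element work goes here; gathers/scatters can be indexed with 'i'
+//      store(hits, hits + 1);       // counts the elements each lane actually processed
+//   }
+//   SPMD_FOREACH_END(i);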
+
+// Okay to use spmd_continue or spmd_return, but not spmd_break
+#define SPMD_FOREACH_ACTIVE(index_var) int64_t index_var; { uint64_t _movemask = m_exec.get_movemask(); if (_movemask) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	for (uint32_t _i = 0; _i < PROGRAM_COUNT; ++_i) { \
+		if (_movemask & (1U << _i)) { \
+			m_exec.enable_lane(_i); m_exec = m_exec & m_kernel_exec; \
+			(index_var) = _i; \
+
+#define SPMD_FOREACH_ACTIVE_END } } } }
+
+// Okay to use spmd_continue, but not spmd_break/spmd_return
+#define SPMD_FOREACH_UNIQUE_INT(index_var, var) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	CPPSPMD_DECL(int_t, _vals[PROGRAM_COUNT]); store_linear_all(_vals, var); std::sort(_vals, _vals + PROGRAM_COUNT); \
+	const int _n = (int)(std::unique(_vals, _vals + PROGRAM_COUNT) - _vals); \
+	for (int _i = 0; _i < _n; ++_i) { int index_var = _vals[_i]; vbool cond = (vint_t(var) == vint_t(index_var)); m_exec = exec_mask(cond);
+
+#define SPMD_FOREACH_UNIQUE_INT_END } }
+
+struct scoped_simple_while_restorer
+{
+	spmd_kernel* m_pKernel;
+	exec_mask m_orig_exec;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+
+	CPPSPMD_FORCE_INLINE scoped_simple_while_restorer(spmd_kernel* pKernel) :
+		m_pKernel(pKernel),
+		m_orig_exec(pKernel->m_exec)
+	{
+			
+#ifdef _DEBUG
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_simple_while_restorer()
+	{
+		m_pKernel->m_exec = m_orig_exec;
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// Cannot use SPMD break, continue, or return inside simple while
+
+#define SPMD_SWHILE(cond) { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	while(true) { \
+		exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+#define SPMD_SWEND } }	
+
+// Cannot use SPMD break, continue, or return inside simple do
+#define SPMD_SDO { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) {
+#define SPMD_SEND_DO(cond) exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; } }	
+
+#undef SPMD_FOR
+#undef SPMD_END_FOR
+#define SPMD_FOR(for_init, for_cond) { for_init; scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(for_cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+#define SPMD_END_FOR(for_inc) m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); for_inc; } }
+		
+template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody)
+{
+	exec_mask orig_exec = m_exec;
+
+	forInitBody();
+
+	exec_mask orig_continue_mask = m_continue_mask;
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	const bool prev_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	while(true)
+	{
+		exec_mask cond_exec = exec_mask(forCondBody());
+		m_exec = m_exec & cond_exec;
+
+		if (!any(m_exec))
+			break;
+
+		forBody();
+
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+			
+		forIncrBody();
+	}
+
+	m_exec = orig_exec & m_kernel_exec;
+	m_continue_mask = orig_continue_mask;
+
+#ifdef _DEBUG
+	m_in_loop = prev_in_loop;
+	check_masks();
+#endif
+}
diff --git a/encoder/cppspmd_math.h b/encoder/cppspmd_math.h
new file mode 100644
index 0000000..e7b3202
--- /dev/null
+++ b/encoder/cppspmd_math.h
@@ -0,0 +1,725 @@
+// Do not include this header directly.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The general goal of these vectorized estimated math functions is scalability/performance.
+// There are explicitly no checks for NaNs/Infs on the input arguments. There are no assertions either.
+// These are fast estimate functions - if you need more than that, use the stdlib. Please do a proper
+// engineering analysis before relying on them.
+// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
+// I compared each against DirectXMath and the stdlib for accuracy/performance.
+
+CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv) 
+{ 
+	vfloat c = frac(abs(a * b_inv)) * abs(b); 
+	return spmd_ternaryf(a < 0, -c, c); 
+}
+
+CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv) 
+{ 
+	return frac(a * b_inv) * b; 
+}
+
+// Avoids dividing by zero or very small values.
+CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
+{
+	return a / spmd_ternaryf( abs(b) > fDivThresh, b, spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh) );
+}
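+
+// For example (with the default threshold): safe_div(1.0f, 0.0f) divides by +1e-7f and returns
+// 1e7f instead of +Inf, and safe_div(1.0f, -1e-9f) divides by -1e-7f and returns -1e7f, so the
+// sign of a tiny denominator is preserved.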
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824
+
+	log2_est():
+	max abs err: 0.0000023076808731
+	max rel err: 0.0000000756678881
+	avg abs err: 0.0000007535452724
+	avg rel err: 0.0000000235117843
+
+	XMVectorLog2():
+	max abs err: 0.0000023329709933
+	max rel err: 0.0000000826961046
+	avg abs err: 0.0000007564889684
+	avg rel err: 0.0000000236051899
+
+	std::log2f():
+	max abs err: 0.0000020265979401
+	max rel err: 0.0000000626647654
+	avg abs err: 0.0000007494445227
+	avg rel err: 0.0000000233800985
+*/
+
+// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
+inline vfloat spmd_kernel::log2_est(vfloat v)
+{
+	vfloat signif, fexp;
+
+	// Just clamp to a very small value, instead of checking for invalid inputs.
+	vfloat x = max(v, 2.2e-38f);
+
+	/*
+	 * Assume IEEE representation, which is sgn(1):exp(8):frac(23)
+	 * representing (1+frac)*2^(exp-127).  Call 1+frac the significand
+	 */
+
+	 // get exponent
+	vint ux1_i = cast_vfloat_to_vint(x);
+
+	vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23);
+
+	// actual exponent is exp-127, will subtract 127 later
+
+	vint ux2_i;
+	vfloat ux2_f;
+
+	vint greater = ux1_i & 0x00400000;  // true if signif >= 1.5
+	SPMD_SIF(greater != 0)
+	{
+		// signif >= 1.5 so need to divide by 2.  Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1 
+		store_all(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f000000);
+
+		store_all(ux2_f, cast_vint_to_vfloat(ux2_i));
+
+		// 126 instead of 127 compensates for division by 2
+		store_all(fexp, vfloat(exp - 126));    
+	}
+	SPMD_SELSE(greater != 0)
+	{
+		// get signif by stuffing exp = 127 which corresponds to an exponent of 0
+		store(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f800000);
+
+		store(ux2_f, cast_vint_to_vfloat(ux2_i));
+
+		store(fexp, vfloat(exp - 127));
+	}
+	SPMD_SENDIF
+
+	store_all(signif, ux2_f);
+	store_all(signif, signif - 1.0f);
+
+	const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f;
+
+	vfloat xm1 = signif;
+	vfloat xm1sqr = xm1 * xm1;
+		
+	return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e));
+	
+	// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
+	//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
+}
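+
+// The reduction above in scalar form (illustrative sketch only, not used by the kernel): split
+// x = m * 2^e with m in [1,2) using the IEEE bit layout, then log2(x) = e + log2(m), with log2(m)
+// approximated by the rational polynomial. The vector version additionally folds a divide-by-2
+// into the exponent when m >= 1.5, which keeps the polynomial argument in roughly [-0.25, 0.5)
+// instead of [0, 1) and improves the fit.
+//
+//   static inline float log2_est_scalar(float x)     // x must be > 0
+//   {
+//      uint32_t u; memcpy(&u, &x, 4);
+//      float e = (float)(int)((u >> 23) & 0xFF) - 127.0f;   // unbiased exponent
+//      u = (u & 0x007FFFFFu) | 0x3F800000u;                 // significand forced into [1, 2)
+//      float m; memcpy(&m, &u, 4);
+//      m -= 1.0f;                                           // the "xm1" term above
+//      const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, ee = 3.4813372f;
+//      float m2 = m * m;
+//      return e + (a * m2 * m + b * m2 + c * m) / (m2 + d * m + ee);
+//   }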
+
+// Uses log2_est(), so this function can't be more precise than that.
+inline vfloat spmd_kernel::log_est(vfloat v)
+{
+	return log2_est(v) * 0.693147181f;
+}
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
+{
+	// Assume we're using equation (2)
+	store_all(adjustment, 0);
+	
+	// integer part of the input argument
+	vint int_arg = (vint)arg;
+	
+	// if frac(arg) is in [0.5, 1.0]...
+	SPMD_SIF((arg - int_arg) > 0.5f)   
+	{
+		store(adjustment, 1);
+		
+		// then change it to [0.0, 0.5]
+		store(arg, arg - 0.5f);
+	}
+	SPMD_SENDIF
+
+	// arg == just the fractional part
+	store_all(arg, arg - (vfloat)int_arg);
+   
+	// Now compute 2** (int) arg. 
+	store_all(int_arg, min(int_arg + 127, 254));
+	
+	store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23)));
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range : -50.0000000000000000 49.9999940395355225, vals : 16777216
+	
+	exp2_est():
+	Total passed near - zero check : 16777216
+	Total sign diffs : 0
+	max abs err: 1668910609.7500000000000000
+	max rel err: 0.0000015642030031
+	avg abs err: 10793794.4007573910057545
+	avg rel err: 0.0000003890893282
+	 
+	XMVectorExp2():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1665552836.8750000000000000
+	max rel err: 0.0000114674862370
+	avg abs err: 10771868.2627860084176064
+	avg rel err: 0.0000011218880770
+
+	std::exp2f():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1591636585.6250000000000000
+	max rel err: 0.0000014849731018
+	avg abs err: 10775800.3204844966530800
+	avg rel err: 0.0000003851496422
+*/
+
+// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
+inline vfloat spmd_kernel::exp2_est(vfloat arg)
+{
+	SPMD_BEGIN_CALL
+
+	const vfloat P00 = +7.2152891521493f;
+	const vfloat P01 = +0.0576900723731f;
+	const vfloat Q00 = +20.8189237930062f;
+	const vfloat Q01 = +1.0f;
+	const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling 
+
+	vfloat result = 0.0f;
+
+	// Return 0 if arg is too large.
+	// We don't introduce Inf/NaNs into the calculation, or risk doing so by returning huge default values.
+	SPMD_IF(abs(arg) > 126.0f)
+	{
+		spmd_return();
+	}
+	SPMD_END_IF
+
+	// 2**(int(a))
+	vfloat two_int_a;                
+	
+	// set to 0 or 1 by reduce_expb() (1 if the fractional part was > 0.5)
+	vint adjustment;
+	
+	// 0 if arg is +; 1 if negative
+	vint negative = 0;                 
+
+	// If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-x) = 1/(n**x).
+	SPMD_SIF(arg < 0.0f)
+	{
+		store(arg, -arg);
+		store(negative, 1);
+	}
+	SPMD_SENDIF
+
+	store_all(arg, min(arg, 126.0f));
+
+	// reduce to [0.0, 0.5]
+	reduce_expb(arg, two_int_a, adjustment);
+
+	// The format of the polynomial is:
+	//  answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2))
+	//
+	//  The following computes the polynomial in several steps:
+
+	// Q(x**2)
+	vfloat Q = vfma(Q01, (arg * arg), Q00);
+	
+	// x*P(x**2)
+	vfloat x_P = arg * (vfma(P01, arg * arg, P00));
+	
+	vfloat answer = (Q + x_P) / (Q - x_P);
+
+	// Now correct for the scaling factor of 2**(int(a))
+	store_all(answer, answer * two_int_a);
+			
+	// If the result had a fractional part > 0.5, correct for that
+	store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer));
+
+	// Correct for a negative input
+	SPMD_SIF(negative != 0)
+	{
+		store(answer, 1.0f / answer);
+	}
+	SPMD_SENDIF
+
+	store(result, answer);
+
+	return result;
+}
+
+inline vfloat spmd_kernel::exp_est(vfloat arg)
+{
+	// e^x = exp2(x * log2(e)) = exp2(x / ln(2))
+	// the constant is log2(e) = 1/ln(2) ~= 1.44269504
+	return exp2_est(arg * 1.44269504f);
+}
+
+inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
+{
+	return exp_est(log_est(arg1) * arg2);
+}
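+
+// pow_est() relies on the identity x^y = e^(y * ln(x)), so it is only meaningful for x > 0
+// (log2_est() clamps non-positive inputs to a tiny value) and its error compounds the errors of
+// both log_est() and exp_est().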
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	Total near-zero: 144, output above near-zero thresh: 30
+	Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16777072
+	Total sign diffs: 5
+	max abs err: 0.0000031375306036
+	max rel err: 0.1140846017075028
+	avg abs err: 0.0000003026226621
+	avg rel err: 0.0000033564977623
+*/
+
+// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
+// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
+inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
+{
+	const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
+	const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
+	const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
+	const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
+	const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;
+
+	vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;
+
+	store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);
+
+	store_all(r1_y, frac(r1_x));                   
+	
+	store_all(r2_x, (vfloat)(r1_y < c1_x));        
+
+	store_all(r2_y, (vfloat)(r1_y >= c1_y));    
+	store_all(r2_z, (vfloat)(r1_y >= c1_z));    
+
+	store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));
+
+	store_all(r0_x, c0_x - r1_y);                
+	store_all(r0_y, c0_y - r1_y);                
+	store_all(r0_z, c0_z - r1_y);                
+	
+	store_all(r0_x, r0_x * r0_x);
+	store_all(r0_y, r0_y * r0_y);
+	store_all(r0_z, r0_z * r0_z);
+
+	store_all(r1_x, vfma(c2_x, r0_x, c2_z));           
+	store_all(r1_y, vfma(c2_y, r0_y, c2_w));           
+	store_all(r1_z, vfma(c2_x, r0_z, c2_z));           
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c3_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_x));
+		
+	store_all(r1_x, vfma(r1_x, r0_x, c3_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_z));
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c4_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_x));
+
+	store_all(r1_x, vfma(r1_x, r0_x, c4_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_z));
+
+	store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));
+
+	return r0_x;
+}
+
+// positive values only
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
+{
+	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
+	const int mag = 0x7EF311C3;
+	const float fMinThresh = .0000125f;
+
+	vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));
+
+	vint x_l = vint(mag) - cast_vfloat_to_vint(l);
+	
+	vfloat rcp_l = cast_vint_to_vfloat(x_l);
+	
+	return rcp_l * vfnma(rcp_l, q, 2.0f);
+}
+
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
+{
+	//const int mag = 0x7EF312AC; // 2 NR iters, 3 is  0x7EEEEBB3
+	const int mag = 0x7EF311C3;
+	const float fMinThresh = .0000125f;
+
+	vfloat s = sign(t);
+	vfloat q = abs(t);
+
+	vfloat l = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));
+
+	vint x_l = vint(mag) - cast_vfloat_to_vint(l);
+
+	vfloat rcp_l = cast_vint_to_vfloat(x_l);
+
+	return rcp_l * vfnma(rcp_l, q, 2.0f) * s;
+}
+
+// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
+// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
+{
+	vfloat xhalf = 0.5f * x0;
+	vfloat x = cast_vint_to_vfloat(vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
+	return x * vfnma(xhalf * x, x, 1.5008909f);
+}
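+
+// Scalar form of the trick above (illustrative sketch only): a magic-constant seed derived from the
+// exponent bits, followed by one Newton-Raphson step r' = r * (1.5 - 0.5*x*r*r). The 1.5008909f
+// constant appears to be a tuned variant of the textbook 1.5 that lowers the maximum relative error.
+//
+//   static inline float rsqrt_est1_scalar(float x)   // x must be > 0
+//   {
+//      float xhalf = 0.5f * x;
+//      uint32_t u; memcpy(&u, &x, 4);
+//      u = 0x5F375A82u - (u >> 1);                   // initial estimate
+//      float r; memcpy(&r, &u, 4);
+//      return r * (1.5008909f - xhalf * r * r);      // one Newton-Raphson refinement
+//   }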
+
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
+{
+	vfloat xhalf = 0.5f * x0;
+	vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
+	vfloat x1 = x * vfnma(xhalf * x, x, 1.5);
+	vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5);
+	return x2;
+}
+
+// Math from: http://developer.download.nvidia.com/cg/atan2.html
+// TODO: Needs more validation, parameter checking.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
+{
+	vfloat t1 = abs(y);
+	vfloat t3 = abs(x);
+	
+	vfloat t0 = max(t3, t1);
+	store_all(t1, min(t3, t1));
+
+	store_all(t3, t1 / t0);
+	
+	vfloat t4 = t3 * t3;
+	store_all(t0, vfma(-0.013480470f, t4, 0.057477314f));
+	store_all(t0, vfms(t0, t4, 0.121239071f));
+	store_all(t0, vfma(t0, t4, 0.195635925f));
+	store_all(t0, vfms(t0, t4, 0.332994597f));
+	store_all(t0, vfma(t0, t4, 0.999995630f));
+	store_all(t3, t0 * t3);
+
+	store_all(t3, spmd_ternaryf(abs(y) > abs(x), vfloat(1.570796327f) - t3, t3));
+
+	store_all(t3, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - t3, t3));
+	store_all(t3, spmd_ternaryf(y < 0.0f, -t3, t3));
+
+	return t3;
+}
+
+/*
+    clang 9.0.0 for win /fp:precise release
+	Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
+	Skipped angles near 90/270 within +- .001 radians.
+	Near-zero threshold: .0000125f
+	Near-zero output above check threshold: 1e-6f
+
+	Total near-zero: 144, output above near-zero thresh: 20
+	Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16766400
+	Total sign diffs: 5
+	max abs err: 1.4982600811139264
+	max rel err: 0.1459155900188041
+	avg rel err: 0.0000054659502568
+
+	XMVectorTan() precise:
+	Total near-zero: 144, output above near-zero tresh: 18
+	Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
+	Total near-zero sign diffs: 0
+	Total passed near-zero check: 16766400
+	Total sign diffs: 0
+	max abs err: 1.9883573246424930
+	max rel err: 0.1459724171926864
+	avg rel err: 0.0000054965766843
+
+	std::tanf():
+	Total near-zero: 144, output above near-zero tresh: 0
+	Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
+	Total near-zero sign diffs: 11
+	Total passed near-zero check: 16766400
+	Total sign diffs: 11
+	max abs err: 0.8989131818294709
+	max rel err: 0.0573181403173166
+	avg rel err: 0.0000030791301203
+	
+	Originally from:
+	http://www.ganssle.com/approx.htm
+*/
+
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
+{
+	// Original double version was 8.2 digits
+	// Original double version was accurate to ~8.2 digits
+	// Tuned float constants for lower avg rel error (without using FMA3):
+	const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
+	vfloat x2 = x * x;
+	return (x * (vfma(c2, x2, c1)) / (vfma(x2, (c4 + x2), c3)));
+}
+
+// Don't call this for angles close to 90/270!
+inline vfloat spmd_kernel::tan_est(vfloat x)
+{
+	const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
+	CPPSPMD_DECL(const uint8_t, s_table0[16]) =	{ 128 + 0, 128 + 2, 128 + -2, 128 + 4,    128 + 0, 128 + 2, 128 + -2, 128 + 4,	  128 + 0, 128 + 2, 128 + -2, 128 + 4,   128 + 0, 128 + 2, 128 + -2, 128 + 4 };
+
+	vint table = init_lookup4(s_table0); // a load
+	vint sgn = cast_vfloat_to_vint(x) & 0x80000000;
+
+	store_all(x, abs(x));
+	vfloat orig_x = x;
+
+	vfloat q = x * fOneOverPi;
+	store_all(x, q - floor(q));
+
+	vfloat x4 = x * 4.0f;
+	vint octant = (vint)(x4);
+
+	vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);
+
+	vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle
+
+	vfloat bias = (vfloat)k + -128.0f;
+	vfloat y = x0 + bias;
+
+	vfloat z = tan82(y);
+
+	vfloat r;
+	
+	vbool octant_one_or_two = (octant == 1) || (octant == 2);
+
+	// SPMD optimization - skip costly divide if we can
+	if (spmd_any(octant_one_or_two))
+	{
+		const float fDivThresh = .4371e-7f;
+		vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));
+				
+		vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
+		store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
+	}
+	else
+	{
+		store_all(r, spmd_ternaryf(octant == 0, z, -z));
+	}
+		
+	// Small angle approximation, to decrease the max rel error near Pi.
+	SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
+	{
+		store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
+	}
+	SPMD_SENDIF
+
+	return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
+}
+
+inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
+{ 
+	store(x.a, 0xf1ea5eed); 
+	store(x.b, seed ^ 0xd8487b1f); 
+	store(x.c, seed ^ 0xdbadef9a); 
+	store(x.d, seed); 
+	for (int i = 0; i < 20; ++i) 
+		(void)get_randu(x); 
+}
+
+// https://burtleburtle.net/bob/rand/smallprng.html
+// Returns 32-bit unsigned random numbers.
+inline vint spmd_kernel::get_randu(rand_context& x)
+{ 
+	vint e = x.a - VINT_ROT(x.b, 27); 
+	store(x.a, x.b ^ VINT_ROT(x.c, 17)); 
+	store(x.b, x.c + x.d); 
+	store(x.c, x.d + e); 
+	store(x.d, e + x.a);	
+	return x.d; 
+}
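+
+// Scalar reference (Bob Jenkins' public-domain "jsf32" generator; the vector code above performs
+// the same update independently in each lane):
+//
+//   static inline uint32_t jsf32(uint32_t s[4])      // s = { a, b, c, d }
+//   {
+//      uint32_t rot27 = (s[1] << 27) | (s[1] >> 5);
+//      uint32_t rot17 = (s[2] << 17) | (s[2] >> 15);
+//      uint32_t e = s[0] - rot27;
+//      s[0] = s[1] ^ rot17;
+//      s[1] = s[2] + s[3];
+//      s[2] = s[3] + e;
+//      s[3] = e + s[0];
+//      return s[3];
+//   }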
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
+{
+	vint rnd = get_randu(x);
+
+	vint range = high - low;
+
+	vint rnd_range = mulhiu(rnd, range);
+	
+	return spmd_ternaryi(low < high, low + rnd_range, low);
+}
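+
+// The range mapping above takes the high 32 bits of the 32x32-bit product, i.e.
+// mulhiu(rnd, range) == ((uint64_t)rnd * range) >> 32, which maps a uniform 32-bit value into
+// [0, range) without a divide. (This is the multiply-shift reduction without a rejection step,
+// so there is a tiny bias for ranges that don't evenly divide 2^32.)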
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
+{
+	vint rndi = get_randu(x) & 0x7fffff;
+
+	vfloat rnd = (vfloat)(rndi) * (1.0f / 8388608.0f);
+
+	return spmd_ternaryf(low < high, vfma(high - low, rnd, low), low);
+}
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
+{
+	const uint8_t tab1_bytes[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+	const uint8_t tab2_bytes[16] = { 0, 8 << 4, 4 << 4, 12 << 4, 2 << 4, 10 << 4, 6 << 4, 14 << 4, 1 << 4, 9 << 4, 5 << 4, 13 << 4, 3 << 4, 11 << 4, 7 << 4, 15 << 4 };
+	store_all(tab1, init_lookup4(tab1_bytes));
+	store_all(tab2, init_lookup4(tab2_bytes));
+}
+
+CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
+{
+	vint r0 = table_lookup4_8(k & 0x7F7F7F7F, tab2);
+	vint r1 = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
+	vint r3 = r0 | r1;
+	return byteswap(r3);
+}
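+
+// How this works: tab1 maps a nibble to its bit-reverse, tab2 to its bit-reverse shifted into the
+// high nibble, so the two lookups together bit-reverse each byte (swapping its nibbles in the
+// process); byteswap() then reverses the byte order, yielding the full 32-bit reversal. The
+// 0x7F7F7F7F masks keep bit 7 of every byte clear so the pshufb-style lookup never zeroes a byte.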
+
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
+{
+	CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+	vint tab = init_lookup4(s_tab);
+
+	//x <= 0x0000ffff
+	vbool c0 = (x & 0xFFFF0000) == 0;
+	vint n0 = spmd_ternaryi(c0, 16, 0);
+	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);
+
+	//x <= 0x00ffffff
+	vbool c1 = (x0 & 0xFF000000) == 0;
+	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
+	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);
+
+	//x <= 0x0fffffff
+	vbool c2 = (x1 & 0xF0000000) == 0;
+	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
+	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);
+
+	return table_lookup4_8(VUINT_SHIFT_RIGHT(x2, 28), tab) + n2;
+}
+
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
+{
+	//x <= 0x0000ffff
+	vbool c0 = (x & 0xFFFF0000) == 0;
+	vint n0 = spmd_ternaryi(c0, 16, 0);
+	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);
+
+	//x <= 0x00ffffff
+	vbool c1 = (x0 & 0xFF000000) == 0;
+	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
+	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);
+
+	//x <= 0x0fffffff
+	vbool c2 = (x1 & 0xF0000000) == 0;
+	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
+	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);
+
+	// x <= 0x3fffffff
+	vbool c3 = (x2 & 0xC0000000) == 0;
+	vint n3 = spmd_ternaryi(c3, n2 + 2, n2);
+	vint x3 = spmd_ternaryi(c3, VINT_SHIFT_LEFT(x2, 2), x2);
+
+	// x <= 0x7fffffff
+	vbool c4 = (x3 & 0x80000000) == 0;
+	return spmd_ternaryi(c4, n3 + 1, n3);
+}
+
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
+{
+	// cast the least significant bit in x to a float
+	vfloat f = (vfloat)(x & -x);
+	
+	// extract exponent and adjust
+	return VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) - 0x7F;
+}
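+
+// Worked example: for x = 40 (0b101000), x & -x isolates the lowest set bit (8 = 2^3); as a float,
+// 8.0f has a biased exponent of 130 (0x82), and 130 - 127 = 3 trailing zeros. The conversion is
+// exact because an isolated bit is always a power of two. (x == 0 converts to 0.0f, whose exponent
+// field is 0, so the result for zero input is -127.)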
+
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
+{
+	vint v = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);                    
+	vint v1 = (v & 0x33333333) + (VUINT_SHIFT_RIGHT(v, 2) & 0x33333333);     
+	return VUINT_SHIFT_RIGHT(((v1 + VUINT_SHIFT_RIGHT(v1, 4) & 0xF0F0F0F) * 0x1010101), 24);
+}
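+
+// This is the classic SWAR popcount (Hacker's Delight): sum bit pairs, then nibbles, then use the
+// 0x01010101 multiply to accumulate the per-byte counts into the top byte. Scalar equivalent
+// (illustrative only):
+//
+//   static inline uint32_t popcount32(uint32_t v)
+//   {
+//      v = v - ((v >> 1) & 0x55555555u);
+//      v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
+//      return (((v + (v >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24;
+//   }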
+
+CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b) 
+{ 
+	return cmpeq_epi16(subs_epu16(a, b), vint(0)); 
+}
+
+CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b) 
+{ 
+	return cmple_epu16(b, a);
+}
+
+CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
+{
+	return andnot(cmpeq_epi16(a, b), cmple_epu16(b, a));
+}
+
+CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
+{
+	return cmpgt_epu16(b, a);
+}
+
+CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
+{
+	return cmpeq_epi16(a, b) | cmpgt_epi16(a, b);
+}
+
+CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
+{
+	return cmpge_epi16(b, a);
+}
+
+void spmd_kernel::print_vint(vint v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("%i ", extract(v, i)); 
+	printf("\n"); 
+}
+
+void spmd_kernel::print_vbool(vbool v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("%i ", extract(v, i) ? 1 : 0); 
+	printf("\n"); 
+}
+	
+void spmd_kernel::print_vint_hex(vint v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("0x%X ", extract(v, i)); 
+	printf("\n"); 
+}
+
+void spmd_kernel::print_active_lanes(const char *pPrefix) 
+{ 
+	CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
+	memset(flags, 0, sizeof(flags));
+	storeu_linear(flags, vint(1));
+
+	if (pPrefix)
+		printf("%s", pPrefix);
+
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+	{
+		if (flags[i])
+			printf("%u ", i);
+	}
+	printf("\n");
+}
+	
+void spmd_kernel::print_vfloat(vfloat v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+		printf("%f ", extract(v, i)); 
+	printf("\n"); 
+}
diff --git a/encoder/cppspmd_math_declares.h b/encoder/cppspmd_math_declares.h
new file mode 100644
index 0000000..cdb6447
--- /dev/null
+++ b/encoder/cppspmd_math_declares.h
@@ -0,0 +1,89 @@
+// Do not include this header directly.
+// This header declares the math helpers shared by each struct spmd_kernel implementation.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// See cppspmd_math.h for detailed error statistics.
+
+CPPSPMD_FORCE_INLINE void reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment);
+CPPSPMD_FORCE_INLINE vfloat tan56(vfloat x);
+CPPSPMD_FORCE_INLINE vfloat tan82(vfloat x);
+
+inline vfloat log2_est(vfloat v);
+
+inline vfloat log_est(vfloat v);
+
+inline vfloat exp2_est(vfloat arg);
+
+inline vfloat exp_est(vfloat arg);
+
+inline vfloat pow_est(vfloat arg1, vfloat arg2);
+
+CPPSPMD_FORCE_INLINE vfloat recip_est1(const vfloat& q);
+CPPSPMD_FORCE_INLINE vfloat recip_est1_pn(const vfloat& q);
+
+inline vfloat mod_angles(vfloat a);
+
+inline vfloat sincos_est_a(vfloat a, bool sin_flag);
+CPPSPMD_FORCE_INLINE vfloat sin_est_a(vfloat a) { return sincos_est_a(a, true); }
+CPPSPMD_FORCE_INLINE vfloat cos_est_a(vfloat a) { return sincos_est_a(a, false); }
+
+inline vfloat sin_est(vfloat a);
+
+inline vfloat cos_est(vfloat a);
+
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est1(vfloat x0);
+
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est2(vfloat x0);
+
+CPPSPMD_FORCE_INLINE vfloat atan2_est(vfloat y, vfloat x);
+
+CPPSPMD_FORCE_INLINE vfloat atan_est(vfloat x) { return atan2_est(x, vfloat(1.0f)); }
+
+// Don't call this for angles close to 90/270! 
+inline vfloat tan_est(vfloat x);
+
+// https://burtleburtle.net/bob/rand/smallprng.html
+struct rand_context { vint a, b, c, d; };
+
+inline void seed_rand(rand_context& x, vint seed);
+
+// Returns 32-bit unsigned random numbers.
+inline vint get_randu(rand_context& x);
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vint get_randi(rand_context& x, vint low, vint high);
+
+// Returns random numbers between [low, high), or low if low >= high
+inline vfloat get_randf(rand_context& x, vfloat low, vfloat high);
+
+CPPSPMD_FORCE_INLINE void init_reverse_bits(vint& tab1, vint& tab2);
+CPPSPMD_FORCE_INLINE vint reverse_bits(vint k, vint tab1, vint tab2);
+
+CPPSPMD_FORCE_INLINE vint count_leading_zeros(vint x);
+CPPSPMD_FORCE_INLINE vint count_leading_zeros_alt(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_trailing_zeros(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_set_bits(vint x);
+
+void print_vint(vint v);
+void print_vbool(vbool v);
+void print_vint_hex(vint v);
+void print_active_lanes(const char *pPrefix);
+void print_vfloat(vfloat v);
+
diff --git a/encoder/cppspmd_sse.h b/encoder/cppspmd_sse.h
new file mode 100644
index 0000000..b39cb82
--- /dev/null
+++ b/encoder/cppspmd_sse.h
@@ -0,0 +1,2118 @@
+// cppspmd_sse.h
+// Note for Basis Universal: All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. If BASISU_SUPPORT_SSE is 0, they will never be included and do not impact compilation.
+// SSE 2 or 4.1
+// Originally written by Nicolas Guillemot, Jefferson Amstutz in the "CppSPMD" project.
+// 4/20: Richard Geldreich: Macro control flow, more SIMD instruction sets, optimizations, supports using multiple SIMD instruction sets in the same executable. Still a work in progress!
+//
+// Originally Copyright 2016 Nicolas Guillemot
+// Changed from the MIT license to Apache 2.0 with permission from the author.
+//
+// Modifications/enhancements Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+#include <utility>
+#include <algorithm>
+
+#if CPPSPMD_SSE2
+#include <xmmintrin.h>		// SSE
+#include <emmintrin.h>		// SSE2
+#else
+#include <xmmintrin.h>		// SSE
+#include <emmintrin.h>		// SSE2
+#include <pmmintrin.h>		// SSE3
+#include <tmmintrin.h>		// SSSE3
+#include <smmintrin.h>		// SSE4.1
+//#include <nmmintrin.h>		// SSE4.2
+#endif
+
+#undef CPPSPMD_SSE
+#undef CPPSPMD_AVX1
+#undef CPPSPMD_AVX2
+#undef CPPSPMD_AVX
+#undef CPPSPMD_FLOAT4
+#undef CPPSPMD_INT16
+
+#define CPPSPMD_SSE 1
+#define CPPSPMD_AVX 0
+#define CPPSPMD_AVX1 0
+#define CPPSPMD_AVX2 0
+#define CPPSPMD_FLOAT4 0
+#define CPPSPMD_INT16 0
+
+#ifdef _MSC_VER
+	#ifndef CPPSPMD_DECL
+	#define CPPSPMD_DECL(type, name) __declspec(align(16)) type name
+	#endif
+
+	#ifndef CPPSPMD_ALIGN
+	#define CPPSPMD_ALIGN(v) __declspec(align(v))
+	#endif
+
+	#define _mm_undefined_si128 _mm_setzero_si128
+	#define _mm_undefined_ps _mm_setzero_ps
+#else
+	#ifndef CPPSPMD_DECL
+	#define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32)))
+	#endif
+
+	#ifndef CPPSPMD_ALIGN
+	#define CPPSPMD_ALIGN(v) __attribute__((aligned(v)))
+	#endif
+#endif
+
+#ifndef CPPSPMD_FORCE_INLINE
+#ifdef _DEBUG
+#define CPPSPMD_FORCE_INLINE inline
+#else
+	#ifdef _MSC_VER
+		#define CPPSPMD_FORCE_INLINE __forceinline
+	#else
+		#define CPPSPMD_FORCE_INLINE inline
+	#endif
+#endif
+#endif
+
+#undef CPPSPMD
+#undef CPPSPMD_ARCH
+
+#if CPPSPMD_SSE2
+	#define CPPSPMD_SSE41 0
+	#define CPPSPMD cppspmd_sse2
+	#define CPPSPMD_ARCH _sse2
+#else
+	#define CPPSPMD_SSE41 1
+	#define CPPSPMD cppspmd_sse41
+	#define CPPSPMD_ARCH _sse41
+#endif
+
+#ifndef CPPSPMD_GLUER
+	#define CPPSPMD_GLUER(a, b) a##b
+#endif
+
+#ifndef CPPSPMD_GLUER2
+	#define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b)
+#endif
+
+#ifndef CPPSPMD_NAME
+#define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH)
+#endif
+
+#undef VASSERT
+#define VCOND(cond) ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask())
+#define VASSERT(cond) assert( VCOND(cond) )
+
+#define CPPSPMD_ALIGNMENT (16)
+
+#define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))
+
+namespace CPPSPMD
+{
+
+const int PROGRAM_COUNT_SHIFT = 2;
+const int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT;
+
+template <typename N> inline N* aligned_new() { void* p = _mm_malloc(sizeof(N), 64); new (p) N;	return static_cast<N*>(p); }
+template <typename N> void aligned_delete(N* p) { if (p) { p->~N(); _mm_free(p); } }
+
+CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
+CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 };
+CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f };
+CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 };
+
+CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) = 
+{ 
+	{ UINT32_MAX, 0, 0, 0 },
+	{ 0, UINT32_MAX, 0, 0 },
+	{ 0, 0, UINT32_MAX, 0 },
+	{ 0, 0, 0, UINT32_MAX },
+};
+
+#if CPPSPMD_SSE41
+CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c) { return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(c))); }
+#endif
+
+CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask)
+{
+#if CPPSPMD_SSE2
+	return _mm_castps_si128(_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(b)), _mm_andnot_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(a))));
+#else
+	return _mm_blendv_epi8(a, b, mask);
+#endif
+}
+
+CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask)
+{
+#if CPPSPMD_SSE2
+	// We know it's a mask, so we can just emulate the blend.
+	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+#else
+	return _mm_blendv_ps(a, b, mask);
+#endif
+}
+
+CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
+{
+#if CPPSPMD_SSE2
+	// Input is not a mask, but MSB bits - so emulate _mm_blendv_ps() by replicating bit 31.
+	mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mask), 31));
+	return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
+#else
+	return _mm_blendv_ps(a, b, mask);
+#endif
+}
+
+CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask)
+{
+	return _mm_castps_si128(blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
+}
+
+CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask)
+{
+	return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
+}
+
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); }
+CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); }
+CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); }
+CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); }
+
+// Returns float bits as int, to emulate _mm_extract_ps()
+CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { float f = _mm_cvtss_f32(vec); return *(const int*)&f;  }
+CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); return *(const int*)&f; }
+CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); return *(const int*)&f; }
+CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { float f = _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); return *(const int*)&f; }
+
+// Returns floats
+CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
+CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
+CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
+CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
+#else
+CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); }
+CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); }
+CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); }
+CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); }
+
+// Returns float bits as int
+CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); }
+CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_extract_ps(vec, 1); }
+CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); }
+CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); }
+
+// Returns floats
+CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { int v = extract_ps_x(vec); return *(const float*)&v; }
+CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { int v = extract_ps_y(vec); return *(const float*)&v; }
+CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { int v = extract_ps_z(vec); return *(const float*)&v; }
+CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { int v = extract_ps_w(vec); return *(const float*)&v; }
+#endif
+
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 0), (uint32_t)v >> 16U, 1); }
+CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 2), (uint32_t)v >> 16U, 3); }
+CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 4), (uint32_t)v >> 16U, 5); }
+CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 6), (uint32_t)v >> 16U, 7); }
+#else
+CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); }
+CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); }
+CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); }
+CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); }
+#endif
+
+#if CPPSPMD_SSE2
+inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
+{
+	// Just emulate _mm_shuffle_epi8. This is very slow, but what else can we do?
+	CPPSPMD_ALIGN(16) uint8_t av[16];
+	_mm_store_si128((__m128i*)av, a);
+		
+	CPPSPMD_ALIGN(16) uint8_t bvi[16];
+	_mm_store_ps((float*)bvi, _mm_and_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(_mm_set1_epi32(0x0F0F0F0F))));
+
+	CPPSPMD_ALIGN(16) uint8_t result[16];
+
+	result[0] = av[bvi[0]];
+	result[1] = av[bvi[1]];
+	result[2] = av[bvi[2]];
+	result[3] = av[bvi[3]];
+	
+	result[4] = av[bvi[4]];
+	result[5] = av[bvi[5]];
+	result[6] = av[bvi[6]];
+	result[7] = av[bvi[7]];
+
+	result[8] = av[bvi[8]];
+	result[9] = av[bvi[9]];
+	result[10] = av[bvi[10]];
+	result[11] = av[bvi[11]];
+
+	result[12] = av[bvi[12]];
+	result[13] = av[bvi[13]];
+	result[14] = av[bvi[14]];
+	result[15] = av[bvi[15]];
+
+	return _mm_andnot_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), _mm_load_si128((__m128i*)result));
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b) 
+{ 
+	return _mm_shuffle_epi8(a, b); 
+}
+#endif
+
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
+{
+	return blendv_mask_epi32(b, a, _mm_cmplt_epi32(a, b));
+}
+CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
+{
+	return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(a, b));
+}
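+// Unsigned min/max are emulated by biasing both operands by 0x80000000, which maps unsigned ordering onto signed ordering.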
+CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
+{
+	__m128i n = _mm_set1_epi32(0x80000000);
+	__m128i ac = _mm_add_epi32(a, n);
+	__m128i bc = _mm_add_epi32(b, n);
+	return blendv_mask_epi32(b, a, _mm_cmplt_epi32(ac, bc));
+}
+CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
+{
+	__m128i n = _mm_set1_epi32(0x80000000);
+	__m128i ac = _mm_add_epi32(a, n);
+	__m128i bc = _mm_add_epi32(b, n);
+	return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(ac, bc));
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
+{
+	return _mm_min_epi32(a, b);
+}
+CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
+{
+	return _mm_max_epi32(a, b);
+}
+CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
+{
+	return _mm_min_epu32(a, b);
+}
+CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
+{
+	return _mm_max_epu32(a, b);
+}
+#endif
+
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
+{
+	__m128i sign_mask = _mm_srai_epi32(a, 31);
+	return _mm_sub_epi32(_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(sign_mask))), sign_mask);
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
+{
+	return _mm_abs_epi32(a);
+}
+#endif
+
+#if CPPSPMD_SSE2
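+// 32x32->32 multiply emulated with two _mm_mul_epu32's (even lanes and odd lanes); the low 32 bits of each 64-bit product are then interleaved back together.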
+CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
+{
+	__m128i tmp1 = _mm_mul_epu32(a, b);
+	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
+{
+	return _mm_mullo_epi32(a, b);
+}
+#endif
+
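+// Returns the high 32 bits of each unsigned 32x32 lane product, built from two _mm_mul_epu32's on the even and odd lanes.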
+CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b)
+{
+	__m128i tmp1 = _mm_mul_epu32(a, b);
+	__m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+	return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 3, 1)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 3, 1)));
+}
+
+#if CPPSPMD_SSE2
+inline __m128i load_rgba32(const void* p)
+{
+	__m128i xmm = _mm_cvtsi32_si128(*(const int*)p);
+	xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
+	xmm = _mm_unpacklo_epi16(xmm, _mm_setzero_si128());
+	return xmm;
+}
+#else
+inline __m128i load_rgba32(const void* p)
+{
+	return _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((const float*)p)));
+}
+#endif
+
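+// Transposes a 4x4 matrix of int32's: x, y, z and w receive the columns of the matrix whose rows are r0..r3.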
+inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3)
+{
+	__m128i t0 = _mm_unpacklo_epi32(r0, r1);
+	__m128i t1 = _mm_unpacklo_epi32(r2, r3);
+	__m128i t2 = _mm_unpackhi_epi32(r0, r1);
+	__m128i t3 = _mm_unpackhi_epi32(r2, r3);
+	x = _mm_unpacklo_epi64(t0, t1);
+	y = _mm_unpackhi_epi64(t0, t1);
+	z = _mm_unpacklo_epi64(t2, t3);
+	w = _mm_unpackhi_epi64(t2, t3);
+}
+
+const uint32_t ALL_ON_MOVEMASK = 0xF;
+
+struct spmd_kernel
+{
+	struct vint;
+	struct lint;
+	struct vbool;
+	struct vfloat;
+
+	typedef int int_t;
+	typedef vint vint_t;
+	typedef lint lint_t;
+		
+	// Exec mask
+	struct exec_mask
+	{
+		__m128i m_mask;
+
+		exec_mask() = default;
+
+		CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b);
+		CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { }
+
+		CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane) { m_mask = _mm_load_si128((const __m128i *)&g_lane_masks_128[lane][0]); }
+				
+		static CPPSPMD_FORCE_INLINE exec_mask all_on()	{ return exec_mask{ _mm_load_si128((const __m128i*)g_allones_128) };	}
+		static CPPSPMD_FORCE_INLINE exec_mask all_off() { return exec_mask{ _mm_setzero_si128() }; }
+
+		CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(m_mask)); }
+	};
+
+	friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e);
+	friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e);
+
+	CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); }
+	CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); }
+	CPPSPMD_FORCE_INLINE bool spmd_none() { return !any(m_exec); }
+
+	// true if cond is true for all active lanes - false if no active lanes
+	CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); }
+	// true if cond is true for any active lanes
+	CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) { return (exec_mask(e) & m_exec).get_movemask() != 0; }
+	CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) { return !spmd_any(e); }
+
+	friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b);
+	friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b);
+	friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b);
+		
+	exec_mask m_exec;
+	exec_mask m_kernel_exec;
+	exec_mask m_continue_mask;
+#ifdef _DEBUG
+	bool m_in_loop;
+#endif
+		
+	CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); }
+		
+	void init(const exec_mask& kernel_exec);
+	
+	// Varying bool
+		
+	struct vbool
+	{
+		__m128i m_value;
+
+		vbool() = default;
+
+		CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(_mm_set1_epi32(value ? UINT32_MAX : 0)) { }
+
+		CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { }
+
+		CPPSPMD_FORCE_INLINE explicit operator vfloat() const;
+		CPPSPMD_FORCE_INLINE explicit operator vint() const;
+								
+	private:
+		vbool& operator=(const vbool&);
+	};
+
+	friend vbool operator!(const vbool& v);
+		
+	CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src)
+	{
+		dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
+		return dst;
+	}
+		
+	CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+	
+	// Varying float
+	struct vfloat
+	{
+		__m128 m_value;
+
+		vfloat() = default;
+
+		CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { }
+
+		CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { }
+
+		CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { }
+
+	private:
+		vfloat& operator=(const vfloat&);
+	};
+
+	CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
+	{
+		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src)
+	{
+		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
+		return dst;
+	}
+	
+	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+
+	// Linear ref to floats
+	struct float_lref
+	{
+		float* m_pValue;
+
+	private:
+		float_lref& operator=(const float_lref&);
+	};
+
+	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (mask == ALL_ON_MOVEMASK)
+			_mm_storeu_ps(dst.m_pValue, src.m_value);
+		else
+			_mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (mask == ALL_ON_MOVEMASK)
+			_mm_storeu_ps(dst.m_pValue, src.m_value);
+		else
+			_mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
+		return dst;
+	}
+	
+	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src)
+	{
+		_mm_storeu_ps(dst.m_pValue, src.m_value);
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src)
+	{
+		_mm_storeu_ps(dst.m_pValue, src.m_value);
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src)
+	{
+		return vfloat{ _mm_and_ps(_mm_loadu_ps(src.m_pValue), _mm_castsi128_ps(m_exec.m_mask)) };
+	}
+		
+	// Varying ref to floats
+	struct float_vref
+	{
+		__m128i m_vindex;
+		float* m_pValue;
+		
+	private:
+		float_vref& operator=(const float_vref&);
+	};
+
+	// Varying ref to varying float
+	struct vfloat_vref
+	{
+		__m128i m_vindex;
+		vfloat* m_pValue;
+		
+	private:
+		vfloat_vref& operator=(const vfloat_vref&);
+	};
+
+	// Varying ref to varying int
+	struct vint_vref
+	{
+		__m128i m_vindex;
+		vint* m_pValue;
+		
+	private:
+		vint_vref& operator=(const vint_vref&);
+	};
+
+	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
+	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src);
+		
+	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src);
+	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src);
+
+	CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int vindex[4];
+		_mm_store_si128((__m128i *)vindex, src.m_vindex);
+
+		CPPSPMD_ALIGN(16) float loaded[4];
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				loaded[i] = src.m_pValue[vindex[i]];
+		}
+		return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) };
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int vindex[4];
+		_mm_store_si128((__m128i *)vindex, src.m_vindex);
+
+		CPPSPMD_ALIGN(16) float loaded[4];
+
+		for (int i = 0; i < 4; i++)
+			loaded[i] = src.m_pValue[vindex[i]];
+		return vfloat{ _mm_load_ps((const float*)loaded) };
+	}
+
+	// Linear ref to ints
+	struct int_lref
+	{
+		int* m_pValue;
+
+	private:
+		int_lref& operator=(const int_lref&);
+	};
+		
+	CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (mask == ALL_ON_MOVEMASK)
+		{
+			_mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value);
+		}
+		else
+		{
+			CPPSPMD_ALIGN(16) int stored[4];
+			_mm_store_si128((__m128i *)stored, src.m_value);
+
+			for (int i = 0; i < 4; i++)
+			{
+				if (mask & (1 << i))
+					dst.m_pValue[i] = stored[i];
+			}
+		}
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vint load(const int_lref& src)
+	{
+		__m128i v = _mm_loadu_si128((const __m128i*)src.m_pValue);
+
+		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
+
+		return vint{ v };
+	}
+
+	// Linear ref to int16's
+	struct int16_lref
+	{
+		int16_t* m_pValue;
+
+	private:
+		int16_lref& operator=(const int16_lref&);
+	};
+
+	CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
+	{
+		CPPSPMD_ALIGN(16) int stored[4];
+		_mm_store_si128((__m128i *)stored, src.m_value);
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
+		}
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src)
+	{
+		CPPSPMD_ALIGN(16) int stored[4];
+		_mm_store_si128((__m128i *)stored, src.m_value);
+
+		for (int i = 0; i < 4; i++)
+			dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
+		return dst;
+	}
+		
+	CPPSPMD_FORCE_INLINE vint load(const int16_lref& src)
+	{
+		CPPSPMD_ALIGN(16) int values[4];
+
+		for (int i = 0; i < 4; i++)
+			values[i] = static_cast<int16_t>(src.m_pValue[i]);
+
+		__m128i t = _mm_load_si128( (const __m128i *)values );
+
+		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps( t ), _mm_castsi128_ps(m_exec.m_mask))) };
+	}
+
+	CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src)
+	{
+		CPPSPMD_ALIGN(16) int values[4];
+
+		for (int i = 0; i < 4; i++)
+			values[i] = static_cast<int16_t>(src.m_pValue[i]);
+
+		__m128i t = _mm_load_si128( (const __m128i *)values );
+
+		return vint{ t };
+	}
+		
+	// Linear ref to constant ints
+	struct cint_lref
+	{
+		const int* m_pValue;
+
+	private:
+		cint_lref& operator=(const cint_lref&);
+	};
+
+	CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
+	{
+		__m128i v = _mm_loadu_si128((const __m128i *)src.m_pValue);
+		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
+		return vint{ v };
+	}
+
+	CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src)
+	{
+		return vint{ _mm_loadu_si128((const __m128i *)src.m_pValue) };
+	}
+	
+	// Varying ref to ints
+	struct int_vref
+	{
+		__m128i m_vindex;
+		int* m_pValue;
+
+	private:
+		int_vref& operator=(const int_vref&);
+	};
+
+	// Varying ref to constant ints
+	struct cint_vref
+	{
+		__m128i m_vindex;
+		const int* m_pValue;
+
+	private:
+		cint_vref& operator=(const cint_vref&);
+	};
+
+	// Varying int
+	struct vint
+	{
+		__m128i m_value;
+
+		vint() = default;
+
+		CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value)	{ }
+
+		CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { }
+
+		CPPSPMD_FORCE_INLINE vint& operator=(const lint& other) { m_value = other.m_value; return *this; }
+
+		CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { }
+
+		CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32((int)value))	{ }
+
+		CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { }
+
+		CPPSPMD_FORCE_INLINE explicit operator vbool() const 
+		{
+			return vbool{ _mm_xor_si128( _mm_load_si128((const __m128i*)g_allones_128), _mm_cmpeq_epi32(m_value, _mm_setzero_si128())) };
+		}
+
+		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
+		{
+			return vfloat{ _mm_cvtepi32_ps(m_value) };
+		}
+
+		CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const
+		{
+			return int_vref{ m_value, ptr };
+		}
+
+		CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const
+		{
+			return cint_vref{ m_value, ptr };
+		}
+
+		CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const
+		{
+			return float_vref{ m_value, ptr };
+		}
+
+		CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const
+		{
+			return vfloat_vref{ m_value, ptr };
+		}
+
+		CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const
+		{
+			return vint_vref{ m_value, ptr };
+		}
+
+	private:
+		vint& operator=(const vint&);
+	};
+
+	// Load/store linear int
+	CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (mask == ALL_ON_MOVEMASK)
+			_mm_storeu_si128((__m128i *)pDst, src.m_value);
+		else
+		{
+			if (mask & 1) pDst[0] = extract_x(src.m_value);
+			if (mask & 2) pDst[1] = extract_y(src.m_value);
+			if (mask & 4) pDst[2] = extract_z(src.m_value);
+			if (mask & 8) pDst[3] = extract_w(src.m_value);
+		}
+	}
+
+	CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src)
+	{
+		_mm_storeu_si128((__m128i*)pDst, src.m_value);
+	}
+
+	CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src)
+	{
+		_mm_store_si128((__m128i*)pDst, src.m_value);
+	}
+		
+	CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc)
+	{
+		__m128i v = _mm_loadu_si128((const __m128i*)pSrc);
+
+		v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
+
+		return vint{ v };
+	}
+
+	CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc)
+	{
+		return vint{ _mm_loadu_si128((const __m128i*)pSrc) };
+	}
+
+	CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc)
+	{
+		return vint{ _mm_load_si128((const __m128i*)pSrc) };
+	}
+
+	// Load/store linear float
+	CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (mask == ALL_ON_MOVEMASK)
+			_mm_storeu_ps((float*)pDst, src.m_value);
+		else
+		{
+			int *pDstI = (int *)pDst;
+			if (mask & 1) pDstI[0] = extract_ps_x(src.m_value);
+			if (mask & 2) pDstI[1] = extract_ps_y(src.m_value);
+			if (mask & 4) pDstI[2] = extract_ps_z(src.m_value);
+			if (mask & 8) pDstI[3] = extract_ps_w(src.m_value);
+		}
+	}
+
+	CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src)
+	{
+		_mm_storeu_ps((float*)pDst, src.m_value);
+	}
+
+	CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src)
+	{
+		_mm_store_ps((float*)pDst, src.m_value);
+	}
+		
+	CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc)
+	{
+		__m128 v = _mm_loadu_ps((const float*)pSrc);
+
+		v = _mm_and_ps(v, _mm_castsi128_ps(m_exec.m_mask));
+
+		return vfloat{ v };
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc)
+	{
+		return vfloat{ _mm_loadu_ps((const float*)pSrc) };
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc)
+	{
+		return vfloat{ _mm_load_ps((const float*)pSrc) };
+	}
+	
+	CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src)
+	{
+		dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src)
+	{
+		CPPSPMD_ALIGN(16) int vindex[4];
+		_mm_store_si128((__m128i*)vindex, dst.m_vindex);
+
+		CPPSPMD_ALIGN(16) int stored[4];
+		_mm_store_si128((__m128i*)stored, src.m_value);
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				dst.m_pValue[vindex[i]] = stored[i];
+		}
+		return dst;
+	}
+	
+	CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+				
+	CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src)
+	{
+		CPPSPMD_ALIGN(16) int vindex[4];
+		_mm_store_si128((__m128i*)vindex, dst.m_vindex);
+
+		CPPSPMD_ALIGN(16) int stored[4];
+		_mm_store_si128((__m128i*)stored, src.m_value);
+
+		for (int i = 0; i < 4; i++)
+			dst.m_pValue[vindex[i]] = stored[i];
+
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vint load(const int_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int values[4];
+
+		CPPSPMD_ALIGN(16) int indices[4];
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				values[i] = src.m_pValue[indices[i]];
+		}
+
+		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
+	}
+		
+	CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int values[4];
+
+		CPPSPMD_ALIGN(16) int indices[4];
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		for (int i = 0; i < 4; i++)
+			values[i] = src.m_pValue[indices[i]];
+
+		return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
+	}
+		
+	CPPSPMD_FORCE_INLINE vint load(const cint_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int values[4];
+
+		CPPSPMD_ALIGN(16) int indices[4];
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				values[i] = src.m_pValue[indices[i]];
+		}
+
+		return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
+	}
+		
+	CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int values[4];
+
+		CPPSPMD_ALIGN(16) int indices[4];
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		for (int i = 0; i < 4; i++)
+			values[i] = src.m_pValue[indices[i]];
+
+		return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
+	}
+
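+	// Gathers a 32-bit load from byte offset m_vindex[lane] for all 4 lanes, regardless of the exec mask. The caller must guarantee 4 readable bytes at each offset.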
+	CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src)
+	{
+		__m128i v0_l;
+
+		const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
+		v0_l = insert_x(_mm_undefined_si128(), ((int*)(pSrc + extract_x(src.m_vindex)))[0]);
+		v0_l = insert_y(v0_l, ((int*)(pSrc + extract_y(src.m_vindex)))[0]);
+		v0_l = insert_z(v0_l, ((int*)(pSrc + extract_z(src.m_vindex)))[0]);
+		v0_l = insert_w(v0_l, ((int*)(pSrc + extract_w(src.m_vindex)))[0]);
+
+		return vint{ v0_l };
+	}
+
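+	// Gathers a sign-extended 16-bit load from element offset m_vindex[lane] (byte offset 2 * index) for all 4 lanes.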
+	CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src)
+	{
+		__m128i v0_l;
+
+		const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
+		v0_l = insert_x(_mm_undefined_si128(), ((int16_t*)(pSrc + 2 * extract_x(src.m_vindex)))[0]);
+		v0_l = insert_y(v0_l, ((int16_t*)(pSrc + 2 * extract_y(src.m_vindex)))[0]);
+		v0_l = insert_z(v0_l, ((int16_t*)(pSrc + 2 * extract_z(src.m_vindex)))[0]);
+		v0_l = insert_w(v0_l, ((int16_t*)(pSrc + 2 * extract_w(src.m_vindex)))[0]);
+
+		return vint{ v0_l };
+	}
+
+	CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		
+		if (mask & 1) pDst[0] = extract_x(v.m_value);
+		if (mask & 2) pDst[stride] = extract_y(v.m_value);
+		if (mask & 4) pDst[stride*2] = extract_z(v.m_value);
+		if (mask & 8) pDst[stride*3] = extract_w(v.m_value);
+	}
+
+	CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		if (mask & 1) ((int *)pDstF)[0] = extract_ps_x(v.m_value);
+		if (mask & 2) ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
+		if (mask & 4) ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
+		if (mask & 8) ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
+	}
+
+	CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v)
+	{
+		pDst[0] = extract_x(v.m_value);
+		pDst[stride] = extract_y(v.m_value);
+		pDst[stride*2] = extract_z(v.m_value);
+		pDst[stride*3] = extract_w(v.m_value);
+	}
+
+	CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v)
+	{
+		((int *)pDstF)[0] = extract_ps_x(v.m_value);
+		((int *)pDstF)[stride] = extract_ps_y(v.m_value);
+		((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
+		((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
+	}
+
+	CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+								
+#if CPPSPMD_SSE2
+		CPPSPMD_ALIGN(16) int vals[4] = { 0, 0, 0, 0 };
+		if (mask & 1) vals[0] = pSrc[0];
+		if (mask & 2) vals[1] = pSrc[stride];
+		if (mask & 4) vals[2] = pSrc[stride * 2];
+		if (mask & 8) vals[3] = pSrc[stride * 3];
+		return vint{ _mm_load_si128((__m128i*)vals) };
+#else
+		const float* pSrcF = (const float*)pSrc;
+		__m128 v = _mm_setzero_ps();
+		if (mask & 1) v = _mm_load_ss(pSrcF);
+		if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
+		if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
+		if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
+		return vint{ _mm_castps_si128(v) };
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+#if CPPSPMD_SSE2
+		CPPSPMD_ALIGN(16) float vals[4] = { 0, 0, 0, 0 };
+		if (mask & 1) vals[0] = pSrc[0];
+		if (mask & 2) vals[1] = pSrc[stride];
+		if (mask & 4) vals[2] = pSrc[stride * 2];
+		if (mask & 8) vals[3] = pSrc[stride * 3];
+		return vfloat{ _mm_load_ps(vals) };
+#else
+		__m128 v = _mm_setzero_ps();
+		if (mask & 1) v = _mm_load_ss(pSrc);
+		if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
+		if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
+		if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
+		return vfloat{ v };
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride)
+	{
+#if CPPSPMD_SSE2
+		CPPSPMD_ALIGN(16) int vals[4];
+		vals[0] = pSrc[0];
+		vals[1] = pSrc[stride];
+		vals[2] = pSrc[stride * 2];
+		vals[3] = pSrc[stride * 3];
+		return vint{ _mm_load_si128((__m128i*)vals) };
+#else		
+		const float* pSrcF = (const float*)pSrc;
+		__m128 v = _mm_load_ss(pSrcF);
+		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
+		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
+		v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
+		return vint{ _mm_castps_si128(v) };
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride)
+	{
+#if CPPSPMD_SSE2
+		CPPSPMD_ALIGN(16) float vals[4];
+		vals[0] = pSrc[0];
+		vals[1] = pSrc[stride];
+		vals[2] = pSrc[stride * 2];
+		vals[3] = pSrc[stride * 3];
+		return vfloat{ _mm_load_ps(vals) };
+#else
+		__m128 v = _mm_load_ss(pSrc);
+		v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
+		v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
+		v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
+		return vfloat{ v };
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src)
+	{
+		// TODO: There's surely a better way
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		
+		if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(_mm_castps_si128(src.m_value));
+		if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(_mm_castps_si128(src.m_value));
+		if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(_mm_castps_si128(src.m_value));
+		if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(_mm_castps_si128(src.m_value));
+
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src)
+	{
+		// TODO: There's surely a better way
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		__m128i k = _mm_setzero_si128();
+
+		if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
+		if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
+		if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
+		if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
+
+		return vfloat{ _mm_castsi128_ps(k) };
+	}
+
+	CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src)
+	{
+		// TODO: There's surely a better way
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		
+		if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value);
+		if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value);
+		if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value);
+		if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value);
+
+		return dst;
+	}
+
+	CPPSPMD_FORCE_INLINE vint load(const vint_vref& src)
+	{
+		// TODO: There's surely a better way
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		__m128i k = _mm_setzero_si128();
+
+		if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
+		if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
+		if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
+		if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
+
+		return vint{ k };
+	}
+
+	CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
+	{
+		// TODO: There's surely a better way
+		__m128i k = insert_x(_mm_undefined_si128(), ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
+		k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
+		k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
+		k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
+
+		return vint{ k };
+	}
+			
+	// Linear integer
+	struct lint
+	{
+		__m128i m_value;
+
+		CPPSPMD_FORCE_INLINE explicit lint(__m128i value)
+			: m_value(value)
+		{ }
+
+		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
+		{
+			return vfloat{ _mm_cvtepi32_ps(m_value) };
+		}
+
+		CPPSPMD_FORCE_INLINE explicit operator vint() const
+		{
+			return vint{ m_value };
+		}
+
+		CPPSPMD_FORCE_INLINE int get_first_value() const 
+		{
+			return _mm_cvtsi128_si32(m_value);
+		}
+
+		CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const
+		{
+			return float_lref{ ptr + get_first_value() };
+		}
+
+		CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const
+		{
+			return int_lref{ ptr + get_first_value() };
+		}
+
+		CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const
+		{
+			return int16_lref{ ptr + get_first_value() };
+		}
+
+		CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const
+		{
+			return cint_lref{ ptr + get_first_value() };
+		}
+
+	private:
+		lint& operator=(const lint&);
+	};
+
+	CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+	
+	const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) };
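+	// program_index holds { 0, 1, 2, 3 } (lane 0 gets 0, lane 3 gets 3). Illustrative sketch of typical kernel usage, not code from this header:
+	//   vfloat v = load((program_index + i)[pSrc]);     // lanes read pSrc[i + 0 .. i + 3]
+	//   store((program_index + i)[pDst], v * 2.0f);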
+	
+	// SPMD condition helpers
+
+	template<typename IfBody>
+	CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody);
+
+	CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond);
+
+	// No breaks, continues, etc. allowed
+	template<typename IfBody>
+	CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody);
+
+	// No breaks, continues, etc. allowed
+	template<typename IfBody, typename ElseBody>
+	CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody);
+
+	template<typename IfBody, typename ElseBody>
+	CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody);
+
+	template<typename WhileCondBody, typename WhileBody>
+	CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody);
+
+	template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
+	CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody);
+
+	template<typename ForeachBody>
+	CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody);
+		
+#ifdef _DEBUG
+	CPPSPMD_FORCE_INLINE void check_masks();
+#else
+	CPPSPMD_FORCE_INLINE void check_masks() { }
+#endif
+
+	CPPSPMD_FORCE_INLINE void spmd_break();
+	CPPSPMD_FORCE_INLINE void spmd_continue();
+	
+	CPPSPMD_FORCE_INLINE void spmd_return();
+	
+	template<typename UnmaskedBody>
+	CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody);
+
+	template<typename SPMDKernel, typename... Args>
+	//CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args);
+	CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args);
+
+	CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b) { vint temp = a; store(a, b); store(b, temp); }
+	CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b) { vfloat temp = a; store(a, b); store(b, temp); }
+	CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b) { vbool temp = a; store(a, b); store(b, temp); }
+
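+	// Horizontal reductions. Inactive lanes are zeroed via the exec mask first, so they don't contribute to the sum.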
+	CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)	
+	{ 
+		__m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
+
+//#if CPPSPMD_SSE2
+#if 1
+		// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
+		__m128 shuf   = _mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(2, 3, 0, 1));
+		__m128 sums   = _mm_add_ps(k3210, shuf);
+		shuf          = _mm_movehl_ps(shuf, sums);
+		sums          = _mm_add_ss(sums, shuf);
+		return _mm_cvtss_f32(sums);
+#else
+		// This is pretty slow.
+		__m128 a = _mm_hadd_ps(k3210, k3210);
+		__m128 b = _mm_hadd_ps(a, a);
+		return extractf_ps_x(b);
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE int reduce_add(vint v)
+	{
+		__m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
+
+		// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
+		__m128i shuf = _mm_shuffle_epi32(k3210, _MM_SHUFFLE(2, 3, 0, 1));
+		__m128i sums = _mm_add_epi32(k3210, shuf);
+		shuf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(shuf), _mm_castsi128_ps(sums)));
+		sums = _mm_add_epi32(sums, shuf);
+		return extract_x(sums);
+	}
+
+	#include "cppspmd_math_declares.h"
+
+}; // struct spmd_kernel
+
+using exec_mask = spmd_kernel::exec_mask;
+using vint = spmd_kernel::vint;
+using int_lref = spmd_kernel::int_lref;
+using cint_vref = spmd_kernel::cint_vref;
+using cint_lref = spmd_kernel::cint_lref;
+using int_vref = spmd_kernel::int_vref;
+using lint = spmd_kernel::lint;
+using vbool = spmd_kernel::vbool;
+using vfloat = spmd_kernel::vfloat;
+using float_lref = spmd_kernel::float_lref;
+using float_vref = spmd_kernel::float_vref;
+using vfloat_vref = spmd_kernel::vfloat_vref;
+using vint_vref = spmd_kernel::vint_vref;
+
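+// Converts a vbool (all-ones or all-zeros per lane) to 1.0f or 0.0f by ANDing it with a vector of 1.0f's.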
+CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const 
+{ 
+	return vfloat { _mm_and_ps( _mm_castsi128_ps(m_value), *(const __m128 *)g_onef_128 ) }; 
+}
+	
+// Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?)
+CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const 
+{ 
+	return vint { m_value };
+}
+
+CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v)
+{
+	return vbool{ _mm_castps_si128(_mm_xor_ps(_mm_load_ps((const float*)g_allones_128), _mm_castsi128_ps(v.m_value))) };
+}
+
+CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; }
+
+CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; }
+CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) {	return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; }
+CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; }
+
+CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; }
+CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; }
+
+// Bad pattern - these don't factor in the current exec mask. Prefer spmd_any()/spmd_all() instead.
+CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; }
+CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; }
+
+CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; }
+CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) {	return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; }
+CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; }
+
+CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; }
+CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; }
+
+CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) {	return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; }
+CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; }
+CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) {	return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; }
+
+#if CPPSPMD_SSE2
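+// SSE2 fallbacks for the SSE4.1 rounding intrinsics. The comparisons against 8388608.0f (2^23) detect lanes whose magnitude is
+// small enough to still carry fractional bits; lanes at or above 2^23 are already integers and are passed through unchanged by the final blends.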
+CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a)
+{
+	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) );
+	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
+		
+	__m128i ai = _mm_cvttps_epi32(a.m_value);
+	
+	__m128 af = _mm_cvtepi32_ps(ai);
+	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a)
+{
+	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
+	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
+
+	__m128i ai = _mm_cvtps_epi32(a.m_value);
+	__m128 af = _mm_cvtepi32_ps(ai);
+	__m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(af, a.m_value)));
+
+	af = _mm_add_ps(af, changed);
+
+	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a)
+{
+	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
+	__m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
+	
+	__m128i ai = _mm_cvtps_epi32(a.m_value);
+	__m128 af = _mm_cvtepi32_ps(ai);
+	__m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(af, a.m_value)));
+	
+	af = _mm_sub_ps(af, changed);
+
+	return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
+}
+
+// We need to disable unsafe math optimizations for the key operations used for rounding to nearest.
+// I wish there was a better way.
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#elif defined(__clang__)
+inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optnone))
+#elif defined (_MSC_VER)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+inline __m128 add_sub(__m128 a, __m128 b)
+#else
+inline __m128 add_sub(__m128 a, __m128 b)
+#endif
+{
+	return _mm_sub_ps(_mm_add_ps(a, b), b);
+}
+
+#if defined (_MSC_VER)
+#pragma float_control(pop)
+#endif
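+// add_sub(a, b) computes (a + b) - b. With b = 2^23 carrying a's sign, this forces a hardware round-to-nearest-integer,
+// which is what round_nearest() below depends on - hence the need to keep the add/sub pair from being optimized away.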
+
+CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a)
+{
+	__m128i no_fract_fp_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f));
+
+	__m128i sign_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U));
+	__m128 force_int = _mm_castsi128_ps(_mm_or_si128(no_fract_fp_bits, sign_a));
+	
+	// Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers.
+	//__m128 temp1 = _mm_add_ps(a.m_value, force_int);
+	//__m128 temp2 = _mm_sub_ps(temp1, force_int);
+	__m128 temp2 = add_sub(a.m_value, force_int);
+	
+	__m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
+	__m128i has_fractional = _mm_cmplt_epi32(abs_a, no_fract_fp_bits);
+	return vfloat{ blendv_mask_ps(a.m_value, temp2, _mm_castsi128_ps(has_fractional)) };
+}
+
+#else
+CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; }
+CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; }
+#endif
+
+CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a) { return a - floor(a); }
+CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b) { vfloat c = frac(abs(a / b)) * abs(b); return spmd_ternaryf(a < 0, -c, c); }
+CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, -1.0f, 1.0f); }
+
+CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) {	return vint{ min_epi32(a.m_value, b.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; }
+
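+// Reverses the byte order within each 32-bit lane.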
+CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) {	return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,  4,  5,  6,  7,  0,  1,  2,  3)) }; }
+
+CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b)
+{
+	return vfloat{ _mm_min_ps(b.m_value, _mm_max_ps(v.m_value, a.m_value) ) };
+}
+
+CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b)
+{
+	return vint{ min_epi32(b.m_value, max_epi32(v.m_value, a.m_value) ) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat vfma(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	return vfloat{ _mm_add_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	return vfloat{ _mm_sub_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	return vfloat{ _mm_sub_ps(c.m_value, _mm_mul_ps(a.m_value, b.m_value)) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	return vfloat{ _mm_sub_ps(_mm_sub_ps(_mm_xor_ps(a.m_value, a.m_value), _mm_mul_ps(a.m_value, b.m_value)), c.m_value) };
+}
+
+CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); }
+
+CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
+CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
+CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
+CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
+CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
+CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); }
+CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); }
+CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; }
+CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; }
+CPPSPMD_FORCE_INLINE vint operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); }
+CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; }
+
+CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; }
+
+// A few of these break the lane-based abstraction model. SSE2 supports them natively, so we expose them and leave it to the user to apply them appropriately.
+CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) {	return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint{ _mm_add_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint{ _mm_adds_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint{ _mm_sub_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint{ _mm_subs_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint{ _mm_unpacklo_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint{ _mm_unpackhi_epi8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { return _mm_movemask_epi8(a.m_value); }
+CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { return _mm_movemask_ps(_mm_castsi128_ps(a.m_value)); }
+
+CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(_mm_min_epu8(a.m_value, b.m_value), a.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return vint{ cmple_epu8(b, a) }; }
+CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(_mm_cmpeq_epi8(a.m_value, b.m_value), _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value)) }; }
+CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return vint{ cmpgt_epu8(b, a) }; }
+CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b) { return vint{ _mm_or_si128(_mm_subs_epu8(a.m_value, b.m_value), _mm_subs_epu8(b.m_value, a.m_value)) }; }
+
+CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi8(a.m_value, b.m_value, _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128())) }; }
+CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi32(a.m_value, b.m_value, mask.m_value) }; }
+
+CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint{ _mm_add_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint{ _mm_adds_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint{ _mm_adds_epu16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint{ _mm_avg_epu16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint{ _mm_sub_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint{ _mm_subs_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint{ _mm_subs_epu16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint{ _mm_mullo_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epu16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint{ _mm_min_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint{ _mm_max_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint{ _mm_madd_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint{ _mm_packs_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint{ _mm_packus_epi16(a.m_value, b.m_value) }; }
+
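+// "Uniform" shifts: every 16-bit lane is shifted by the same count, taken from the low 64 bits of b.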
+CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint{ _mm_sll_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_sra_epi16(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_srl_epi16(a.m_value, b.m_value) }; }
+
+#define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, b))
+#define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, b))
+#define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, b))
+
+CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
+CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }
+
+// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
+#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
+
+// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane.
+#define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control))
+#define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control))
+
+#define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
+#define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
+
+#define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l))
+#define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l))
+
+// Unpack and interleave 8-bit integers from the low or high half of a and b
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); }
+
+// Unpack and interleave 16-bit integers from the low or high half of a and b
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi16(a.m_value, b.m_value)); }
+
+// Unpack and interleave 32-bit integers from the low or high half of a and b
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi32(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi32(a.m_value, b.m_value)); }
+
+// Unpack and interleave 64-bit integers from the low or high half of a and b
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi64(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi64(a.m_value, b.m_value)); }
+
+CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { return vint(_mm_set1_epi8(a)); }
+CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { return vint(_mm_set1_epi16(a)); }
+CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { return vint(_mm_set1_epi32(a)); }
+CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { return vint(_mm_set1_epi64x(a)); }
+
+CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { return vint(_mm_mul_epu32(a.m_value, b.m_value)); }
+
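+// Varying signed 32-bit division. Each half of the vector is widened to double, divided, and truncated back to int32.
+// This is exact for all in-range results, because every int32 is exactly representable as a double and the rounded
+// double quotient can never cross an integer boundary. As usual, b == 0 and INT_MIN / -1 are undefined.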
+CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b)
+{
+	__m128d al = _mm_cvtepi32_pd(a.m_value);
+	__m128d ah = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value));
+
+	__m128d bl = _mm_cvtepi32_pd(b.m_value);
+	__m128d bh = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value));
+
+	__m128d rl = _mm_div_pd(al, bl);
+	__m128d rh = _mm_div_pd(ah, bh);
+
+	__m128i rli = _mm_cvttpd_epi32(rl);
+	__m128i rhi = _mm_cvttpd_epi32(rh);
+
+	return vint(_mm_unpacklo_epi64(rli, rhi));
+}
+
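+// Varying signed modulus with C/C++ semantics: the remainder takes the sign of the dividend a.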
+CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b)
+{
+	vint aa = abs(a), ab = abs(b);
+	vint q = div_epi32(aa, ab);
+	vint r = aa - q * ab;
+	return spmd_ternaryi(a < 0, -r, r);
+}
+
+CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b)
+{
+	return div_epi32(a, b);
+}
+
+CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, int b)
+{
+	return div_epi32(a, vint(b));
+}
+
+CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b)
+{
+	return mod_epi32(a, b);
+}
+
+CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b)
+{
+	return mod_epi32(a, vint(b));
+}
+
+CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b)
+{
+#if 0
+	CPPSPMD_ALIGN(32) int result[4];
+	result[0] = extract_x(a.m_value) << extract_x(b.m_value);
+	result[1] = extract_y(a.m_value) << extract_y(b.m_value);
+	result[2] = extract_z(a.m_value) << extract_z(b.m_value);
+	result[3] = extract_w(a.m_value) << extract_w(b.m_value);
+
+	return vint{ _mm_load_si128((__m128i*)result) };
+#elif 0
+	int x = extract_x(a.m_value) << extract_x(b.m_value);
+	int y = extract_y(a.m_value) << extract_y(b.m_value);
+	int z = extract_z(a.m_value) << extract_z(b.m_value);
+	int w = extract_w(a.m_value) << extract_w(b.m_value);
+
+	__m128i v = insert_x(_mm_undefined_si128(), x);
+	v = insert_y(v, y);
+	v = insert_z(v, z);
+	return vint{ insert_w(v, w) };
+#else
+	// What this does: shift each lane of b left by 23 bits (into the float exponent field), add that to the integer representation of 1.0f, reinterpret the sum as float to get a fast 2^b, convert back to int, then multiply: a * 2^b == a << b.
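+	// Worked example: for b == 5, (5 << 23) + 0x3f800000 is the bit pattern of 32.0f, so each lane of a is multiplied by 32 == 1 << 5.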
+	return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))));
+#endif
+}
+
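+// For the three uniform (scalar count) shifts below, _mm_sll_epi32/_mm_sra_epi32/_mm_srl_epi32 take their count from
+// the low 64 bits of the count register, so the splatted count is masked down to lane x (g_x_128 is assumed to be an
+// all-ones mask in lane 0 only).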
+// uniform shift left
+CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b)
+{
+	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
+	return vint{ _mm_sll_epi32(a.m_value, bv) };
+}
+
+// uniform arithmetic shift right
+CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b)
+{
+	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
+	return vint{ _mm_sra_epi32(a.m_value, bv) };
+}
+
+// uniform shift right
+CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b)
+{
+	__m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
+	return vint{ _mm_srl_epi32(a.m_value, bv) };
+}
+
+CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b)
+{
+#if 0
+	CPPSPMD_ALIGN(32) int result[4];
+	result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value);
+	result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value);
+	result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value);
+	result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value);
+
+	return vint{ _mm_load_si128((__m128i*)result) };
+#elif 0
+	uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value));
+	uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value));
+	uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value));
+	uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value));
+
+	__m128i v = insert_x(_mm_undefined_si128(), x);
+	v = insert_y(v, y);
+	v = insert_z(v, z);
+	return vint{ insert_w(v, w) };
+#else
+	//vint inv_shift = 32 - b;
+	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
+	
+	// Start from the float bit pattern of 2^32 (0x4f800000 == 0x3f800000 + (32 << 23)), subtract (shift << 23), then reinterpret as float to get 2^(32 - shift).
+	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
+
+	// Now convert scale factor to integer.
+	vint r = vint(f);
+
+	// mulhi_epu32 (emulated with two _mm_mul_epu32) computes (a * 2^(32 - b)) >> 32, i.e. the varying unsigned right shift a >> b.
+	vint q(mulhi_epu32(a.m_value, r.m_value));
+
+	// Handle shift amounts of 0.
+	return spmd_ternaryi(b > 0, q, a);
+#endif
+}
+
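+// Same as vuint_shift_right(), but assumes no lane of b is zero, so the b > 0 fixup can be skipped.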
+CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b)
+{
+	//vint inv_shift = 32 - b;
+	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
+	
+	// Start from the float bit pattern of 2^32 (0x4f800000 == 0x3f800000 + (32 << 23)), subtract (shift << 23), then reinterpret as float to get 2^(32 - shift).
+	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
+
+	// Now convert scale factor to integer.
+	vint r = vint(f);
+
+	// mulhi_epu32 (emulated with two _mm_mul_epu32) computes (a * 2^(32 - b)) >> 32, i.e. the varying unsigned right shift a >> b.
+	return vint(mulhi_epu32(a.m_value, r.m_value));
+}
+
+CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b)
+{
+#if 0
+	CPPSPMD_ALIGN(32) int result[4];
+	result[0] = extract_x(a.m_value) >> extract_x(b.m_value);
+	result[1] = extract_y(a.m_value) >> extract_y(b.m_value);
+	result[2] = extract_z(a.m_value) >> extract_z(b.m_value);
+	result[3] = extract_w(a.m_value) >> extract_w(b.m_value);
+
+	return vint{ _mm_load_si128((__m128i*)result) };
+#elif 0
+	int x = extract_x(a.m_value) >> extract_x(b.m_value);
+	int y = extract_y(a.m_value) >> extract_y(b.m_value);
+	int z = extract_z(a.m_value) >> extract_z(b.m_value);
+	int w = extract_w(a.m_value) >> extract_w(b.m_value);
+
+	__m128i v = insert_x(_mm_undefined_si128(), x);
+	v = insert_y(v, y);
+	v = insert_z(v, z);
+	return vint{ insert_w(v, w) };
+#else
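+	// Varying arithmetic shift right built from the logical shift: for negative lanes a ^ sign_mask == ~a, and
+	// ~((~a) >> b) (logical shift) equals a >> b (arithmetic shift), so XOR'ing the sign mask back in restores the sign fill.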
+	vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128()));
+	vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask;
+	return a_shifted;
+#endif
+}
+
+#undef VINT_SHIFT_LEFT
+#undef VINT_SHIFT_RIGHT
+#undef VUINT_SHIFT_RIGHT
+
+// Shift left/right by a uniform immediate constant
+#define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) )
+#define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) ) 
+#define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) )
+#define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k)))
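+// Example: VINT_ROT(x, 8) rotates each 32-bit lane left by 8 bits.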
+
+CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { return vint(a) == vint(b); }
+CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { return vint(a) == vint(b); }
+CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
+
+CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) float values[4]; _mm_store_ps(values, v.m_value); return values[instance]; }
+CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
+CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
+CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance] != 0; }
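+// Note: these variable-index extracts spill to the stack; when the lane index is a compile-time constant, the
+// VINT/VBOOL/VFLOAT_EXTRACT macros below are typically cheaper.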
+
+#undef VINT_EXTRACT
+#undef VBOOL_EXTRACT
+#undef VFLOAT_EXTRACT
+
+#if CPPSPMD_SSE2
+// Pass in an immediate constant and the compiler will optimize these expressions.
+#define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
+#define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
+#define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) )
+#else
+CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return *(const float*)&v; }
+
+#define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
+#define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
+#define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance))
+#endif
+
+CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f)
+{
+	assert(instance < 4);
+	CPPSPMD_ALIGN(16) float values[4];
+	_mm_store_ps(values, v.m_value);
+	values[instance] = f;
+	v.m_value = _mm_load_ps(values);
+	return v;
+}
+
+CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i)
+{
+	assert(instance < 4);
+	CPPSPMD_ALIGN(16) int values[4];
+	_mm_store_si128((__m128i *)values, v.m_value);
+	values[instance] = i;
+	v.m_value = _mm_load_si128((__m128i *)values);
+	return v;
+}
+
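+// 16-entry byte table lookup: table_lookup4_8() treats each byte of 'a' as an index into the 16 bytes of 'table'
+// via shuffle_epi8(), so indices are expected to be in [0, 15].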
+CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16])
+{
+	__m128i l = _mm_loadu_si128((const __m128i*)pTab);
+	return vint{ l };
+}
+
+CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table)
+{
+	return vint{ shuffle_epi8(table.m_value, a.m_value) };
+}
+
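+// 32-entry byte table lookup split across two 16-byte vectors. table_lookup5_8() looks 'a' up in both halves, then
+// moves index bit 4 into each lane's sign bit (the 31 - 4 shift) so blendv_ps, which selects on the per-lane sign bit,
+// picks the high half for indices >= 16.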
+CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1)
+{
+	__m128i l = _mm_loadu_si128((const __m128i*)pTab);
+	__m128i h = _mm_loadu_si128((const __m128i*)(pTab + 16));
+	table_0.m_value = l;
+	table_1.m_value = h;
+}
+
+CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1)
+{
+	__m128i l_0 = shuffle_epi8(table_0.m_value, a.m_value);
+	__m128i h_0 = shuffle_epi8(table_1.m_value, a.m_value);
+
+	__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);
+
+	__m128 v_0 = blendv_ps(_mm_castsi128_ps(l_0), _mm_castsi128_ps(h_0), _mm_castsi128_ps(m_0));
+
+	return vint{ _mm_castps_si128(v_0) };
+}
+
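+// 64-entry byte table lookup split across four 16-byte vectors. table_lookup6_8() blends within each pair of tables
+// on index bit 4, then blends between the two pairs on index bit 5 (the 31 - 5 shift).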
+CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3)
+{
+	__m128i a = _mm_loadu_si128((const __m128i*)pTab);
+	__m128i b = _mm_loadu_si128((const __m128i*)(pTab + 16));
+	__m128i c = _mm_loadu_si128((const __m128i*)(pTab + 32));
+	__m128i d = _mm_loadu_si128((const __m128i*)(pTab + 48));
+
+	table_0.m_value = a;
+	table_1.m_value = b;
+	table_2.m_value = c;
+	table_3.m_value = d;
+}
+
+CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3)
+{
+	__m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);
+
+	__m128 av_0;
+	{
+		__m128i al_0 = shuffle_epi8(table_0.m_value, a.m_value);
+		__m128i ah_0 = shuffle_epi8(table_1.m_value, a.m_value);
+		av_0 = blendv_ps(_mm_castsi128_ps(al_0), _mm_castsi128_ps(ah_0), _mm_castsi128_ps(m_0));
+	}
+
+	__m128 bv_0;
+	{
+		__m128i bl_0 = shuffle_epi8(table_2.m_value, a.m_value);
+		__m128i bh_0 = shuffle_epi8(table_3.m_value, a.m_value);
+		bv_0 = blendv_ps(_mm_castsi128_ps(bl_0), _mm_castsi128_ps(bh_0), _mm_castsi128_ps(m_0));
+	}
+
+	__m128i m2_0 = _mm_slli_epi32(a.m_value, 31 - 5);
+	__m128 v2_0 = blendv_ps(av_0, bv_0, _mm_castsi128_ps(m2_0));
+
+	return vint{ _mm_castps_si128(v2_0) };
+}
+
+#if 0
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(exec_mask::all_on());
+	return kernel._call(std::forward<Args>(args)...);
+}
+#else
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(exec_mask::all_on());
+	kernel._call(std::forward<Args>(args)...);
+}
+#endif
+
+CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec)
+{
+	m_exec = kernel_exec;
+	m_kernel_exec = kernel_exec;
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	m_in_loop = false;
+#endif
+}
+
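+// Masked scatter: writes src lanes to dst.m_pValue[vindex[i]] only for lanes currently enabled in m_exec.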
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) int vindex[4];
+	_mm_store_si128((__m128i*)vindex, dst.m_vindex);
+
+	CPPSPMD_ALIGN(16) float stored[4];
+	_mm_store_ps(stored, src.m_value);
+
+	int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+	for (int i = 0; i < 4; i++)
+	{
+		if (mask & (1 << i))
+			dst.m_pValue[vindex[i]] = stored[i];
+	}
+	return dst;
+}
+
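+// Unmasked scatter: writes all four lanes regardless of the exec mask.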
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) int vindex[4];
+	_mm_store_si128((__m128i*)vindex, dst.m_vindex);
+
+	CPPSPMD_ALIGN(16) float stored[4];
+	_mm_store_ps(stored, src.m_value);
+
+	for (int i = 0; i < 4; i++)
+		dst.m_pValue[vindex[i]] = stored[i];
+	return dst;
+}
+
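+// Rvalue overloads of store()/store_all(), so stores can target temporary float_vref objects.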
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) int vindex[4];
+	_mm_store_si128((__m128i*)vindex, dst.m_vindex);
+
+	CPPSPMD_ALIGN(16) float stored[4];
+	_mm_store_ps(stored, src.m_value);
+
+	int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+	for (int i = 0; i < 4; i++)
+	{
+		if (mask & (1 << i))
+			dst.m_pValue[vindex[i]] = stored[i];
+	}
+	return dst;
+}
+
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) int vindex[4];
+	_mm_store_si128((__m128i*)vindex, dst.m_vindex);
+
+	CPPSPMD_ALIGN(16) float stored[4];
+	_mm_store_ps(stored, src.m_value);
+
+	for (int i = 0; i < 4; i++)
+		dst.m_pValue[vindex[i]] = stored[i];
+	return dst;
+}
+
+#include "cppspmd_flow.h"
+#include "cppspmd_math.h"
+
+} // namespace cppspmd_sse41
+
diff --git a/encoder/cppspmd_type_aliases.h b/encoder/cppspmd_type_aliases.h
new file mode 100644
index 0000000..0dfb28b
--- /dev/null
+++ b/encoder/cppspmd_type_aliases.h
@@ -0,0 +1,47 @@
+// cppspmd_type_aliases.h
+// Do not include this file directly
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifndef CPPSPMD_TYPES
+#define CPPSPMD_TYPES
+
+using exec_mask = CPPSPMD::exec_mask;
+
+#if CPPSPMD_INT16
+using vint16 = CPPSPMD::vint16;
+using int16_lref = CPPSPMD::int16_lref;
+using cint16_vref = CPPSPMD::cint16_vref;
+using int16_vref = CPPSPMD::int16_vref;
+using lint16 = CPPSPMD::lint16;
+using vint16_vref = CPPSPMD::vint16_vref;
+#else
+using vint = CPPSPMD::vint;
+using int_lref = CPPSPMD::int_lref;
+using cint_vref = CPPSPMD::cint_vref;
+using int_vref = CPPSPMD::int_vref;
+using lint = CPPSPMD::lint;
+using vint_vref = CPPSPMD::vint_vref;
+#endif
+
+using vbool = CPPSPMD::vbool;
+using vfloat = CPPSPMD::vfloat;
+using float_lref = CPPSPMD::float_lref;
+using float_vref = CPPSPMD::float_vref;
+using vfloat_vref = CPPSPMD::vfloat_vref;
+
+#endif // CPPSPMD_TYPES