Update basis_spec.txt
diff --git a/spec/basis_spec.txt b/spec/basis_spec.txt
index 93ec749..673c430 100644
--- a/spec/basis_spec.txt
+++ b/spec/basis_spec.txt
@@ -1,7 +1,5 @@
-[WORK IN PROGRESS]
-
 File: basis_spec.txt
-Version 1.00
+Version 1.01
 
 1.0 Introduction
 ----------------
@@ -319,8 +317,9 @@
 indicates if the color endpoint codebook is grayscale or not. 
 
 Immediately following this code is the compressed color endpoint codebook data. 
-A simple form of DPCM coding is used to send the ETC1S intensity table indices and
-color values. Here is the procedure to decode the endpoint codebook:
+A simple form of DPCM (Differential Pulse Code Modulation) coding is used to
+send the ETC1S intensity table indices and color values. Here is the procedure
+to decode the endpoint codebook:
 
     const int COLOR5_PAL0_PREV_HI = 9, COLOR5_PAL0_DELTA_LO = -9, COLOR5_PAL0_DELTA_HI = 31;
     const int COLOR5_PAL1_PREV_HI = 21, COLOR5_PAL1_DELTA_LO = -21, COLOR5_PAL1_DELTA_HI = 21;
@@ -334,9 +333,9 @@
     for (uint32_t i = 0; i < num_endpoints; i++)
     {
         // Decode the intensity delta Huffman code
-        uint32_t inten_delta = sym_codec.decode_huffman(inten_delta_model);
-        m_endpoints[i].m_inten5 = static_cast<uint8_t>((inten_delta + prev_inten) & 7);
-        prev_inten = m_endpoints[i].m_inten5;
+        uint32_t inten_delta = decode_huffman(inten_delta_model);
+        endpoints[i].m_inten5 = static_cast<uint8_t>((inten_delta + prev_inten) & 7);
+        prev_inten = endpoints[i].m_inten5;
 
         // Now decode the endpoint entry's color or intensity value
         for (uint32_t c = 0; c < (endpoints_are_grayscale ? 1U : 3U); c++)
@@ -344,16 +343,16 @@
             // The Huffman table we used to decode the delta depends on the previous color's value
             int delta;
             if (prev_color5[c] <= basist::COLOR5_PAL0_PREV_HI)
-                delta = sym_codec.decode_huffman(color5_delta_model0);
+                delta = decode_huffman(color5_delta_model0);
             else if (prev_color5[c] <= basist::COLOR5_PAL1_PREV_HI)
-                delta = sym_codec.decode_huffman(color5_delta_model1);
+                delta = decode_huffman(color5_delta_model1);
             else
-                delta = sym_codec.decode_huffman(color5_delta_model2);
+                delta = decode_huffman(color5_delta_model2);
 
             // Apply the delta
             int v = (prev_color5[c] + delta) & 31;
 
-            m_endpoints[i].m_color5[c] = static_cast<uint8_t>(v);
+            endpoints[i].m_color5[c] = static_cast<uint8_t>(v);
 
             prev_color5[c] = static_cast<uint8_t>(v);
         }
@@ -361,8 +360,8 @@
         // If the endpoints are grayscale, set G and B to match R.
         if (endpoints_are_grayscale)
         {
-            m_endpoints[i].m_color5[1] = m_endpoints[i].m_color5[0];
-            m_endpoints[i].m_color5[2] = m_endpoints[i].m_color5[0];
+            endpoints[i].m_color5[1] = endpoints[i].m_color5[0];
+            endpoints[i].m_color5[2] = endpoints[i].m_color5[0];
         }
     }
 
@@ -371,19 +370,457 @@
 8.0 ETC1S Selector Codebooks
 ----------------------------
 
+The selector codebook section starts at file offset
+basis_file_header::m_selector_cb_file_ofs and is m_selector_cb_file_size bytes
+long. The selector codebook will have basis_file_header::m_total_selectors total
+entries.
 
+The first bit of this section indicates if "global" selector codebooks are used.
+Basis Universal doesn't currently utilize global selector codebooks, so this bit
+should always be 0.
 
+The second bit of this section indicates if "hybrid" global/local selector
+codebooks are used. Hybrid codebooks are not supported either, so this bit
+should always be 0.
 
+The third bit indicates if the selector codebook has been sent in raw form
+(uncompressed). If it's set, each selector is sent as four 8-bit bytes. Each
+byte corresponds to four 2-bit ETC1S selectors. The first selector of each group
+of 4 selectors starts at the LSB (least significant bit) of each byte, and is
+2 bits wide.
 
+If the third bit is 0, the selectors have been DPCM coded with Huffman coding. 
+The "delta_selector_pal_model" Huffman table will immediately follow the third
+bit, and is stored using the procedure outlined in section 6.0.
 
+Here is the DPCM decoding procedure for selector codebooks:
 
+        uint8_t prev_bytes[4] = { 0, 0, 0, 0 }; // The four packed bytes of the previously decoded selector
 
+        for (uint32_t i = 0; i < num_selectors; i++)
+        {
+            if (!i)
+            {
+                // The first selector is sent raw, as four 8-bit bytes.
+                for (uint32_t j = 0; j < 4; j++)
+                {
+                    uint32_t cur_byte = get_bits(8);
+                    prev_bytes[j] = static_cast<uint8_t>(cur_byte);
 
+                    for (uint32_t k = 0; k < 4; k++) // Unpack the byte's four 2-bit selectors, starting at its LSB
+                        selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3);
+                }
+                selectors[i].init_flags();
+                continue;
+            }
 
+            // Subsequent selectors are sent with a simple form of byte-wise DPCM coding, relative to the previous selector's bytes.
+            for (uint32_t j = 0; j < 4; j++)
+            {
+                int delta_byte = decode_huffman(delta_selector_pal_model);
 
+                uint32_t cur_byte = delta_byte ^ prev_bytes[j]; // The delta is applied with XOR, not addition
+                prev_bytes[j] = static_cast<uint8_t>(cur_byte);
 
+                for (uint32_t k = 0; k < 4; k++) // Unpack the byte's four 2-bit selectors, starting at its LSB
+                    selectors[i].set_selector(k, j, (cur_byte >> (k * 2)) & 3);
+            }
+        }
 
+Any bytes in this section following the selector codebook bits can be safely ignored.
 
+9.0 ETC1S Compressed Slice Decoding Huffman Tables
+--------------------------------------------------
 
+Each ETC1S slice is compressed with four Huffman tables stored using the
+procedure outlined in section 6.0. These Huffman tables are stored at file
+offset basis_file_header::m_tables_file_ofs. This section will be 
+basis_file_header::m_tables_file_size bytes long.
+
+The following four Huffman tables are sent, in this order:
+
+	1. endpoint_pred_model
+	2. delta_endpoint_model
+	3. selector_model
+	4. selector_history_buf_rle_model
+
+Following the last Huffman table are 13 bits indicating the size of the selector
+history buffer. Any remaining bits may be safely ignored.
+
+10.0 ETC1S Slice Decoding
+-------------------------
+
+ETC1S slices consist of a compressed 2D array of ETC1S blocks, always compressed
+in top-down/left-right raster order. For texture video, the previous slice's
+already decoded contents may be referred to when blocks are encoded using
+Conditional Replenishment (also known as "skip blocks"). 
+
+Each ETC1S block is encoded by using references to the color endpoint codebook
+and the selector codebook. Sections 10.1 and 10.2 describe the helper procedures
+used by the decoder, and section 10.3 describes how the array of ETC1S blocks
+is actually decoded.
+
+10.1 Approximate Move to Front Routines
+---------------------------------------
+
+An approximate Move to Front (MTF) approach is used to efficiently encode the
+selector codebook references. Here is the C++ example class for approximate MTF
+decoding:
+
+	class approx_move_to_front // List of ints maintained with an approximate move-to-front policy (see use() and add())
+	{
+	public:
+		approx_move_to_front(uint32_t n)
+		{
+			init(n);
+		}
+
+		void init(uint32_t n) // Resizes the list to n entries and resets the insertion position
+		{
+			m_values.resize(n);
+			m_rover = n / 2; // New values are inserted starting at the midpoint of the list
+		}
+
+		size_t size() const { return m_values.size(); }
+
+		const int& operator[] (uint32_t index) const { return m_values[index]; }
+			  int operator[] (uint32_t index)        { return m_values[index]; } // Returns a copy, not a reference
+
+		void add(int new_value) // Stores new_value at the rover position, overwriting the entry that was there
+		{
+			m_values[m_rover++] = new_value;
+			if (m_rover == m_values.size())
+				m_rover = (uint32_t)m_values.size() / 2; // Wrap back to the midpoint, so add() only overwrites indices size/2 and up
+		}
+
+		void use(uint32_t index) // Promotes the entry at index by swapping it with the entry at index / 2
+		{
+			if (index) // Entry 0 is already at the front, so there is nothing to swap
+			{
+				int x = m_values[index / 2];
+				int y = m_values[index];
+				m_values[index / 2] = y;
+				m_values[index] = x;
+			}
+		}
+
+	private:
+		std::vector<int> m_values; // The list entries
+		uint32_t m_rover;          // Index that the next add() will write to
+	};
+
+10.2 VLC Decoding Procedure
+---------------------------
+
+ETC1S slice decoding utilizes a simple Variable Length Coding (VLC) scheme that
+sends raw bits using variable-size chunks. Here is the VLC decoding procedure:
+
+	uint32_t decode_vlc(uint32_t chunk_bits) // Decodes one unsigned value sent as a series of (chunk_bits + 1)-bit chunks
+	{
+		assert(chunk_bits);
+
+		const uint32_t chunk_size = 1 << chunk_bits; // Also the bit that flags "another chunk follows"
+		const uint32_t chunk_mask = chunk_size - 1;  // Selects the chunk_bits payload bits of each chunk
+
+		uint32_t v = 0;   // The value decoded so far
+		uint32_t ofs = 0; // Bit position in v where the next chunk's payload is placed
+
+		for ( ; ; )
+		{
+			uint32_t s = get_bits(chunk_bits + 1);
+			v |= ((s & chunk_mask) << ofs); // Payload chunks are sent least significant first
+			ofs += chunk_bits;
+
+			if ((s & chunk_size) == 0) // Continuation flag is clear: this was the last chunk
+				break;
+
+			if (ofs >= 32) // Another chunk was flagged, but a 32-bit value has no room left for it
+			{
+				assert(0);
+				break;
+			}
+		}
+
+		return v;
+	}
+
+10.3 ETC1S Slice Block Decoding
+-------------------------------
+
+Each slice has a corresponding "basis_slice_desc" structure, described in section
+4.2. The slice's dimensions in ETC1S blocks are stored in
+basis_slice_desc::m_num_blocks_x and basis_slice_desc::m_num_blocks_y. Each
+slice is located at file offset basis_slice_desc::m_file_ofs, and is
+basis_slice_desc::m_file_size bytes long.
+
+The decoder iterates through all the slice blocks in top-down, left-right raster
+order. Each block is represented by an index into the color endpoint codebook
+and another index into the selector codebook. The endpoint codebook
+contains each ETC1S block's base RGB color and intensity table information, and
+the selector codebook contains the 4x4 texel selectors (which are 2 bits each).
+This is all the information needed to fully represent the
+texels within each block.
+
+The decoding procedure loops over all the blocks in raster order, and decodes
+the endpoint and selector indices used to represent each block. The decoding
+procedure is complex enough that commented code is best used to describe it.
+
+Here's the slice decoding procedure. This block of code shows the block loop,
+and how endpoint codebook indices are decoded. The next block of code shows how
+selector codebook indices are decoded.
+
+	// Constants used by the decoder
+	const uint32_t ENDPOINT_PRED_TOTAL_SYMBOLS = (4 * 4 * 4 * 4) + 1; // 256 combined 2x2 predictor symbols (4 blocks x 2 bits), plus the RLE symbol
+	const uint32_t ENDPOINT_PRED_REPEAT_LAST_SYMBOL = ENDPOINT_PRED_TOTAL_SYMBOLS - 1; // Symbol that starts a run of the previous predictor symbol
+	const uint32_t ENDPOINT_PRED_MIN_REPEAT_COUNT = 3; // Bias added to the VLC coded run length
+	const uint32_t ENDPOINT_PRED_COUNT_VLC_BITS = 4; // VLC chunk size used for the run length (see section 10.2)
+
+	const uint32_t NUM_ENDPOINT_PREDS = 3;
+	const uint32_t CR_ENDPOINT_PRED_INDEX = NUM_ENDPOINT_PREDS - 1; // Texture video only: Conditional Replenishment, reuse the previous frame's block
+	const uint32_t NO_ENDPOINT_PRED_INDEX = 3; // No prediction: a DPCM coded endpoint index is sent instead
+
+	// Endpoint/selector codebooks - decoded previously. See sections 7.0 and 8.0.
+	endpoint endpoints[endpoint_codebook_size];
+	selector selectors[selector_codebook_size];
+
+	// Array of per-block values used for endpoint index prediction (enough for 2 rows).
+	struct block_preds
+	{
+		uint16_t m_endpoint_index; // Endpoint codebook index used by the block
+		uint8_t m_pred_bits; // Predictor bits saved on an even block row for the odd block row beneath it
+	};
+	block_preds block_endpoint_preds[2][num_blocks_x];
+
+	// State used during block decoding
+	uint32_t cur_pred_bits = 0;
+	int prev_endpoint_pred_sym = 0;
+	int endpoint_pred_repeat_count = 0;
+	uint32_t prev_endpoint_index = 0;
+
+	// This array is only used for texture video. It holds the previous frame's endpoint and selector indices (each 16-bits, for 32-bits total).
+	uint32_t prev_frame_indices[num_blocks_x][num_blocks_y];
+
+	// Selector history buffer - See section 10.1. Its size is sent after the slice Huffman tables (see section 9.0).
+	approx_move_to_front selector_history_buf(m_selector_history_buf_size);
+
+	// Loop over all slice blocks in raster order
+	for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+	{
+		// The index into the block_endpoint_preds array for the current row of blocks (the two entries alternate every row)
+		const uint32_t cur_block_endpoint_pred_array = block_y & 1;
+
+		for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+		{
+			// Check if we're at the start of a 2x2 block group.
+			if ((block_x & 1) == 0)
+			{
+				// Are we on an even or odd row of blocks?
+				if ((block_y & 1) == 0)
+				{
+					// We're on an even row and column of blocks. Decode the combined endpoint index predictor symbols for 2x2 blocks.
+					// This symbol tells the decoder how the endpoints are decoded for each block in a 2x2 group of blocks.
+
+					// Are we in an RLE run?
+					if (endpoint_pred_repeat_count)
+					{
+						// Inside a run of endpoint predictor symbols.
+						endpoint_pred_repeat_count--;
+						cur_pred_bits = prev_endpoint_pred_sym;
+					}
+					else
+					{
+						// Decode the endpoint prediction symbol, using the "endpoint pred" Huffman table (see section 9.0).
+						cur_pred_bits = decode_huffman(endpoint_pred_model);
+						if (cur_pred_bits == ENDPOINT_PRED_REPEAT_LAST_SYMBOL)
+						{
+							// It's a run of symbols, so decode the count using VLC decoding (see section 10.2)
+							endpoint_pred_repeat_count = decode_vlc(ENDPOINT_PRED_COUNT_VLC_BITS) + ENDPOINT_PRED_MIN_REPEAT_COUNT - 1;
+
+							cur_pred_bits = prev_endpoint_pred_sym;
+						}
+						else
+						{
+							// It's not a run of symbols
+							prev_endpoint_pred_sym = cur_pred_bits;
+						}
+					}
+
+					// The symbol has enough endpoint prediction information for 4 blocks (2 bits per block), so 8 bits total.
+					// Remember the prediction information we should use for the next row of 2 blocks beneath the current block.
+					block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x].m_pred_bits = (uint8_t)(cur_pred_bits >> 4);
+				}
+				else
+				{
+					// We're on an odd row of blocks, so use the endpoint prediction information we previously stored on the previous even row.
+					cur_pred_bits = block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_pred_bits;
+				}
+			}
+
+			// Decode the current block's endpoint and selector indices.
+			uint32_t endpoint_index, selector_index = 0;
+
+			// Get the 2-bit endpoint prediction index for this block.
+			const uint32_t pred = cur_pred_bits & 3;
+
+			// Get the next block's endpoint prediction bits ready.
+			cur_pred_bits >>= 2;
+
+			// Now check to see if we should reuse a previously encoded block's endpoints.
+			if (pred == 0)
+			{
+				// Reuse the left block's endpoint index
+				assert(block_x > 0);
+				endpoint_index = prev_endpoint_index;
+			}
+			else if (pred == 1)
+			{
+				// Reuse the upper block's endpoint index
+				assert(block_y > 0);
+				endpoint_index = block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x].m_endpoint_index;
+			}
+			else if (pred == 2)
+			{
+				if (is_video)
+				{
+					// If it's texture video, reuse the previous frame's endpoint and selector indices at this block.
+					assert(pred == CR_ENDPOINT_PRED_INDEX);
+					endpoint_index = prev_frame_indices[block_x][block_y];
+					selector_index = endpoint_index >> 16; // The selector index is packed into the upper 16 bits
+					endpoint_index &= 0xFFFFU; // The endpoint index is packed into the lower 16 bits
+				}
+				else
+				{
+					// Reuse the upper left block's endpoint index.
+					assert((block_x > 0) && (block_y > 0));
+					endpoint_index = block_endpoint_preds[cur_block_endpoint_pred_array ^ 1][block_x - 1].m_endpoint_index;
+				}
+			}
+			else
+			{
+				// We need to decode and apply a DPCM encoded delta to the previously used endpoint index.
+				// This uses the delta endpoint Huffman table (see section 9.0).
+				const uint32_t delta_sym = decode_huffman(delta_endpoint_model);
+
+				endpoint_index = delta_sym + prev_endpoint_index;
+
+				// Wrap around if the index goes beyond the end of the endpoint codebook
+				if (endpoint_index >= endpoints.size())
+					endpoint_index -= (int)endpoints.size();
+			}
+
+			// Remember the endpoint index we used on this block, so the next row can potentially reuse the index.
+			block_endpoint_preds[cur_block_endpoint_pred_array][block_x].m_endpoint_index = (uint16_t)endpoint_index;
+
+			// Remember the endpoint index used, so the next block can reuse it or apply a delta to it.
+			prev_endpoint_index = endpoint_index;
+
+			// Now we have fully decoded the ETC1S endpoint codebook index, in endpoint_index.
+
+			// Now decode the selector index (see the next block of code, below).
+			< selector decoding - see below >
+
+		} // block_x
+	} // block_y
+
+The compressed format allows the encoder to reuse the endpoint index used by
+the previous block, the block immediately above the current block, or the
+block to the upper left (if the file is not texture video). Alternately, the
+encoder can send a Huffman coded DPCM encoded index relative to the
+previously used endpoint index.
+
+Which type of prediction was used by the encoder is controlled by the "endpoint
+pred" (endpoint prediction) indices, which are sent with Huffman coding (using
+the "endpoint_pred_model" table described in Section 9.0) once every 2x2 blocks.
+
+For texture video, the endpoint prediction symbol normally used to refer to the
+upper left block (endpoint pred index 2) instead indicates that both the
+endpoint and selector indices from the previous frame's block should be reused
+on the current frame's block. The endpoint pred indices are RLE coded, so this
+allows the encoder to efficiently skip over a large number of unchanged blocks
+in a video sequence.
+
+The code to decode the selector codebook index immediately follows the code above for decoding the endpoint indices:
+
+	const uint32_t MAX_SELECTOR_HISTORY_BUF_SIZE = 64;
+	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH = 3; // Bias added to every coded run length
+	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_BITS = 6;
+	const uint32_t SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL = (1 << SELECTOR_HISTORY_BUF_RLE_COUNT_BITS);
+
+	// Decode selector index, unless it's texture video and the endpoint predictor indicated that the
+	// block's endpoints were reused from the previous frame.
+	if ((!is_video) || (pred != CR_ENDPOINT_PRED_INDEX))
+	{
+		int selector_sym;
+
+		// Are we in a selector RLE run? NOTE(review): cur_selector_rle_count must persist across blocks but is never declared in this spec - presumably it starts at 0; confirm.
+		if (cur_selector_rle_count > 0)
+		{
+			// Handle selector RLE run.
+			cur_selector_rle_count--;
+
+			selector_sym = (int)selectors.size(); // Every block in a run refers to history buffer entry 0
+		}
+		else
+		{
+			// Decode the selector symbol, using the selector Huffman table (see section 9.0).
+			selector_sym = decode_huffman(selector_model);
+
+			// Is it a run? NOTE(review): SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX is not defined in this spec - presumably selectors.size() + m_selector_history_buf_size; confirm.
+			if (selector_sym == static_cast<int>(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX))
+			{
+				// Decode the selector run's size, using the selector history buf RLE Huffman table (see section 9.0).
+				int run_sym = decode_huffman(selector_history_buf_rle_model);
+
+				// Is it a very long run? If so, the largest symbol is an escape and the count follows using VLC decoding (see section 10.2).
+				if (run_sym == (SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
+					cur_selector_rle_count = decode_vlc(7) + SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+				else
+					cur_selector_rle_count = run_sym + SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+
+				selector_sym = (int)selectors.size(); // Every block in a run refers to history buffer entry 0
+
+				cur_selector_rle_count--; // The current block is the first block of the run
+			}
+		}
+
+		// Is it a reference into the selector history buffer?
+		if (selector_sym >= (int)selectors.size())
+		{
+			assert(m_selector_history_buf_size > 0);
+
+			// Compute the history buffer index
+			int history_buf_index = selector_sym - (int)selectors.size();
+
+			assert(history_buf_index < (int)selector_history_buf.size());
+
+			// Access the history buffer
+			selector_index = selector_history_buf[history_buf_index];
+
+			// Update the history buffer
+			if (history_buf_index != 0)
+				selector_history_buf.use(history_buf_index);
+		}
+		else
+		{
+			// It's an index into the selector codebook
+			selector_index = selector_sym;
+
+			// Add it to the selector history buffer
+			if (m_selector_history_buf_size)
+				selector_history_buf.add(selector_index);
+		}
+	}
+
+	// For texture video, remember the endpoint and selector indices used by the block on this frame, for later reuse on the next frame.
+	if (is_video)
+		prev_frame_indices[block_x][block_y] = endpoint_index | (selector_index << 16);
+
+	// The block is fully decoded here. The codebook indices are endpoint_index and selector_index.
+	// Make sure they are valid
+	assert((endpoint_index < endpoints.size()) && (selector_index < selectors.size()));
+
+At this point, the decoder has both the endpoint and selector codebook indices.
+It can now fetch the endpoints/selectors from the codebooks and write out ETC1S
+texture data, or it can transcode the ETC1S data to another texture format.