// std/png/decode_filter_x86_sse42.wuffs - external/github.com/google/wuffs - Git at Google

 // Copyright 2021 The Wuffs Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // --------

 // Filter 1: Sub.

 // This (filter = 1, distance = 3) implementation doesn't actually bench faster
 // than the non-SIMD one.
 //
 // pri func decoder.filter_1_distance_3_x86_sse42!(curr: slice base.u8),
 //     choose cpu_arch >= x86_sse42,
 // {
 //     var curr : slice base.u8
 //
 //     var util : base.x86_sse42_utility
 //     var x128 : base.x86_m128i
 //     var a128 : base.x86_m128i
 //
 //     iterate (curr = args.curr)(length: 4, advance: 3, unroll: 2) {
 //         x128 = util.make_m128i_single_u32(a: curr.peek_u32le())
 //         x128 = x128._mm_add_epi8(b: a128)
 //         a128 = x128
 //         curr.poke_u24le!(a: x128.truncate_u32())
 //     } else (length: 3, advance: 3, unroll: 1) {
 //         x128 = util.make_m128i_single_u32(a: curr.peek_u24le_as_u32())
 //         x128 = x128._mm_add_epi8(b: a128)
 //         curr.poke_u24le!(a: x128.truncate_u32())
 //     }
 // }
 //
 // Note that "more SIMD" doesn't always mean faster compute. See
 // https://github.com/google/wuffs/commit/1660f9268621ed4415b3b363f0a0e1026d4aa83d
 // "Have std/png filter_1_distance_? use more SIMD" for a pessimizing example.

 pri func decoder.filter_1_distance_4_x86_sse42!(curr: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	// Filter 1 (Sub) for pixels that are 4 bytes apart: every 4-byte group of
 	// args.curr is replaced, in place, by its byte-wise sum with the group
 	// written just before it.
 	var row : slice base.u8

 	var util    : base.x86_sse42_utility
 	var left128 : base.x86_m128i
 	var sum128  : base.x86_m128i

 	iterate (row = args.curr)(length: 4, advance: 4, unroll: 2) {
 		// left128 is not assigned before the first step, so the first
 		// pixel is added to whatever a fresh local holds (presumably zero).
 		sum128 = util.make_m128i_single_u32(a: row.peek_u32le())._mm_add_epi8(b: left128)
 		row.poke_u32le!(a: sum128.truncate_u32())

 		// The pixel just written is the left neighbor of the next one.
 		left128 = sum128
 	}
 }

 // --------

 // Filter 3: Average.

 // Similar to filter_1_distance_3_x86_sse42, the SIMD implementation for
 // (filter = 3, distance = 3) doesn't actually bench faster than non-SIMD.
 //
 // pri func decoder.filter_3_distance_3_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
 //     choose cpu_arch >= x86_sse42,
 // {
 //     etc
 // }

 pri func decoder.filter_3_distance_4_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	// Filter 3 (Average) for pixels that are 4 bytes apart. Each 4-byte
 	// residual in args.curr is replaced, in place, by the residual plus the
 	// rounded-down average of the pixel to its left and the pixel above it
 	// (taken from args.prev). An empty args.prev is handled separately.
 	var row : slice base.u8
 	var up  : slice base.u8

 	var util     : base.x86_sse42_utility
 	var pixel128 : base.x86_m128i
 	var left128  : base.x86_m128i
 	var above128 : base.x86_m128i
 	var pred128  : base.x86_m128i
 	var mask128  : base.x86_m128i
 	var fix128   : base.x86_m128i

 	if args.prev.length() == 0 {
 		// There is no row above: above128 is never assigned in this branch
 		// and stays zero, so the predictor is half of left128, rounded down.
 		mask128 = util.make_m128i_repeat_u8(a: 0xFE)
 		iterate (row = args.curr)(length: 4, advance: 4, unroll: 2) {
 			// _mm_avg_epu8 rounds up. Clearing the low bit of every byte of
 			// left128 first makes the rounding direction irrelevant.
 			pred128 = left128._mm_and_si128(b: mask128)
 			pred128 = pred128._mm_avg_epu8(b: above128)

 			// Residual plus predictor gives the pixel, which is stored and
 			// then becomes the left neighbor for the next step.
 			pixel128 = util.make_m128i_single_u32(a: row.peek_u32le())._mm_add_epi8(b: pred128)
 			row.poke_u32le!(a: pixel128.truncate_u32())
 			left128 = pixel128
 		}

 	} else {
 		mask128 = util.make_m128i_repeat_u8(a: 0x01)
 		iterate (row = args.curr, up = args.prev)(length: 4, advance: 4, unroll: 2) {
 			// The pixel from the row above.
 			above128 = util.make_m128i_single_u32(a: up.peek_u32le())

 			// _mm_avg_epu8 rounds up but the PNG filter rounds down. The
 			// two differ by one exactly when the low bits of the operands
 			// differ, i.e. by the low bit of each byte of their xor.
 			fix128 = left128._mm_xor_si128(b: above128)
 			fix128 = mask128._mm_and_si128(b: fix128)
 			pred128 = left128._mm_avg_epu8(b: above128)._mm_sub_epi8(b: fix128)

 			// Residual plus predictor gives the pixel, which is stored and
 			// then becomes the left neighbor for the next step.
 			pixel128 = util.make_m128i_single_u32(a: row.peek_u32le())._mm_add_epi8(b: pred128)
 			row.poke_u32le!(a: pixel128.truncate_u32())
 			left128 = pixel128
 		}
 	}
 }

 // --------

 // Filter 4: Paeth.

 pri func decoder.filter_4_distance_3_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	// Filter 4 (Paeth) for pixels that are 3 bytes apart. Each 3-byte
 	// residual in args.curr has its predictor added to it and is overwritten
 	// with the result. args.prev supplies the pixels from the row above.
 	//
 	// See the comments in filter_4_distance_4_x86_sse42 for an explanation of
 	// how this works. That function's single loop is copied twice here, once
 	// with "length: 4" and once with "length: 3". It's generally faster to
 	// load 4 bytes at a time instead of 3.
 	//
 	// Differences between that function and this one are marked with a §.

 	var curr : slice base.u8
 	var prev : slice base.u8

 	// Roles, as established by the loop bodies below:
 	//  - x128 is the current pixel (residual in, reconstructed pixel out).
 	//  - a128 is the previous x128 (the pixel to the left).
 	//  - b128 is the pixel loaded from prev (the pixel above).
 	//  - c128 is the previous b128 (the pixel above and to the left).
 	//  - z128 is never assigned; it is the second operand of the u8 to i16
 	//    unpacking.
 	var util        : base.x86_sse42_utility
 	var x128        : base.x86_m128i
 	var a128        : base.x86_m128i
 	var b128        : base.x86_m128i
 	var c128        : base.x86_m128i
 	var p128        : base.x86_m128i
 	var pa128       : base.x86_m128i
 	var pb128       : base.x86_m128i
 	var pc128       : base.x86_m128i
 	var smallest128 : base.x86_m128i
 	var z128        : base.x86_m128i

 	// § The advance is 3, not 4. Each step loads 4 bytes but only the low 3
 	// are stored, so the fourth lane is computed and then discarded.
 	iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 3, unroll: 2) {
 		// Load the pixel above and widen its bytes to 16-bit lanes.
 		b128 = util.make_m128i_single_u32(a: prev.peek_u32le())
 		b128 = b128._mm_unpacklo_epi8(b: z128)

 		// pa128 = (b128 - c128), pb128 = (a128 - c128) and pc128 is their
 		// sum; then take absolute values and their lane-wise minimum.
 		pa128 = b128._mm_sub_epi16(b: c128)
 		pb128 = a128._mm_sub_epi16(b: c128)
 		pc128 = pa128._mm_add_epi16(b: pb128)
 		pa128 = pa128._mm_abs_epi16()
 		pb128 = pb128._mm_abs_epi16()
 		pc128 = pc128._mm_abs_epi16()
 		smallest128 = pc128._mm_min_epi16(b: pb128._mm_min_epi16(b: pa128))

 		// Start from c128, switch to b128 where pb128 is the smallest, then
 		// to a128 where pa128 is the smallest. The a128 blend is applied
 		// last, so it wins ties.
 		p128 = c128._mm_blendv_epi8(
 			b: b128,
 			mask: smallest128._mm_cmpeq_epi16(b: pb128))._mm_blendv_epi8(
 			b: a128,
 			mask: smallest128._mm_cmpeq_epi16(b: pa128))

 		// Add the predictor to the residual. a128 and c128 are captured
 		// while x128 and b128 are still in their widened (16-bit lane) form,
 		// before x128 is packed back down to bytes.
 		x128 = util.make_m128i_single_u32(a: curr.peek_u32le())
 		x128 = x128._mm_unpacklo_epi8(b: z128)
 		x128 = x128._mm_add_epi8(b: p128)
 		a128 = x128
 		c128 = b128
 		x128 = x128._mm_packus_epi16(b: x128)
 		// § poke_u24le replaces poke_u32le.
 		curr.poke_u24le!(a: x128.truncate_u32())

 		// § The length and advance are both 3, not 4.
 	} else (length: 3, advance: 3, unroll: 1) {
 		// Same computation as above, but loading only 3 bytes at a time.
 		// § peek_u24le_as_u32 replaces peek_u32le.
 		b128 = util.make_m128i_single_u32(a: prev.peek_u24le_as_u32())
 		b128 = b128._mm_unpacklo_epi8(b: z128)
 		pa128 = b128._mm_sub_epi16(b: c128)
 		pb128 = a128._mm_sub_epi16(b: c128)
 		pc128 = pa128._mm_add_epi16(b: pb128)
 		pa128 = pa128._mm_abs_epi16()
 		pb128 = pb128._mm_abs_epi16()
 		pc128 = pc128._mm_abs_epi16()
 		smallest128 = pc128._mm_min_epi16(b: pb128._mm_min_epi16(b: pa128))
 		p128 = c128._mm_blendv_epi8(
 			b: b128,
 			mask: smallest128._mm_cmpeq_epi16(b: pb128))._mm_blendv_epi8(
 			b: a128,
 			mask: smallest128._mm_cmpeq_epi16(b: pa128))
 		// § peek_u24le_as_u32 replaces peek_u32le.
 		x128 = util.make_m128i_single_u32(a: curr.peek_u24le_as_u32())
 		x128 = x128._mm_unpacklo_epi8(b: z128)
 		x128 = x128._mm_add_epi8(b: p128)
 		// § These assignments are unnecessary; this is the last iteration.
 		// a128 = x128
 		// c128 = b128
 		x128 = x128._mm_packus_epi16(b: x128)
 		// § poke_u24le replaces poke_u32le.
 		curr.poke_u24le!(a: x128.truncate_u32())
 	}
 }

 pri func decoder.filter_4_distance_4_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
 	choose cpu_arch >= x86_sse42,
 {
 	// Filter 4 (Paeth) for pixels that are 4 bytes apart. Each 4-byte
 	// residual in args.curr has its predictor added to it and is overwritten
 	// with the result. args.prev supplies the pixels from the row above.
 	//
 	// filter_4_distance_3_x86_sse42 refers to the comments in this function
 	// and mirrors this loop body, so keep the two in step.

 	var curr : slice base.u8
 	var prev : slice base.u8

 	// Roles, as established by the loop body below:
 	//  - x128 is the current pixel (residual in, reconstructed pixel out).
 	//  - a128 is the previous x128 (the pixel to the left).
 	//  - b128 is the pixel loaded from prev (the pixel above).
 	//  - c128 is the previous b128 (the pixel above and to the left).
 	//  - z128 is never assigned; it supplies the zeroes for the u8 to i16
 	//    unpacking.
 	var util        : base.x86_sse42_utility
 	var x128        : base.x86_m128i
 	var a128        : base.x86_m128i
 	var b128        : base.x86_m128i
 	var c128        : base.x86_m128i
 	var p128        : base.x86_m128i
 	var pa128       : base.x86_m128i
 	var pb128       : base.x86_m128i
 	var pc128       : base.x86_m128i
 	var smallest128 : base.x86_m128i
 	var z128        : base.x86_m128i

 	iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 4, unroll: 2) {
 		// Load the pixel from the row above.
 		b128 = util.make_m128i_single_u32(a: prev.peek_u32le())

 		// Convert from u8 to i16 by unpacking it with zeroes.
 		b128 = b128._mm_unpacklo_epi8(b: z128)

 		// Compute:
 		//  - pa128 = (m128 - a128)
 		//  - pb128 = (m128 - b128)
 		//  - pc128 = (m128 - c128)
 		// where m128 = (a128 + b128 - c128).
 		//
 		// Expanding m128, these are (b128 - c128), (a128 - c128) and the sum
 		// of those two, which is what the code below evaluates.
 		pa128 = b128._mm_sub_epi16(b: c128)
 		pb128 = a128._mm_sub_epi16(b: c128)
 		pc128 = pa128._mm_add_epi16(b: pb128)

 		// Compute the smallest absolute value of pa128, pb128 and pc128.
 		pa128 = pa128._mm_abs_epi16()
 		pb128 = pb128._mm_abs_epi16()
 		pc128 = pc128._mm_abs_epi16()
 		smallest128 = pc128._mm_min_epi16(b: pb128._mm_min_epi16(b: pa128))

 		// The predictor, p128, is whichever of a128, b128 or c128 such that
 		// pa128, pb128 or pc128 matches this smallest absolute value. Ties are
 		// broken in favor of a128 then b128 then c128.
 		//
 		// The a._mm_blendv_epi8(b, mask) method picks b when mask is true.
 		// The a128 blend is applied last, which is what gives it priority.
 		p128 = c128._mm_blendv_epi8(
 			b: b128,
 			mask: smallest128._mm_cmpeq_epi16(b: pb128))._mm_blendv_epi8(
 			b: a128,
 			mask: smallest128._mm_cmpeq_epi16(b: pa128))

 		// Add the predictor to the residual and, for the next iteration, set
 		// its previous pixels, a128 and c128, to x128 and b128.
 		//
 		// Again, convert between u8 and i16 by unpacking and re-packing.
 		//
 		// Note that the add is _mm_add_epi8, not _mm_add_epi16, even though
 		// the lanes are 16 bits wide at this point. NOTE(review): presumably
 		// this is deliberate, so that each sum wraps within its low byte and
 		// the high byte of every lane stays zero - confirm against the Intel
 		// intrinsic semantics.
 		x128 = util.make_m128i_single_u32(a: curr.peek_u32le())
 		x128 = x128._mm_unpacklo_epi8(b: z128)
 		x128 = x128._mm_add_epi8(b: p128)
 		a128 = x128
 		c128 = b128
 		x128 = x128._mm_packus_epi16(b: x128)
 		curr.poke_u32le!(a: x128.truncate_u32())
 	}
 }
	// Copyright 2021 The Wuffs Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// --------

	// Filter 1: Sub.

	// This (filter = 1, distance = 3) implementation doesn't actually bench faster
	// than the non-SIMD one.
	//
	// pri func decoder.filter_1_distance_3_x86_sse42!(curr: slice base.u8),
	// choose cpu_arch >= x86_sse42,
	// {
	// var curr : slice base.u8
	//
	// var util : base.x86_sse42_utility
	// var x128 : base.x86_m128i
	// var a128 : base.x86_m128i
	//
	// iterate (curr = args.curr)(length: 4, advance: 3, unroll: 2) {
	// x128 = util.make_m128i_single_u32(a: curr.peek_u32le())
	// x128 = x128._mm_add_epi8(b: a128)
	// a128 = x128
	// curr.poke_u24le!(a: x128.truncate_u32())
	// } else (length: 3, advance: 3, unroll: 1) {
	// x128 = util.make_m128i_single_u32(a: curr.peek_u24le_as_u32())
	// x128 = x128._mm_add_epi8(b: a128)
	// curr.poke_u24le!(a: x128.truncate_u32())
	// }
	// }
	//
	// Note that "more SIMD" doesn't always mean faster compute. See
	// https://github.com/google/wuffs/commit/1660f9268621ed4415b3b363f0a0e1026d4aa83d
	// "Have std/png filter_1_distance_? use more SIMD" for a pessimizing example.

	pri func decoder.filter_1_distance_4_x86_sse42!(curr: slice base.u8),
		choose cpu_arch >= x86_sse42,
	{
		// Filter 1 (Sub) for pixels that are 4 bytes apart: every 4-byte group
		// of args.curr is replaced, in place, by its byte-wise sum with the
		// group written just before it.
		var row : slice base.u8

		var util    : base.x86_sse42_utility
		var left128 : base.x86_m128i
		var sum128  : base.x86_m128i

		iterate (row = args.curr)(length: 4, advance: 4, unroll: 2) {
			// left128 is not assigned before the first step, so the first
			// pixel is added to whatever a fresh local holds (presumably
			// zero).
			sum128 = util.make_m128i_single_u32(a: row.peek_u32le())._mm_add_epi8(b: left128)
			row.poke_u32le!(a: sum128.truncate_u32())

			// The pixel just written is the left neighbor of the next one.
			left128 = sum128
		}
	}

	// --------

	// Filter 3: Average.

	// Similar to filter_1_distance_3_x86_sse42, the SIMD implementation for
	// (filter = 3, distance = 3) doesn't actually bench faster than non-SIMD.
	//
	// pri func decoder.filter_3_distance_3_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
	// choose cpu_arch >= x86_sse42,
	// {
	// etc
	// }

	pri func decoder.filter_3_distance_4_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
		choose cpu_arch >= x86_sse42,
	{
		// Filter 3 (Average) for pixels that are 4 bytes apart. Each 4-byte
		// residual in args.curr is replaced, in place, by the residual plus
		// the rounded-down average of the pixel to its left and the pixel
		// above it (taken from args.prev). An empty args.prev is handled
		// separately.
		var row : slice base.u8
		var up  : slice base.u8

		var util     : base.x86_sse42_utility
		var pixel128 : base.x86_m128i
		var left128  : base.x86_m128i
		var above128 : base.x86_m128i
		var pred128  : base.x86_m128i
		var mask128  : base.x86_m128i
		var fix128   : base.x86_m128i

		if args.prev.length() == 0 {
			// There is no row above: above128 is never assigned in this
			// branch and stays zero, so the predictor is half of left128,
			// rounded down.
			mask128 = util.make_m128i_repeat_u8(a: 0xFE)
			iterate (row = args.curr)(length: 4, advance: 4, unroll: 2) {
				// _mm_avg_epu8 rounds up. Clearing the low bit of every byte
				// of left128 first makes the rounding direction irrelevant.
				pred128 = left128._mm_and_si128(b: mask128)
				pred128 = pred128._mm_avg_epu8(b: above128)

				// Residual plus predictor gives the pixel, which is stored
				// and then becomes the left neighbor for the next step.
				pixel128 = util.make_m128i_single_u32(a: row.peek_u32le())._mm_add_epi8(b: pred128)
				row.poke_u32le!(a: pixel128.truncate_u32())
				left128 = pixel128
			}

		} else {
			mask128 = util.make_m128i_repeat_u8(a: 0x01)
			iterate (row = args.curr, up = args.prev)(length: 4, advance: 4, unroll: 2) {
				// The pixel from the row above.
				above128 = util.make_m128i_single_u32(a: up.peek_u32le())

				// _mm_avg_epu8 rounds up but the PNG filter rounds down. The
				// two differ by one exactly when the low bits of the
				// operands differ, i.e. by the low bit of each byte of their
				// xor.
				fix128 = left128._mm_xor_si128(b: above128)
				fix128 = mask128._mm_and_si128(b: fix128)
				pred128 = left128._mm_avg_epu8(b: above128)._mm_sub_epi8(b: fix128)

				// Residual plus predictor gives the pixel, which is stored
				// and then becomes the left neighbor for the next step.
				pixel128 = util.make_m128i_single_u32(a: row.peek_u32le())._mm_add_epi8(b: pred128)
				row.poke_u32le!(a: pixel128.truncate_u32())
				left128 = pixel128
			}
		}
	}

	// --------

	// Filter 4: Paeth.

	pri func decoder.filter_4_distance_3_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
	choose cpu_arch >= x86_sse42,
	{
	// Filter 4 (Paeth) for pixels that are 3 bytes apart. Each 3-byte
	// residual in args.curr has its predictor added to it and is overwritten
	// with the result. args.prev supplies the pixels from the row above.
	//
	// See the comments in filter_4_distance_4_x86_sse42 for an explanation of
	// how this works. That function's single loop is copied twice here, once
	// with "length: 4" and once with "length: 3". It's generally faster to
	// load 4 bytes at a time instead of 3.
	//
	// Differences between that function and this one are marked with a §.

	var curr : slice base.u8
	var prev : slice base.u8

	// Roles, as established by the loop bodies below:
	// - x128 is the current pixel (residual in, reconstructed pixel out).
	// - a128 is the previous x128 (the pixel to the left).
	// - b128 is the pixel loaded from prev (the pixel above).
	// - c128 is the previous b128 (the pixel above and to the left).
	// - z128 is never assigned; it is the second operand of the u8 to i16
	// unpacking.
	var util : base.x86_sse42_utility
	var x128 : base.x86_m128i
	var a128 : base.x86_m128i
	var b128 : base.x86_m128i
	var c128 : base.x86_m128i
	var p128 : base.x86_m128i
	var pa128 : base.x86_m128i
	var pb128 : base.x86_m128i
	var pc128 : base.x86_m128i
	var smallest128 : base.x86_m128i
	var z128 : base.x86_m128i

	// § The advance is 3, not 4. Each step loads 4 bytes but only the low 3
	// are stored, so the fourth lane is computed and then discarded.
	iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 3, unroll: 2) {
	// Load the pixel above and widen its bytes to 16-bit lanes.
	b128 = util.make_m128i_single_u32(a: prev.peek_u32le())
	b128 = b128._mm_unpacklo_epi8(b: z128)

	// pa128 = (b128 - c128), pb128 = (a128 - c128) and pc128 is their sum;
	// then take absolute values and their lane-wise minimum.
	pa128 = b128._mm_sub_epi16(b: c128)
	pb128 = a128._mm_sub_epi16(b: c128)
	pc128 = pa128._mm_add_epi16(b: pb128)
	pa128 = pa128._mm_abs_epi16()
	pb128 = pb128._mm_abs_epi16()
	pc128 = pc128._mm_abs_epi16()
	smallest128 = pc128._mm_min_epi16(b: pb128._mm_min_epi16(b: pa128))

	// Start from c128, switch to b128 where pb128 is the smallest, then to
	// a128 where pa128 is the smallest. The a128 blend is applied last, so
	// it wins ties.
	p128 = c128._mm_blendv_epi8(
	b: b128,
	mask: smallest128._mm_cmpeq_epi16(b: pb128))._mm_blendv_epi8(
	b: a128,
	mask: smallest128._mm_cmpeq_epi16(b: pa128))

	// Add the predictor to the residual. a128 and c128 are captured while
	// x128 and b128 are still in their widened (16-bit lane) form, before
	// x128 is packed back down to bytes.
	x128 = util.make_m128i_single_u32(a: curr.peek_u32le())
	x128 = x128._mm_unpacklo_epi8(b: z128)
	x128 = x128._mm_add_epi8(b: p128)
	a128 = x128
	c128 = b128
	x128 = x128._mm_packus_epi16(b: x128)
	// § poke_u24le replaces poke_u32le.
	curr.poke_u24le!(a: x128.truncate_u32())

	// § The length and advance are both 3, not 4.
	} else (length: 3, advance: 3, unroll: 1) {
	// Same computation as above, but loading only 3 bytes at a time.
	// § peek_u24le_as_u32 replaces peek_u32le.
	b128 = util.make_m128i_single_u32(a: prev.peek_u24le_as_u32())
	b128 = b128._mm_unpacklo_epi8(b: z128)
	pa128 = b128._mm_sub_epi16(b: c128)
	pb128 = a128._mm_sub_epi16(b: c128)
	pc128 = pa128._mm_add_epi16(b: pb128)
	pa128 = pa128._mm_abs_epi16()
	pb128 = pb128._mm_abs_epi16()
	pc128 = pc128._mm_abs_epi16()
	smallest128 = pc128._mm_min_epi16(b: pb128._mm_min_epi16(b: pa128))
	p128 = c128._mm_blendv_epi8(
	b: b128,
	mask: smallest128._mm_cmpeq_epi16(b: pb128))._mm_blendv_epi8(
	b: a128,
	mask: smallest128._mm_cmpeq_epi16(b: pa128))
	// § peek_u24le_as_u32 replaces peek_u32le.
	x128 = util.make_m128i_single_u32(a: curr.peek_u24le_as_u32())
	x128 = x128._mm_unpacklo_epi8(b: z128)
	x128 = x128._mm_add_epi8(b: p128)
	// § These assignments are unnecessary; this is the last iteration.
	// a128 = x128
	// c128 = b128
	x128 = x128._mm_packus_epi16(b: x128)
	// § poke_u24le replaces poke_u32le.
	curr.poke_u24le!(a: x128.truncate_u32())
	}
	}

	pri func decoder.filter_4_distance_4_x86_sse42!(curr: slice base.u8, prev: slice base.u8),
	choose cpu_arch >= x86_sse42,
	{
	// Filter 4 (Paeth) for pixels that are 4 bytes apart. Each 4-byte
	// residual in args.curr has its predictor added to it and is overwritten
	// with the result. args.prev supplies the pixels from the row above.
	//
	// filter_4_distance_3_x86_sse42 refers to the comments in this function
	// and mirrors this loop body, so keep the two in step.

	var curr : slice base.u8
	var prev : slice base.u8

	// Roles, as established by the loop body below:
	// - x128 is the current pixel (residual in, reconstructed pixel out).
	// - a128 is the previous x128 (the pixel to the left).
	// - b128 is the pixel loaded from prev (the pixel above).
	// - c128 is the previous b128 (the pixel above and to the left).
	// - z128 is never assigned; it supplies the zeroes for the u8 to i16
	// unpacking.
	var util : base.x86_sse42_utility
	var x128 : base.x86_m128i
	var a128 : base.x86_m128i
	var b128 : base.x86_m128i
	var c128 : base.x86_m128i
	var p128 : base.x86_m128i
	var pa128 : base.x86_m128i
	var pb128 : base.x86_m128i
	var pc128 : base.x86_m128i
	var smallest128 : base.x86_m128i
	var z128 : base.x86_m128i

	iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 4, unroll: 2) {
	// Load the pixel from the row above.
	b128 = util.make_m128i_single_u32(a: prev.peek_u32le())

	// Convert from u8 to i16 by unpacking it with zeroes.
	b128 = b128._mm_unpacklo_epi8(b: z128)

	// Compute:
	// - pa128 = (m128 - a128)
	// - pb128 = (m128 - b128)
	// - pc128 = (m128 - c128)
	// where m128 = (a128 + b128 - c128).
	//
	// Expanding m128, these are (b128 - c128), (a128 - c128) and the sum of
	// those two, which is what the code below evaluates.
	pa128 = b128._mm_sub_epi16(b: c128)
	pb128 = a128._mm_sub_epi16(b: c128)
	pc128 = pa128._mm_add_epi16(b: pb128)

	// Compute the smallest absolute value of pa128, pb128 and pc128.
	pa128 = pa128._mm_abs_epi16()
	pb128 = pb128._mm_abs_epi16()
	pc128 = pc128._mm_abs_epi16()
	smallest128 = pc128._mm_min_epi16(b: pb128._mm_min_epi16(b: pa128))

	// The predictor, p128, is whichever of a128, b128 or c128 such that
	// pa128, pb128 or pc128 matches this smallest absolute value. Ties are
	// broken in favor of a128 then b128 then c128.
	//
	// The a._mm_blendv_epi8(b, mask) method picks b when mask is true.
	// The a128 blend is applied last, which is what gives it priority.
	p128 = c128._mm_blendv_epi8(
	b: b128,
	mask: smallest128._mm_cmpeq_epi16(b: pb128))._mm_blendv_epi8(
	b: a128,
	mask: smallest128._mm_cmpeq_epi16(b: pa128))

	// Add the predictor to the residual and, for the next iteration, set
	// its previous pixels, a128 and c128, to x128 and b128.
	//
	// Again, convert between u8 and i16 by unpacking and re-packing.
	//
	// Note that the add is _mm_add_epi8, not _mm_add_epi16, even though the
	// lanes are 16 bits wide at this point. NOTE(review): presumably this is
	// deliberate, so that each sum wraps within its low byte and the high
	// byte of every lane stays zero - confirm against the Intel intrinsic
	// semantics.
	x128 = util.make_m128i_single_u32(a: curr.peek_u32le())
	x128 = x128._mm_unpacklo_epi8(b: z128)
	x128 = x128._mm_add_epi8(b: p128)
	a128 = x128
	c128 = b128
	x128 = x128._mm_packus_epi16(b: x128)
	curr.poke_u32le!(a: x128.truncate_u32())
	}
	}