std/png/decode_filter_sse128.wuffs - external/github.com/google/wuffs - Git at Google

 // Copyright 2021 The Wuffs Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    https://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // --------

 // Filter 1: Sub.

 // This (filter = 1, distance = 3) implementation doesn't actually bench faster
 // than the non-SIMD one.
 //
 // pri func decoder.filter_1_distance_3_sse128!(curr: slice base.u8),
 //     choose cpu_arch >= sse128,
 // {
 //     var c    : slice base.u8
 //     var x128 : base.sse128_i
 //     var a128 : base.sse128_i
 //
 //     iterate (c = args.curr)(length: 4, advance: 3, unroll: 1) {
 //         x128.load_u32!(a: c.peek_u32le())
 //         x128 = x128._mm_add_epi8!(b: a128)
 //         a128 = x128
 //         c.poke_u24le!(a: x128.truncate_u32())
 //     } else (length: 3, advance: 3, unroll: 1) {
 //         x128.load_u32!(a: c.peek_u24le_as_u32())
 //         x128 = x128._mm_add_epi8!(b: a128)
 //         c.poke_u24le!(a: x128.truncate_u32())
 //     }
 // }

 // filter_1_distance_4_sse128 reverses PNG filter 1 (Sub) in place on
 // args.curr, one 4-byte group at a time: every group has the previously
 // reconstructed group (the one to its left) added to it, byte by byte.
 pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8),
 	choose cpu_arch >= sse128,
 {
 	var row     : slice base.u8
 	var left128 : base.sse128_i
 	var px128   : base.sse128_i

 	// left128 carries the most recently reconstructed group from one
 	// iteration to the next. It is not assigned before the loop, so the first
 	// group is added to the variable's default value (presumably all zeroes,
 	// assuming Wuffs zero-initializes locals).
 	iterate (row = args.curr)(length: 4, advance: 4, unroll: 1) {
 		// Fetch this group's four residual bytes.
 		px128.load_u32!(a: row.peek_u32le())

 		// Residual plus left neighbor, per byte, becomes both the output
 		// for this group and the left neighbor of the next one.
 		left128 = px128._mm_add_epi8(b: left128)
 		row.poke_u32le!(a: left128.truncate_u32())
 	}
 }

 // --------

 // Filter 3: Average.

 // Similar to filter_1_distance_3_sse128, the SIMD implementation for (filter =
 // 3, distance = 3) doesn't actually bench faster than the non-SIMD one.
 //
 // pri func decoder.filter_3_distance_3_sse128!(curr: slice base.u8, prev: slice base.u8),
 //     choose cpu_arch >= sse128,
 // {
 //     etc
 // }

 // filter_3_distance_4_sse128 reverses PNG filter 3 (Average) in place on
 // args.curr, working on one 4-byte pixel per loop iteration.
 //
 // Each 4-byte group of args.curr is read as a residual. A predictor is added
 // to it (per byte) and the sum is written back over the same 4 bytes. The
 // predictor is the per-byte average, rounded down, of:
 //  - a128, the pixel written by the previous iteration. It is not assigned
 //    before the loops, so the first iteration uses its default value
 //    (presumably zero, like b128 in the empty-prev branch).
 //  - b128, the matching 4 bytes of args.prev (the row above), or zero when
 //    args.prev is empty.
 pri func decoder.filter_3_distance_4_sse128!(curr: slice base.u8, prev: slice base.u8),
 	choose cpu_arch >= sse128,
 {
 	var c    : slice base.u8
 	var p    : slice base.u8
 	var x128 : base.sse128_i
 	var a128 : base.sse128_i
 	var b128 : base.sse128_i
 	var p128 : base.sse128_i
 	var k128 : base.sse128_i

 	// args.prev is empty: there is no row above to average with.
 	if args.prev.length() == 0 {
 		// k128 holds 0xFE in every byte; ANDing with it clears each byte's
 		// low bit.
 		k128 = k128.create_mm_set1_epi8(a: 0xFE)
 		iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
 			// The predictor, p128, is just half (rounded down) of the previous
 			// pixel, a128. In this branch, b128 stays zero so the average of
 			// a128 and b128 is just half of a128. _mm_avg_epu8 rounds up, but
 			// (a128 & 0xFE_repeated) takes out the low bits of a128's bytes.
 			p128 = a128._mm_and_si128(b: k128)._mm_avg_epu8(b: b128)

 			// Add the predictor to the residual and, for the next iteration,
 			// set its previous pixel, a128, to this one, x128.
 			x128.load_u32!(a: c.peek_u32le())
 			x128 = x128._mm_add_epi8(b: p128)
 			a128 = x128
 			c.poke_u32le!(a: x128.truncate_u32())
 		}

 	} else {
 		// k128 holds 0x01 in every byte; ANDing with it keeps only each
 		// byte's low bit, which is what the rounding correction below needs.
 		k128 = k128.create_mm_set1_epi8(a: 0x01)
 		// Both slices are stepped 4 bytes at a time, so p is the group of
 		// args.prev that corresponds to the group c of args.curr.
 		iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
 			// Load the pixel from the row above.
 			b128.load_u32!(a: p.peek_u32le())

 			// The predictor, p128, is the average (rounded down) of the
 			// previous pixel, a128, and the pixel above, b128.
 			p128 = a128._mm_avg_epu8(b: b128)

 			// Subtract a correction term because _mm_avg_epu8 rounds up but
 			// the PNG filter rounds down. The correction term is the low bit
 			// of each byte of (a128 ^ b128).
 			p128 = p128._mm_sub_epi8(b: k128._mm_and_si128(b: a128._mm_xor_si128(b: b128)))

 			// Add the predictor to the residual and, for the next iteration,
 			// set its previous pixel, a128, to this one, x128.
 			x128.load_u32!(a: c.peek_u32le())
 			x128 = x128._mm_add_epi8(b: p128)
 			a128 = x128
 			c.poke_u32le!(a: x128.truncate_u32())
 		}
 	}
 }
	// Copyright 2021 The Wuffs Authors.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// https://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// --------

	// Filter 1: Sub.

	// This (filter = 1, distance = 3) implementation doesn't actually bench faster
	// than the non-SIMD one.
	//
	// pri func decoder.filter_1_distance_3_sse128!(curr: slice base.u8),
	// choose cpu_arch >= sse128,
	// {
	// var c : slice base.u8
	// var x128 : base.sse128_i
	// var a128 : base.sse128_i
	//
	// iterate (c = args.curr)(length: 4, advance: 3, unroll: 1) {
	// x128.load_u32!(a: c.peek_u32le())
	// x128 = x128._mm_add_epi8!(b: a128)
	// a128 = x128
	// c.poke_u24le!(a: x128.truncate_u32())
	// } else (length: 3, advance: 3, unroll: 1) {
	// x128.load_u32!(a: c.peek_u24le_as_u32())
	// x128 = x128._mm_add_epi8!(b: a128)
	// c.poke_u24le!(a: x128.truncate_u32())
	// }
	// }

	// filter_1_distance_4_sse128 reverses PNG filter 1 (Sub) in place on
	// args.curr, one 4-byte group at a time: every group has the previously
	// reconstructed group (the one to its left) added to it, byte by byte.
	pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8),
		choose cpu_arch >= sse128,
	{
		var row     : slice base.u8
		var left128 : base.sse128_i
		var px128   : base.sse128_i

		// left128 carries the most recently reconstructed group from one
		// iteration to the next. It is not assigned before the loop, so the
		// first group is added to the variable's default value (presumably all
		// zeroes, assuming Wuffs zero-initializes locals).
		iterate (row = args.curr)(length: 4, advance: 4, unroll: 1) {
			// Fetch this group's four residual bytes.
			px128.load_u32!(a: row.peek_u32le())

			// Residual plus left neighbor, per byte, becomes both the output
			// for this group and the left neighbor of the next one.
			left128 = px128._mm_add_epi8(b: left128)
			row.poke_u32le!(a: left128.truncate_u32())
		}
	}

	// --------

	// Filter 3: Average.

	// Similar to filter_1_distance_3_sse128, the SIMD implementation for (filter =
	// 3, distance = 3) doesn't actually bench faster than the non-SIMD one.
	//
	// pri func decoder.filter_3_distance_3_sse128!(curr: slice base.u8, prev: slice base.u8),
	// choose cpu_arch >= sse128,
	// {
	// etc
	// }

	// filter_3_distance_4_sse128 reverses PNG filter 3 (Average) in place on
	// args.curr, working on one 4-byte pixel per loop iteration.
	//
	// Each 4-byte group of args.curr is read as a residual. A predictor is added
	// to it (per byte) and the sum is written back over the same 4 bytes. The
	// predictor is the per-byte average, rounded down, of:
	//  - a128, the pixel written by the previous iteration. It is not assigned
	//    before the loops, so the first iteration uses its default value
	//    (presumably zero, like b128 in the empty-prev branch).
	//  - b128, the matching 4 bytes of args.prev (the row above), or zero when
	//    args.prev is empty.
	pri func decoder.filter_3_distance_4_sse128!(curr: slice base.u8, prev: slice base.u8),
	choose cpu_arch >= sse128,
	{
	var c : slice base.u8
	var p : slice base.u8
	var x128 : base.sse128_i
	var a128 : base.sse128_i
	var b128 : base.sse128_i
	var p128 : base.sse128_i
	var k128 : base.sse128_i

	// args.prev is empty: there is no row above to average with.
	if args.prev.length() == 0 {
	// k128 holds 0xFE in every byte; ANDing with it clears each byte's low
	// bit.
	k128 = k128.create_mm_set1_epi8(a: 0xFE)
	iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
	// The predictor, p128, is just half (rounded down) of the previous
	// pixel, a128. In this branch, b128 stays zero so the average of
	// a128 and b128 is just half of a128. _mm_avg_epu8 rounds up, but
	// (a128 & 0xFE_repeated) takes out the low bits of a128's bytes.
	p128 = a128._mm_and_si128(b: k128)._mm_avg_epu8(b: b128)

	// Add the predictor to the residual and, for the next iteration,
	// set its previous pixel, a128, to this one, x128.
	x128.load_u32!(a: c.peek_u32le())
	x128 = x128._mm_add_epi8(b: p128)
	a128 = x128
	c.poke_u32le!(a: x128.truncate_u32())
	}

	} else {
	// k128 holds 0x01 in every byte; ANDing with it keeps only each byte's
	// low bit, which is what the rounding correction below needs.
	k128 = k128.create_mm_set1_epi8(a: 0x01)
	// Both slices are stepped 4 bytes at a time, so p is the group of
	// args.prev that corresponds to the group c of args.curr.
	iterate (c = args.curr, p = args.prev)(length: 4, advance: 4, unroll: 1) {
	// Load the pixel from the row above.
	b128.load_u32!(a: p.peek_u32le())

	// The predictor, p128, is the average (rounded down) of the
	// previous pixel, a128, and the pixel above, b128.
	p128 = a128._mm_avg_epu8(b: b128)

	// Subtract a correction term because _mm_avg_epu8 rounds up but
	// the PNG filter rounds down. The correction term is the low bit
	// of each byte of (a128 ^ b128).
	p128 = p128._mm_sub_epi8(b: k128._mm_and_si128(b: a128._mm_xor_si128(b: b128)))

	// Add the predictor to the residual and, for the next iteration,
	// set its previous pixel, a128, to this one, x128.
	x128.load_u32!(a: c.peek_u32le())
	x128 = x128._mm_add_epi8(b: p128)
	a128 = x128
	c.poke_u32le!(a: x128.truncate_u32())
	}
	}
	}