// Copyright 2021 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// --------
// Compared to libpng's ARM NEON implementation, this implementation cannot
// assume that the row pointers are 16-byte aligned. libpng allocates its own
// buffers; Wuffs code cannot call malloc and instead uses whatever buffers it
// is given. Wuffs also uncompresses the entire zlib stream into one contiguous
// array. Rows are therefore not necessarily padded to alignment boundaries,
// especially considering the one additional filter byte per row.
//
// Wuffs' "cpu_arch >= arm_neon" requires __ARM_FEATURE_UNALIGNED but, even so,
// we can't e.g. cast a uint8_t* to a uint32_t* without proving alignment.
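//
// Instead, the loads and stores below go through the slice methods
// peek_u32le, poke_u32le and their 24-bit variants, which operate on byte
// slices and therefore carry no alignment requirement of their own;
// make_u32x2_repeat then broadcasts the loaded 32-bit value into both lanes
// of a 64-bit NEON register.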
// --------
// Filter 1: Sub.
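//
// For the Sub filter, each reconstructed byte is the filtered byte plus the
// corresponding byte of the pixel to its left: Recon(x) = Filt(x) + Recon(a),
// where a sits one filter distance (here 4 bytes) earlier in the row. The
// loop below keeps that left neighbor in fa and adds it lane-wise to each
// freshly loaded pixel.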
pri func decoder.filter_1_distance_4_arm_neon!(curr: slice base.u8),
    choose cpu_arch >= arm_neon,
{
    var curr : slice base.u8
    var util : base.arm_neon_utility
    var fa : base.arm_neon_u8x8
    var fx : base.arm_neon_u8x8

    iterate (curr = args.curr)(length: 4, advance: 4, unroll: 2) {
        fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
        fx = fx.vadd_u8(b: fa)
        curr.poke_u32le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
        fa = fx
    }
}

// --------
// Filter 3: Average.
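//
// For the Average filter, each reconstructed byte is the filtered byte plus
// the average, rounding down, of the byte to its left and the byte above it:
// Recon(x) = Filt(x) + floor((Recon(a) + Recon(b)) / 2). The vhadd_u8
// halving add computes that average without overflowing 8 bits. For the
// first row there is no "above" byte, so the first loop below leaves fb at
// its (all zeroes) initial value.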
pri func decoder.filter_3_distance_4_arm_neon!(curr: slice base.u8, prev: slice base.u8),
    choose cpu_arch >= arm_neon,
{
    var curr : slice base.u8
    var prev : slice base.u8
    var util : base.arm_neon_utility
    var fa : base.arm_neon_u8x8
    var fb : base.arm_neon_u8x8
    var fx : base.arm_neon_u8x8

    if args.prev.length() == 0 {
        iterate (curr = args.curr)(length: 4, advance: 4, unroll: 2) {
            fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
            fx = fx.vadd_u8(b: fa.vhadd_u8(b: fb))
            curr.poke_u32le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
            fa = fx
        }
    } else {
        iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 4, unroll: 2) {
            fb = util.make_u32x2_repeat(a: prev.peek_u32le()).as_u8x8()
            fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
            fx = fx.vadd_u8(b: fa.vhadd_u8(b: fb))
            curr.poke_u32le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
            fa = fx
        }
    }
}

// --------
// Filter 4: Paeth.
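//
// The Paeth predictor computes p = a + b - c, where a is the byte to the
// left, b the byte above and c the byte above-left, and then predicts
// whichever of a, b or c is closest to p, breaking ties in the order a, b, c.
// Working with the absolute differences pa = |p - a| = |b - c|,
// pb = |p - b| = |a - c| and pc = |p - c| = |a + b - 2*c| avoids materializing
// p itself. Those sums and differences can exceed 8 bits, so they are
// computed in 16-bit lanes via the widening vaddl_u8 / vabdl_u8 instructions.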
pri func decoder.filter_4_distance_3_arm_neon!(curr: slice base.u8, prev: slice base.u8),
    choose cpu_arch >= arm_neon,
{
    // See the comments in filter_4_distance_4_arm_neon for an explanation of
    // how this works. That function's single loop is copied twice here, once
    // with "length: 4" and once with "length: 3". It's generally faster to
    // load 4 bytes at a time instead of 3.
    //
    // Differences between that function and this one are marked with a §.

    var curr : slice base.u8
    var prev : slice base.u8
    var util : base.arm_neon_utility
    var fa : base.arm_neon_u8x8
    var fb : base.arm_neon_u8x8
    var fc : base.arm_neon_u8x8
    var fx : base.arm_neon_u8x8
    var fafb : base.arm_neon_u16x8
    var fcfc : base.arm_neon_u16x8
    var pa : base.arm_neon_u16x8
    var pb : base.arm_neon_u16x8
    var pc : base.arm_neon_u16x8
    var cmpab : base.arm_neon_u16x8
    var cmpac : base.arm_neon_u16x8
    var picka : base.arm_neon_u8x8
    var pickb : base.arm_neon_u8x8

    // § The advance is 3, not 4.
    iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 3, unroll: 2) {
        fb = util.make_u32x2_repeat(a: prev.peek_u32le()).as_u8x8()
        fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
        fafb = fa.vaddl_u8(b: fb)
        fcfc = fc.vaddl_u8(b: fc)
        pa = fb.vabdl_u8(b: fc)
        pb = fa.vabdl_u8(b: fc)
        pc = fafb.vabdq_u16(b: fcfc)
        cmpab = pa.vcleq_u16(b: pb)
        cmpac = pa.vcleq_u16(b: pc)
        picka = cmpab.vandq_u16(b: cmpac).vmovn_u16()
        pickb = pb.vcleq_u16(b: pc).vmovn_u16()
        fx = fx.vadd_u8(
            b: picka.vbsl_u8(b: fa,
                c: pickb.vbsl_u8(b: fb, c: fc)))
        // § poke_u24le replaces poke_u32le.
        curr.poke_u24le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
        fc = fb
        fa = fx
    } else (length: 3, advance: 3, unroll: 1) {
        // § peek_u24le_as_u32 replaces peek_u32le.
        fb = util.make_u32x2_repeat(a: prev.peek_u24le_as_u32()).as_u8x8()
        // § peek_u24le_as_u32 replaces peek_u32le.
        fx = util.make_u32x2_repeat(a: curr.peek_u24le_as_u32()).as_u8x8()
        fafb = fa.vaddl_u8(b: fb)
        fcfc = fc.vaddl_u8(b: fc)
        pa = fb.vabdl_u8(b: fc)
        pb = fa.vabdl_u8(b: fc)
        pc = fafb.vabdq_u16(b: fcfc)
        cmpab = pa.vcleq_u16(b: pb)
        cmpac = pa.vcleq_u16(b: pc)
        picka = cmpab.vandq_u16(b: cmpac).vmovn_u16()
        pickb = pb.vcleq_u16(b: pc).vmovn_u16()
        fx = fx.vadd_u8(
            b: picka.vbsl_u8(b: fa,
                c: pickb.vbsl_u8(b: fb, c: fc)))
        // § poke_u24le replaces poke_u32le.
        curr.poke_u24le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
        // § These assignments are unnecessary; this is the last iteration.
        // fc = fb
        // fa = fx
    }
}
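
// The vcleq_u16 comparisons below produce all-ones or all-zeros 16-bit lanes,
// which vmovn_u16 narrows to 8-bit masks. The nested vbsl_u8 (bitwise select)
// calls then pick, per byte, fa where picka is set, otherwise fb where pickb
// is set, otherwise fc, matching the Paeth tie-breaking order: a, then b,
// then c.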
pri func decoder.filter_4_distance_4_arm_neon!(curr: slice base.u8, prev: slice base.u8),
    choose cpu_arch >= arm_neon,
{
    var curr : slice base.u8
    var prev : slice base.u8
    var util : base.arm_neon_utility
    var fa : base.arm_neon_u8x8
    var fb : base.arm_neon_u8x8
    var fc : base.arm_neon_u8x8
    var fx : base.arm_neon_u8x8
    var fafb : base.arm_neon_u16x8
    var fcfc : base.arm_neon_u16x8
    var pa : base.arm_neon_u16x8
    var pb : base.arm_neon_u16x8
    var pc : base.arm_neon_u16x8
    var cmpab : base.arm_neon_u16x8
    var cmpac : base.arm_neon_u16x8
    var picka : base.arm_neon_u8x8
    var pickb : base.arm_neon_u8x8

    iterate (curr = args.curr, prev = args.prev)(length: 4, advance: 4, unroll: 2) {
        fb = util.make_u32x2_repeat(a: prev.peek_u32le()).as_u8x8()
        fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
        fafb = fa.vaddl_u8(b: fb)  // fafb = (fa + fb)
        fcfc = fc.vaddl_u8(b: fc)  // fcfc = (fc + fc)
        pa = fb.vabdl_u8(b: fc)  // pa = abs(fa + fb - fc - fa)
        pb = fa.vabdl_u8(b: fc)  // pb = abs(fa + fb - fc - fb)
        pc = fafb.vabdq_u16(b: fcfc)  // pc = abs(fa + fb - fc - fc)
        cmpab = pa.vcleq_u16(b: pb)  // cmpab = (pa <= pb)
        cmpac = pa.vcleq_u16(b: pc)  // cmpac = (pa <= pc)
        picka = cmpab.vandq_u16(b: cmpac).vmovn_u16()  // picka = ((pa <= pb) && (pa <= pc))
        pickb = pb.vcleq_u16(b: pc).vmovn_u16()  // pickb = (pb <= pc)
        // Add the predictor to the residual.
        fx = fx.vadd_u8(
            b: picka.vbsl_u8(b: fa,
                c: pickb.vbsl_u8(b: fb, c: fc)))
        curr.poke_u32le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
        fc = fb
        fa = fx
    }
}