| // Copyright 2021 The Wuffs Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // https://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // -------- |
| |
| // Filter 1: Sub. |
| |
| // This (filter = 1, distance = 3) implementation doesn't actually bench faster |
| // than the non-SIMD one. |
| // |
| // pri func decoder.filter_1_distance_3_sse128!(curr: slice base.u8), |
| // choose cpu_arch >= sse128, |
| // { |
| // var c : slice base.u8 |
| // var x128 : base.sse128_i |
| // var a128 : base.sse128_i |
| // |
| // iterate (c = args.curr)(length: 4, advance: 3, unroll: 1) { |
| // x128.load_u32!(a: c.peek_u32le()) |
| // x128 = x128._mm_add_epi8!(b: a128) |
| // a128 = x128 |
| // c.poke_u24le!(a: x128.truncate_u32()) |
| // } else (length: 3, advance: 3, unroll: 1) { |
| // x128.load_u32!(a: c.peek_u24le_as_u32()) |
| // x128 = x128._mm_add_epi8!(b: a128) |
| // c.poke_u24le!(a: x128.truncate_u32()) |
| // } |
| // } |
| |
| // filter_1_distance_4_sse128 undoes the Sub filter in place for rows whose |
| // pixels are 4 bytes wide. Walking args.curr in 4-byte steps, each pixel has |
| // the already-reconstructed pixel to its left added to it, byte by byte. |
| pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8), |
| choose cpu_arch >= sse128, |
| { |
| var pixel : slice base.u8 |
| var left128 : base.sse128_i |
| var sum128 : base.sse128_i |
| |
| iterate (pixel = args.curr)(length: 4, advance: 4, unroll: 1) { |
| // Fetch the 4-byte residual and add the left neighbor. left128 has |
| // not been assigned before the first iteration. |
| sum128.load_u32!(a: pixel.peek_u32le()) |
| sum128 = sum128._mm_add_epi8(b: left128) |
| |
| // Write the reconstructed pixel back, then remember it as the left |
| // neighbor for the next iteration. |
| pixel.poke_u32le!(a: sum128.truncate_u32()) |
| left128 = sum128 |
| } |
| } |
| |
| // -------- |
| |
| // Filter 3: Average. |
| |
| // Similar to filter_1_distance_3_sse128, the SIMD implementation for (filter = |
| // 3, distance = 3) doesn't actually bench faster than the non-SIMD one. |
| // |
| // pri func decoder.filter_3_distance_3_sse128!(curr: slice base.u8, prev: slice base.u8), |
| // choose cpu_arch >= sse128, |
| // { |
| // etc |
| // } |
| |
| // filter_3_distance_4_sse128 undoes the Average filter in place for rows |
| // whose pixels are 4 bytes wide. Each pixel of args.curr has a predictor |
| // added to it: the rounded-down average of the pixel to its left and the |
| // pixel above it (taken from args.prev). When args.prev is empty, the pixel |
| // above is treated as zero. |
| pri func decoder.filter_3_distance_4_sse128!(curr: slice base.u8, prev: slice base.u8), |
| choose cpu_arch >= sse128, |
| { |
| var cur : slice base.u8 |
| var above : slice base.u8 |
| var out128 : base.sse128_i |
| var left128 : base.sse128_i |
| var up128 : base.sse128_i |
| var pred128 : base.sse128_i |
| var mask128 : base.sse128_i |
| var fix128 : base.sse128_i |
| |
| if args.prev.length() <> 0 { |
| // There is a row above: mask128 selects the low bit of every byte. |
| mask128 = mask128.create_mm_set1_epi8(a: 0x01) |
| iterate (cur = args.curr, above = args.prev)(length: 4, advance: 4, unroll: 1) { |
| // Fetch the pixel directly above the current one. |
| up128.load_u32!(a: above.peek_u32le()) |
| |
| // _mm_avg_epu8 rounds up, whereas the PNG filter rounds down. |
| // The two differ by one exactly when the low bits of the two |
| // inputs differ, so fix128 holds the low bit of each byte of |
| // (left128 ^ up128). |
| fix128 = left128._mm_xor_si128(b: up128) |
| fix128 = mask128._mm_and_si128(b: fix128) |
| |
| // Rounded-up average, minus the correction, is the predictor. |
| pred128 = left128._mm_avg_epu8(b: up128) |
| pred128 = pred128._mm_sub_epi8(b: fix128) |
| |
| // Residual plus predictor is the reconstructed pixel. Store it |
| // and keep it as the left neighbor for the next iteration. |
| out128.load_u32!(a: cur.peek_u32le()) |
| out128 = out128._mm_add_epi8(b: pred128) |
| cur.poke_u32le!(a: out128.truncate_u32()) |
| left128 = out128 |
| } |
| |
| } else { |
| // No row above: mask128 clears the low bit of every byte. |
| mask128 = mask128.create_mm_set1_epi8(a: 0xFE) |
| iterate (cur = args.curr)(length: 4, advance: 4, unroll: 1) { |
| // up128 is never assigned in this branch, so the predictor is |
| // just half of left128, rounded down. Clearing the low bits |
| // first means that _mm_avg_epu8's rounding up has no effect. |
| pred128 = left128._mm_and_si128(b: mask128) |
| pred128 = pred128._mm_avg_epu8(b: up128) |
| |
| // Residual plus predictor is the reconstructed pixel. Store it |
| // and keep it as the left neighbor for the next iteration. |
| out128.load_u32!(a: cur.peek_u32le()) |
| out128 = out128._mm_add_epi8(b: pred128) |
| cur.poke_u32le!(a: out128.truncate_u32()) |
| left128 = out128 |
| } |
| } |
| } |