Factor out std/adler32/common_up_x86_sse42.wuffs
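
This is a pure code move: the SSE4.2 implementation of adler32.hasher.up
is deleted from std/adler32/common_adler32.wuffs and re-added, unchanged,
in its own std/adler32/common_up_x86_sse42.wuffs file (plus that file's
license header). In the generated release/c/wuffs-unsupported-snapshot.c,
the wuffs_adler32__hasher__up_sse42 declaration and definition move
accordingly, so that they now come after the ARM NEON variant instead of
before it.
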
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 88603c5..30da340 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -17658,13 +17658,6 @@
wuffs_adler32__hasher* self,
wuffs_base__slice_u8 a_x);
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-static wuffs_base__empty_struct
-wuffs_adler32__hasher__up_sse42(
- wuffs_adler32__hasher* self,
- wuffs_base__slice_u8 a_x);
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
static wuffs_base__empty_struct
wuffs_adler32__hasher__up_arm_neon(
@@ -17672,6 +17665,13 @@
wuffs_base__slice_u8 a_x);
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
+wuffs_adler32__hasher__up_sse42(
+ wuffs_adler32__hasher* self,
+ wuffs_base__slice_u8 a_x);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
// ---------------- VTables
const wuffs_base__hasher_u32__func_ptrs
@@ -17869,98 +17869,6 @@
return wuffs_base__make_empty_struct();
}
-// -------- func adler32.hasher.up_sse42
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
-static wuffs_base__empty_struct
-wuffs_adler32__hasher__up_sse42(
- wuffs_adler32__hasher* self,
- wuffs_base__slice_u8 a_x) {
- uint32_t v_s1 = 0;
- uint32_t v_s2 = 0;
- wuffs_base__slice_u8 v_remaining = {0};
- wuffs_base__slice_u8 v_p = {0};
- __m128i v_zeroes = {0};
- __m128i v_ones = {0};
- __m128i v_weights__left = {0};
- __m128i v_weights_right = {0};
- __m128i v_p__left = {0};
- __m128i v_p_right = {0};
- __m128i v_v1 = {0};
- __m128i v_v2 = {0};
- __m128i v_v2j = {0};
- __m128i v_v2k = {0};
- uint32_t v_num_iterate_bytes = 0;
- uint64_t v_tail_index = 0;
-
- v_zeroes = _mm_set1_epi16((int16_t)(0));
- v_ones = _mm_set1_epi16((int16_t)(1));
- v_weights__left = _mm_set_epi8((int8_t)(17), (int8_t)(18), (int8_t)(19), (int8_t)(20), (int8_t)(21), (int8_t)(22), (int8_t)(23), (int8_t)(24), (int8_t)(25), (int8_t)(26), (int8_t)(27), (int8_t)(28), (int8_t)(29), (int8_t)(30), (int8_t)(31), (int8_t)(32));
- v_weights_right = _mm_set_epi8((int8_t)(1), (int8_t)(2), (int8_t)(3), (int8_t)(4), (int8_t)(5), (int8_t)(6), (int8_t)(7), (int8_t)(8), (int8_t)(9), (int8_t)(10), (int8_t)(11), (int8_t)(12), (int8_t)(13), (int8_t)(14), (int8_t)(15), (int8_t)(16));
- v_s1 = ((self->private_impl.f_state) & 0xFFFF);
- v_s2 = ((self->private_impl.f_state) >> (32 - (16)));
- while (((uint64_t)(a_x.len)) > 0) {
- v_remaining = wuffs_base__slice_u8__subslice_j(a_x, 0);
- if (((uint64_t)(a_x.len)) > 5536) {
- v_remaining = wuffs_base__slice_u8__subslice_i(a_x, 5536);
- a_x = wuffs_base__slice_u8__subslice_j(a_x, 5536);
- }
- v_num_iterate_bytes = ((uint32_t)((((uint64_t)(a_x.len)) & 4294967264)));
- v_s2 += ((uint32_t)(v_s1 * v_num_iterate_bytes));
- v_v1 = _mm_setzero_si128();
- v_v2j = _mm_setzero_si128();
- v_v2k = _mm_setzero_si128();
- {
- wuffs_base__slice_u8 i_slice_p = a_x;
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 32;
- uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
- while (v_p.ptr < i_end0_p) {
- v_p__left = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_j(v_p, 16).ptr));
- v_p_right = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_ij(v_p, 16, 32).ptr));
- v_v2j = _mm_add_epi32(v_v2j, v_v1);
- v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p__left, v_zeroes));
- v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p_right, v_zeroes));
- v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p__left, v_weights__left)));
- v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p_right, v_weights_right)));
- v_p.ptr += 32;
- }
- v_p.len = 0;
- }
- v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(177)));
- v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(78)));
- v_s1 += ((uint32_t)(_mm_cvtsi128_si32(v_v1)));
- v_v2 = _mm_add_epi32(v_v2k, _mm_slli_epi32(v_v2j, (int32_t)(5)));
- v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(177)));
- v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(78)));
- v_s2 += ((uint32_t)(_mm_cvtsi128_si32(v_v2)));
- v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551584u);
- if (v_tail_index < ((uint64_t)(a_x.len))) {
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
- while (v_p.ptr < i_end0_p) {
- v_s1 += ((uint32_t)(v_p.ptr[0]));
- v_s2 += v_s1;
- v_p.ptr += 1;
- }
- v_p.len = 0;
- }
- }
- v_s1 %= 65521;
- v_s2 %= 65521;
- a_x = v_remaining;
- }
- self->private_impl.f_state = (((v_s2 & 65535) << 16) | (v_s1 & 65535));
- return wuffs_base__make_empty_struct();
-}
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
// -------- func adler32.hasher.up_arm_neon
#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
@@ -18065,6 +17973,98 @@
}
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+// -------- func adler32.hasher.up_sse42
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_adler32__hasher__up_sse42(
+ wuffs_adler32__hasher* self,
+ wuffs_base__slice_u8 a_x) {
+ uint32_t v_s1 = 0;
+ uint32_t v_s2 = 0;
+ wuffs_base__slice_u8 v_remaining = {0};
+ wuffs_base__slice_u8 v_p = {0};
+ __m128i v_zeroes = {0};
+ __m128i v_ones = {0};
+ __m128i v_weights__left = {0};
+ __m128i v_weights_right = {0};
+ __m128i v_p__left = {0};
+ __m128i v_p_right = {0};
+ __m128i v_v1 = {0};
+ __m128i v_v2 = {0};
+ __m128i v_v2j = {0};
+ __m128i v_v2k = {0};
+ uint32_t v_num_iterate_bytes = 0;
+ uint64_t v_tail_index = 0;
+
+ v_zeroes = _mm_set1_epi16((int16_t)(0));
+ v_ones = _mm_set1_epi16((int16_t)(1));
+ v_weights__left = _mm_set_epi8((int8_t)(17), (int8_t)(18), (int8_t)(19), (int8_t)(20), (int8_t)(21), (int8_t)(22), (int8_t)(23), (int8_t)(24), (int8_t)(25), (int8_t)(26), (int8_t)(27), (int8_t)(28), (int8_t)(29), (int8_t)(30), (int8_t)(31), (int8_t)(32));
+ v_weights_right = _mm_set_epi8((int8_t)(1), (int8_t)(2), (int8_t)(3), (int8_t)(4), (int8_t)(5), (int8_t)(6), (int8_t)(7), (int8_t)(8), (int8_t)(9), (int8_t)(10), (int8_t)(11), (int8_t)(12), (int8_t)(13), (int8_t)(14), (int8_t)(15), (int8_t)(16));
+ v_s1 = ((self->private_impl.f_state) & 0xFFFF);
+ v_s2 = ((self->private_impl.f_state) >> (32 - (16)));
+ while (((uint64_t)(a_x.len)) > 0) {
+ v_remaining = wuffs_base__slice_u8__subslice_j(a_x, 0);
+ if (((uint64_t)(a_x.len)) > 5536) {
+ v_remaining = wuffs_base__slice_u8__subslice_i(a_x, 5536);
+ a_x = wuffs_base__slice_u8__subslice_j(a_x, 5536);
+ }
+ v_num_iterate_bytes = ((uint32_t)((((uint64_t)(a_x.len)) & 4294967264)));
+ v_s2 += ((uint32_t)(v_s1 * v_num_iterate_bytes));
+ v_v1 = _mm_setzero_si128();
+ v_v2j = _mm_setzero_si128();
+ v_v2k = _mm_setzero_si128();
+ {
+ wuffs_base__slice_u8 i_slice_p = a_x;
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 32;
+ uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
+ while (v_p.ptr < i_end0_p) {
+ v_p__left = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_j(v_p, 16).ptr));
+ v_p_right = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_ij(v_p, 16, 32).ptr));
+ v_v2j = _mm_add_epi32(v_v2j, v_v1);
+ v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p__left, v_zeroes));
+ v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p_right, v_zeroes));
+ v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p__left, v_weights__left)));
+ v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p_right, v_weights_right)));
+ v_p.ptr += 32;
+ }
+ v_p.len = 0;
+ }
+ v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(177)));
+ v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(78)));
+ v_s1 += ((uint32_t)(_mm_cvtsi128_si32(v_v1)));
+ v_v2 = _mm_add_epi32(v_v2k, _mm_slli_epi32(v_v2j, (int32_t)(5)));
+ v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(177)));
+ v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(78)));
+ v_s2 += ((uint32_t)(_mm_cvtsi128_si32(v_v2)));
+ v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551584u);
+ if (v_tail_index < ((uint64_t)(a_x.len))) {
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s1 += ((uint32_t)(v_p.ptr[0]));
+ v_s2 += v_s1;
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ }
+ v_s1 %= 65521;
+ v_s2 %= 65521;
+ a_x = v_remaining;
+ }
+ self->private_impl.f_state = (((v_s2 & 65535) << 16) | (v_s1 & 65535));
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
#endif // !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__ADLER32)
#if !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__BMP)
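
Aside (not part of the patch): in the generated C above, the shuffle
constants 177 and 78 are 0b1011_0001 and 0b0100_1110, the same immediates
spelled out in binary in the Wuffs comments further below. A minimal
standalone C sketch of that shuffle/add horizontal-sum pattern follows;
the helper name is illustrative, not something from the Wuffs codebase:

    // horizontal_sum_u32x4 sums the four u32 lanes of an SSE register using
    // the same shuffle/add/shuffle/add pattern as up_sse42:
    //   - shuffling [x0, x1, x2, x3] with 0b1011_0001 gives [x1, x0, x3, x2],
    //   - shuffling with 0b0100_1110 swaps the two 64-bit halves,
    // so after the two add steps, every lane holds x0+x1+x2+x3.
    #include <emmintrin.h>  // SSE2 is enough for these intrinsics.
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t horizontal_sum_u32x4(__m128i v) {
      v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0xB1));  // 0xB1 == 177 == 0b1011_0001.
      v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0x4E));  // 0x4E ==  78 == 0b0100_1110.
      return ((uint32_t)(_mm_cvtsi128_si32(v)));
    }

    int main(void) {
      __m128i v = _mm_set_epi32(4, 3, 2, 1);  // Lanes are [1, 2, 3, 4].
      printf("%u\n", horizontal_sum_u32x4(v));  // Prints 10.
      return 0;
    }
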
diff --git a/std/adler32/common_adler32.wuffs b/std/adler32/common_adler32.wuffs
index 0ad8054..6a5f2de 100644
--- a/std/adler32/common_adler32.wuffs
+++ b/std/adler32/common_adler32.wuffs
@@ -65,171 +65,3 @@
} endwhile
this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
}
-
-pri func hasher.up_sse42!(x: slice base.u8),
- choose cpu_arch >= x86_sse42,
-{
- // These variables are the same as the non-SIMD version.
- var s1 : base.u32
- var s2 : base.u32
- var remaining : slice base.u8
- var p : slice base.u8
-
- // The remaining variables are specific to the SIMD version.
-
- var zeroes : base.x86_m128i
- var ones : base.x86_m128i
- var weights__left : base.x86_m128i
- var weights_right : base.x86_m128i
- var p__left : base.x86_m128i
- var p_right : base.x86_m128i
- var v1 : base.x86_m128i
- var v2 : base.x86_m128i
- var v2j : base.x86_m128i
- var v2k : base.x86_m128i
-
- var num_iterate_bytes : base.u32
- var tail_index : base.u64
-
- // zeroes and ones are uniform u16×8 vectors.
- zeroes = zeroes.create_mm_set1_epi16(a: 0)
- ones = ones.create_mm_set1_epi16(a: 1)
-
- // weights__left and weights_right, little-endian, form the sequence 32,
- // 31, 30, ..., 1.
- weights__left = weights__left.create_mm_set_epi8(
- e15: 0x11, e14: 0x12, e13: 0x13, e12: 0x14,
- e11: 0x15, e10: 0x16, e9: 0x17, e8: 0x18,
- e7: 0x19, e6: 0x1A, e5: 0x1B, e4: 0x1C,
- e3: 0x1D, e2: 0x1E, e1: 0x1F, e0: 0x20)
- weights_right = weights_right.create_mm_set_epi8(
- e15: 0x01, e14: 0x02, e13: 0x03, e12: 0x04,
- e11: 0x05, e10: 0x06, e9: 0x07, e8: 0x08,
- e7: 0x09, e6: 0x0A, e5: 0x0B, e4: 0x0C,
- e3: 0x0D, e2: 0x0E, e1: 0x0F, e0: 0x10)
-
- // Decompose this.state.
- s1 = this.state.low_bits(n: 16)
- s2 = this.state.high_bits(n: 16)
-
- // Just like the non-SIMD version, loop over args.x up to almost-5552 bytes
- // at a time. The slightly smaller 5536 is the largest multiple of 32 less
- // than non-SIMD's 5552.
- while args.x.length() > 0 {
- remaining = args.x[.. 0]
- if args.x.length() > 5536 {
- remaining = args.x[5536 ..]
- args.x = args.x[.. 5536]
- }
-
- // The s1 state is the sum of the input bytes and the s2 state is the
- // sum of the s1 state at each 1-byte step. Inside the iterate loop
- // below, but starting fresh at each outer while loop iteration, s1
- // consists of three parts (called s1i, s1j and s1k):
- // - s1i: the initial value, before any 32-byte iterations.
- // - s1j: the total contribution from previous 32-byte iterations.
- // - s1k: the contribution due to the current 32-byte iteration.
- //
- // The upcoming iterate loop (at 32 bytes per iteration) encompasses
- // num_iterate_bytes 1-byte steps. We hoist the total s1i contribution,
- // (s1i * num_iterate_bytes) out here.
- num_iterate_bytes = (args.x.length() & 0xFFFF_FFE0) as base.u32
- s2 ~mod+= (s1 ~mod* num_iterate_bytes)
-
- // Zero-initialize some u32×4 vectors associated with the two state
- // variables s1 and s2. The iterate loop accumulates four parallel u32
- // sums in each vector. A post-iterate step merges the four u32 sums
- // into a single u32 sum.
- v1 = v1.create_mm_setzero_si128()
- v2j = v2j.create_mm_setzero_si128()
- v2k = v2k.create_mm_setzero_si128()
-
- // The inner loop.
- iterate (p = args.x)(length: 32, advance: 32, unroll: 1) {
- // Split the 32-byte p into left and right halves. SSE4.2 works
- // with 16-byte registers.
- //
- // Let p__left = [u8×16: p00, p01, p02, ..., p15]
- // Let p_right = [u8×16: p16, p17, p18, ..., p31]
- p__left.load_slice128!(a: p[.. 16])
- p_right.load_slice128!(a: p[16 .. 32])
-
- // For v2j, we need to calculate the sums of the s1j terms for each
- // of p's 32 elements. This is simply 32 times the same number,
- // that number being the sum of v1's four u32 accumulators. We add
- // v1 now and multiply by 32 later, outside the inner loop.
- v2j = v2j._mm_add_epi32(b: v1)
-
- // For v1, we need to add the elements of p. Computing the sum of
- // absolute differences (_mm_sad_epu8) with zero just sums the
- // elements. p__left._mm_sad_epu8(b: zeroes) equals
- // [u64×2: p00 + p01 + ... + p07, p08 + p09 + ... + p15]
- // This is equivalent (little-endian) to:
- // [u32×4: p00 + p01 + ... + p07, 0, p08 + p09 + ... + p15, 0]
- // We accumulate those "sum of p__left elements" in v1, and ditto
- // for the p_right elements.
- v1 = v1._mm_add_epi32(b: p__left._mm_sad_epu8(b: zeroes))
- v1 = v1._mm_add_epi32(b: p_right._mm_sad_epu8(b: zeroes))
-
- // For v2k, we need to calculate a weighted sum: ((32 * p00) + (31
- // * p01) + (30 * p02) + ... + (1 * p31)), which splits naturally
- // into weighted sums of the left half and of the right half.
- //
- // The _mm_maddubs_epi16 call (vertically multiply u8 columns and
- // then horizontally sum u16 pairs) with the left half produces:
- // [u16×8: ((32*p00)+(31*p01)),
- // ((30*p02)+(29*p03)),
- // ...
- // ((18*p14)+(17*p15))]
- //
- // The ones._mm_madd_epi16(b: etc) call is likewise a multiply-add
- // (note that it's "madd" not "add"). Multiplying by 1 is a no-op,
- // so this sums u16 pairs to produce u32 values:
- // [u32×4: ((32*p00)+(31*p01)+(30*p02)+(29*p03)),
- // ((28*p04)+(27*p05)+(26*p06)+(25*p07)),
- // ...
- // ((20*p12)+(19*p13)+(18*p14)+(17*p15))]
- //
- // Ditto again for the p_right elements.
- v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
- p__left._mm_maddubs_epi16(b: weights__left)))
- v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
- p_right._mm_maddubs_epi16(b: weights_right)))
- }
-
- // Merge the four parallel u32 sums (v1) into the single u32 sum (s1).
- // Starting with a u32×4 vector [x0, x1, x2, x3]:
- // - shuffling with 0b1011_0001 gives [x1, x0, x3, x2].
- // - adding gives [x0+x1, x0+x1, x2+x3, x2+x3].
- // - shuffling with 0b0100_1110 gives [x2+x3, x2+x3, x0+x1, x0+x1].
- // - adding gives [x0+x1+x2+x3, ditto, ditto, ditto].
- // The truncate_u32 call extracts the first u32: x0+x1+x2+x3.
- v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b1011_0001))
- v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b0100_1110))
- s1 ~mod+= v1.truncate_u32()
-
- // Combine v2j and v2k. The slli (shift logical left immediate) by 5
- // multiplies v2j's four u32 elements each by 32, alluded to earlier.
- v2 = v2k._mm_add_epi32(b: v2j._mm_slli_epi32(imm8: 5))
-
- // Similarly merge v2 (a u32×4 vector) into s2 (a u32 scalar).
- v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b1011_0001))
- v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b0100_1110))
- s2 ~mod+= v2.truncate_u32()
-
- // Handle the tail of args.x that wasn't a complete 32-byte chunk.
- tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFE0 // And-not 31.
- if tail_index < args.x.length() {
- iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
- s1 ~mod+= p[0] as base.u32
- s2 ~mod+= s1
- }
- }
-
- // The rest of this function is the same as the non-SIMD version.
- s1 %= 65521
- s2 %= 65521
- args.x = remaining
- } endwhile
- this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
-}
diff --git a/std/adler32/common_up_x86_sse42.wuffs b/std/adler32/common_up_x86_sse42.wuffs
new file mode 100644
index 0000000..a9f44c1
--- /dev/null
+++ b/std/adler32/common_up_x86_sse42.wuffs
@@ -0,0 +1,181 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func hasher.up_sse42!(x: slice base.u8),
+ choose cpu_arch >= x86_sse42,
+{
+ // These variables are the same as the non-SIMD version.
+ var s1 : base.u32
+ var s2 : base.u32
+ var remaining : slice base.u8
+ var p : slice base.u8
+
+ // The remaining variables are specific to the SIMD version.
+
+ var zeroes : base.x86_m128i
+ var ones : base.x86_m128i
+ var weights__left : base.x86_m128i
+ var weights_right : base.x86_m128i
+ var p__left : base.x86_m128i
+ var p_right : base.x86_m128i
+ var v1 : base.x86_m128i
+ var v2 : base.x86_m128i
+ var v2j : base.x86_m128i
+ var v2k : base.x86_m128i
+
+ var num_iterate_bytes : base.u32
+ var tail_index : base.u64
+
+ // zeroes and ones are uniform u16×8 vectors.
+ zeroes = zeroes.create_mm_set1_epi16(a: 0)
+ ones = ones.create_mm_set1_epi16(a: 1)
+
+ // weights__left and weights_right, little-endian, form the sequence 32,
+ // 31, 30, ..., 1.
+ weights__left = weights__left.create_mm_set_epi8(
+ e15: 0x11, e14: 0x12, e13: 0x13, e12: 0x14,
+ e11: 0x15, e10: 0x16, e9: 0x17, e8: 0x18,
+ e7: 0x19, e6: 0x1A, e5: 0x1B, e4: 0x1C,
+ e3: 0x1D, e2: 0x1E, e1: 0x1F, e0: 0x20)
+ weights_right = weights_right.create_mm_set_epi8(
+ e15: 0x01, e14: 0x02, e13: 0x03, e12: 0x04,
+ e11: 0x05, e10: 0x06, e9: 0x07, e8: 0x08,
+ e7: 0x09, e6: 0x0A, e5: 0x0B, e4: 0x0C,
+ e3: 0x0D, e2: 0x0E, e1: 0x0F, e0: 0x10)
+
+ // Decompose this.state.
+ s1 = this.state.low_bits(n: 16)
+ s2 = this.state.high_bits(n: 16)
+
+ // Just like the non-SIMD version, loop over args.x up to almost-5552 bytes
+ // at a time. The slightly smaller 5536 is the largest multiple of 32 less
+ // than non-SIMD's 5552.
+ while args.x.length() > 0 {
+ remaining = args.x[.. 0]
+ if args.x.length() > 5536 {
+ remaining = args.x[5536 ..]
+ args.x = args.x[.. 5536]
+ }
+
+ // The s1 state is the sum of the input bytes and the s2 state is the
+ // sum of the s1 state at each 1-byte step. Inside the iterate loop
+ // below, but starting fresh at each outer while loop iteration, s1
+ // consists of three parts (called s1i, s1j and s1k):
+ // - s1i: the initial value, before any 32-byte iterations.
+ // - s1j: the total contribution from previous 32-byte iterations.
+ // - s1k: the contribution due to the current 32-byte iteration.
+ //
+ // The upcoming iterate loop (at 32 bytes per iteration) encompasses
+ // num_iterate_bytes 1-byte steps. We hoist the total s1i contribution,
+ // (s1i * num_iterate_bytes) out here.
+ num_iterate_bytes = (args.x.length() & 0xFFFF_FFE0) as base.u32
+ s2 ~mod+= (s1 ~mod* num_iterate_bytes)
+
+ // Zero-initialize some u32×4 vectors associated with the two state
+ // variables s1 and s2. The iterate loop accumulates four parallel u32
+ // sums in each vector. A post-iterate step merges the four u32 sums
+ // into a single u32 sum.
+ v1 = v1.create_mm_setzero_si128()
+ v2j = v2j.create_mm_setzero_si128()
+ v2k = v2k.create_mm_setzero_si128()
+
+ // The inner loop.
+ iterate (p = args.x)(length: 32, advance: 32, unroll: 1) {
+ // Split the 32-byte p into left and right halves. SSE4.2 works
+ // with 16-byte registers.
+ //
+ // Let p__left = [u8×16: p00, p01, p02, ..., p15]
+ // Let p_right = [u8×16: p16, p17, p18, ..., p31]
+ p__left.load_slice128!(a: p[.. 16])
+ p_right.load_slice128!(a: p[16 .. 32])
+
+ // For v2j, we need to calculate the sums of the s1j terms for each
+ // of p's 32 elements. This is simply 32 times the same number,
+ // that number being the sum of v1's four u32 accumulators. We add
+ // v1 now and multiply by 32 later, outside the inner loop.
+ v2j = v2j._mm_add_epi32(b: v1)
+
+ // For v1, we need to add the elements of p. Computing the sum of
+ // absolute differences (_mm_sad_epu8) with zero just sums the
+ // elements. p__left._mm_sad_epu8(b: zeroes) equals
+ // [u64×2: p00 + p01 + ... + p07, p08 + p09 + ... + p15]
+ // This is equivalent (little-endian) to:
+ // [u32×4: p00 + p01 + ... + p07, 0, p08 + p09 + ... + p15, 0]
+ // We accumulate those "sum of p__left elements" in v1, and ditto
+ // for the p_right elements.
+ v1 = v1._mm_add_epi32(b: p__left._mm_sad_epu8(b: zeroes))
+ v1 = v1._mm_add_epi32(b: p_right._mm_sad_epu8(b: zeroes))
+
+ // For v2k, we need to calculate a weighted sum: ((32 * p00) + (31
+ // * p01) + (30 * p02) + ... + (1 * p31)), which splits naturally
+ // into weighted sums of the left half and of the right half.
+ //
+ // The _mm_maddubs_epi16 call (vertically multiply u8 columns and
+ // then horizontally sum u16 pairs) with the left half produces:
+ // [u16×8: ((32*p00)+(31*p01)),
+ // ((30*p02)+(29*p03)),
+ // ...
+ // ((18*p14)+(17*p15))]
+ //
+ // The ones._mm_madd_epi16(b: etc) call is likewise a multiply-add
+ // (note that it's "madd" not "add"). Multiplying by 1 is a no-op,
+ // so this sums u16 pairs to produce u32 values:
+ // [u32×4: ((32*p00)+(31*p01)+(30*p02)+(29*p03)),
+ // ((28*p04)+(27*p05)+(26*p06)+(25*p07)),
+ // ...
+ // ((20*p12)+(19*p13)+(18*p14)+(17*p15))]
+ //
+ // Ditto again for the p_right elements.
+ v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
+ p__left._mm_maddubs_epi16(b: weights__left)))
+ v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
+ p_right._mm_maddubs_epi16(b: weights_right)))
+ }
+
+ // Merge the four parallel u32 sums (v1) into the single u32 sum (s1).
+ // Starting with a u32×4 vector [x0, x1, x2, x3]:
+ // - shuffling with 0b1011_0001 gives [x1, x0, x3, x2].
+ // - adding gives [x0+x1, x0+x1, x2+x3, x2+x3].
+ // - shuffling with 0b0100_1110 gives [x2+x3, x2+x3, x0+x1, x0+x1].
+ // - adding gives [x0+x1+x2+x3, ditto, ditto, ditto].
+ // The truncate_u32 call extracts the first u32: x0+x1+x2+x3.
+ v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b1011_0001))
+ v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b0100_1110))
+ s1 ~mod+= v1.truncate_u32()
+
+ // Combine v2j and v2k. The slli (shift logical left immediate) by 5
+ // multiplies v2j's four u32 elements each by 32, alluded to earlier.
+ v2 = v2k._mm_add_epi32(b: v2j._mm_slli_epi32(imm8: 5))
+
+ // Similarly merge v2 (a u32×4 vector) into s2 (a u32 scalar).
+ v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b1011_0001))
+ v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b0100_1110))
+ s2 ~mod+= v2.truncate_u32()
+
+ // Handle the tail of args.x that wasn't a complete 32-byte chunk.
+ tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFE0 // And-not 31.
+ if tail_index < args.x.length() {
+ iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
+ s1 ~mod+= p[0] as base.u32
+ s2 ~mod+= s1
+ }
+ }
+
+ // The rest of this function is the same as the non-SIMD version.
+ s1 %= 65521
+ s2 %= 65521
+ args.x = remaining
+ } endwhile
+ this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
+}
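
Aside (not part of the patch): a small, hypothetical standalone C program
checking the per-32-byte-block identity behind the comments above.
Processing a block byte by byte (s1 += p[i]; s2 += s1) is equivalent to
adding the block's plain byte sum to s1 (what v1 accumulates via
_mm_sad_epu8) and adding 32 * s1_before_the_block plus the weighted sum
(32 * p[0]) + (31 * p[1]) + ... + (1 * p[31]) to s2 (what the v2j << 5 and
v2k terms compute, with the chunk-initial part of the 32 * s1 term hoisted
out as s1 * num_iterate_bytes). The program and its variable names are
illustrative only:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      uint8_t p[32];
      int i;
      for (i = 0; i < 32; i++) {
        p[i] = (uint8_t)((7 * i) + 3);  // Arbitrary test data.
      }

      // Byte at a time, like the non-SIMD hasher.up inner loop.
      uint32_t s1 = 123;  // Arbitrary starting state.
      uint32_t s2 = 456;
      for (i = 0; i < 32; i++) {
        s1 += p[i];
        s2 += s1;
      }

      // Block at a time, mirroring the SIMD decomposition.
      uint32_t t1 = 123;
      uint32_t t2 = 456;
      uint32_t byte_sum = 0;      // Plays the role of v1.
      uint32_t weighted_sum = 0;  // Plays the role of v2k.
      for (i = 0; i < 32; i++) {
        byte_sum += p[i];
        weighted_sum += ((uint32_t)(32 - i)) * p[i];
      }
      t2 += (t1 * 32) + weighted_sum;  // (t1 * 32) plays the role of v2j << 5.
      t1 += byte_sum;

      assert((s1 == t1) && (s2 == t2));
      printf("s1=%u s2=%u\n", s1, s2);
      return 0;
    }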