Factor out std/adler32/common_up_x86_sse42.wuffs
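
This is a pure code move: the SSE4.2 implementation of adler32.hasher.up
is deleted from std/adler32/common_adler32.wuffs and re-added, unchanged,
in its own std/adler32/common_up_x86_sse42.wuffs file (plus that file's
license header). In the generated release/c/wuffs-unsupported-snapshot.c,
the wuffs_adler32__hasher__up_sse42 declaration and definition move
accordingly, so that they now come after the ARM NEON variant instead of
before it.
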
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 88603c5..30da340 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -17658,13 +17658,6 @@
wuffs_adler32__hasher* self,
wuffs_base__slice_u8 a_x);
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-static wuffs_base__empty_struct
-wuffs_adler32__hasher__up_sse42(
- wuffs_adler32__hasher* self,
- wuffs_base__slice_u8 a_x);
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
static wuffs_base__empty_struct
wuffs_adler32__hasher__up_arm_neon(
@@ -17672,6 +17665,13 @@
wuffs_base__slice_u8 a_x);
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
+wuffs_adler32__hasher__up_sse42(
+ wuffs_adler32__hasher* self,
+ wuffs_base__slice_u8 a_x);
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
// ---------------- VTables
const wuffs_base__hasher_u32__func_ptrs
@@ -17869,98 +17869,6 @@
return wuffs_base__make_empty_struct();
}
-// -------- func adler32.hasher.up_sse42
-
-#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
-#if defined(__GNUC__)
-__attribute__((target("sse4.2")))
-#endif
-static wuffs_base__empty_struct
-wuffs_adler32__hasher__up_sse42(
- wuffs_adler32__hasher* self,
- wuffs_base__slice_u8 a_x) {
- uint32_t v_s1 = 0;
- uint32_t v_s2 = 0;
- wuffs_base__slice_u8 v_remaining = {0};
- wuffs_base__slice_u8 v_p = {0};
- __m128i v_zeroes = {0};
- __m128i v_ones = {0};
- __m128i v_weights__left = {0};
- __m128i v_weights_right = {0};
- __m128i v_p__left = {0};
- __m128i v_p_right = {0};
- __m128i v_v1 = {0};
- __m128i v_v2 = {0};
- __m128i v_v2j = {0};
- __m128i v_v2k = {0};
- uint32_t v_num_iterate_bytes = 0;
- uint64_t v_tail_index = 0;
-
- v_zeroes = _mm_set1_epi16((int16_t)(0));
- v_ones = _mm_set1_epi16((int16_t)(1));
- v_weights__left = _mm_set_epi8((int8_t)(17), (int8_t)(18), (int8_t)(19), (int8_t)(20), (int8_t)(21), (int8_t)(22), (int8_t)(23), (int8_t)(24), (int8_t)(25), (int8_t)(26), (int8_t)(27), (int8_t)(28), (int8_t)(29), (int8_t)(30), (int8_t)(31), (int8_t)(32));
- v_weights_right = _mm_set_epi8((int8_t)(1), (int8_t)(2), (int8_t)(3), (int8_t)(4), (int8_t)(5), (int8_t)(6), (int8_t)(7), (int8_t)(8), (int8_t)(9), (int8_t)(10), (int8_t)(11), (int8_t)(12), (int8_t)(13), (int8_t)(14), (int8_t)(15), (int8_t)(16));
- v_s1 = ((self->private_impl.f_state) & 0xFFFF);
- v_s2 = ((self->private_impl.f_state) >> (32 - (16)));
- while (((uint64_t)(a_x.len)) > 0) {
- v_remaining = wuffs_base__slice_u8__subslice_j(a_x, 0);
- if (((uint64_t)(a_x.len)) > 5536) {
- v_remaining = wuffs_base__slice_u8__subslice_i(a_x, 5536);
- a_x = wuffs_base__slice_u8__subslice_j(a_x, 5536);
- }
- v_num_iterate_bytes = ((uint32_t)((((uint64_t)(a_x.len)) & 4294967264)));
- v_s2 += ((uint32_t)(v_s1 * v_num_iterate_bytes));
- v_v1 = _mm_setzero_si128();
- v_v2j = _mm_setzero_si128();
- v_v2k = _mm_setzero_si128();
- {
- wuffs_base__slice_u8 i_slice_p = a_x;
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 32;
- uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
- while (v_p.ptr < i_end0_p) {
- v_p__left = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_j(v_p, 16).ptr));
- v_p_right = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_ij(v_p, 16, 32).ptr));
- v_v2j = _mm_add_epi32(v_v2j, v_v1);
- v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p__left, v_zeroes));
- v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p_right, v_zeroes));
- v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p__left, v_weights__left)));
- v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p_right, v_weights_right)));
- v_p.ptr += 32;
- }
- v_p.len = 0;
- }
- v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(177)));
- v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(78)));
- v_s1 += ((uint32_t)(_mm_cvtsi128_si32(v_v1)));
- v_v2 = _mm_add_epi32(v_v2k, _mm_slli_epi32(v_v2j, (int32_t)(5)));
- v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(177)));
- v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(78)));
- v_s2 += ((uint32_t)(_mm_cvtsi128_si32(v_v2)));
- v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551584u);
- if (v_tail_index < ((uint64_t)(a_x.len))) {
- {
- wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
- v_p.ptr = i_slice_p.ptr;
- v_p.len = 1;
- uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
- while (v_p.ptr < i_end0_p) {
- v_s1 += ((uint32_t)(v_p.ptr[0]));
- v_s2 += v_s1;
- v_p.ptr += 1;
- }
- v_p.len = 0;
- }
- }
- v_s1 %= 65521;
- v_s2 %= 65521;
- a_x = v_remaining;
- }
- self->private_impl.f_state = (((v_s2 & 65535) << 16) | (v_s1 & 65535));
- return wuffs_base__make_empty_struct();
-}
-#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
-
// -------- func adler32.hasher.up_arm_neon
#if defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
@@ -18065,6 +17973,98 @@
}
#endif // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
+// -------- func adler32.hasher.up_sse42
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+#if defined(__GNUC__)
+__attribute__((target("sse4.2")))
+#endif
+static wuffs_base__empty_struct
+wuffs_adler32__hasher__up_sse42(
+ wuffs_adler32__hasher* self,
+ wuffs_base__slice_u8 a_x) {
+ uint32_t v_s1 = 0;
+ uint32_t v_s2 = 0;
+ wuffs_base__slice_u8 v_remaining = {0};
+ wuffs_base__slice_u8 v_p = {0};
+ __m128i v_zeroes = {0};
+ __m128i v_ones = {0};
+ __m128i v_weights__left = {0};
+ __m128i v_weights_right = {0};
+ __m128i v_p__left = {0};
+ __m128i v_p_right = {0};
+ __m128i v_v1 = {0};
+ __m128i v_v2 = {0};
+ __m128i v_v2j = {0};
+ __m128i v_v2k = {0};
+ uint32_t v_num_iterate_bytes = 0;
+ uint64_t v_tail_index = 0;
+
+ v_zeroes = _mm_set1_epi16((int16_t)(0));
+ v_ones = _mm_set1_epi16((int16_t)(1));
+ v_weights__left = _mm_set_epi8((int8_t)(17), (int8_t)(18), (int8_t)(19), (int8_t)(20), (int8_t)(21), (int8_t)(22), (int8_t)(23), (int8_t)(24), (int8_t)(25), (int8_t)(26), (int8_t)(27), (int8_t)(28), (int8_t)(29), (int8_t)(30), (int8_t)(31), (int8_t)(32));
+ v_weights_right = _mm_set_epi8((int8_t)(1), (int8_t)(2), (int8_t)(3), (int8_t)(4), (int8_t)(5), (int8_t)(6), (int8_t)(7), (int8_t)(8), (int8_t)(9), (int8_t)(10), (int8_t)(11), (int8_t)(12), (int8_t)(13), (int8_t)(14), (int8_t)(15), (int8_t)(16));
+ v_s1 = ((self->private_impl.f_state) & 0xFFFF);
+ v_s2 = ((self->private_impl.f_state) >> (32 - (16)));
+ while (((uint64_t)(a_x.len)) > 0) {
+ v_remaining = wuffs_base__slice_u8__subslice_j(a_x, 0);
+ if (((uint64_t)(a_x.len)) > 5536) {
+ v_remaining = wuffs_base__slice_u8__subslice_i(a_x, 5536);
+ a_x = wuffs_base__slice_u8__subslice_j(a_x, 5536);
+ }
+ v_num_iterate_bytes = ((uint32_t)((((uint64_t)(a_x.len)) & 4294967264)));
+ v_s2 += ((uint32_t)(v_s1 * v_num_iterate_bytes));
+ v_v1 = _mm_setzero_si128();
+ v_v2j = _mm_setzero_si128();
+ v_v2k = _mm_setzero_si128();
+ {
+ wuffs_base__slice_u8 i_slice_p = a_x;
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 32;
+ uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
+ while (v_p.ptr < i_end0_p) {
+ v_p__left = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_j(v_p, 16).ptr));
+ v_p_right = _mm_lddqu_si128((const __m128i*)(const void*)(wuffs_base__slice_u8__subslice_ij(v_p, 16, 32).ptr));
+ v_v2j = _mm_add_epi32(v_v2j, v_v1);
+ v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p__left, v_zeroes));
+ v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p_right, v_zeroes));
+ v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p__left, v_weights__left)));
+ v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p_right, v_weights_right)));
+ v_p.ptr += 32;
+ }
+ v_p.len = 0;
+ }
+ v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(177)));
+ v_v1 = _mm_add_epi32(v_v1, _mm_shuffle_epi32(v_v1, (int32_t)(78)));
+ v_s1 += ((uint32_t)(_mm_cvtsi128_si32(v_v1)));
+ v_v2 = _mm_add_epi32(v_v2k, _mm_slli_epi32(v_v2j, (int32_t)(5)));
+ v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(177)));
+ v_v2 = _mm_add_epi32(v_v2, _mm_shuffle_epi32(v_v2, (int32_t)(78)));
+ v_s2 += ((uint32_t)(_mm_cvtsi128_si32(v_v2)));
+ v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551584u);
+ if (v_tail_index < ((uint64_t)(a_x.len))) {
+ {
+ wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
+ v_p.ptr = i_slice_p.ptr;
+ v_p.len = 1;
+ uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+ while (v_p.ptr < i_end0_p) {
+ v_s1 += ((uint32_t)(v_p.ptr[0]));
+ v_s2 += v_s1;
+ v_p.ptr += 1;
+ }
+ v_p.len = 0;
+ }
+ }
+ v_s1 %= 65521;
+ v_s2 %= 65521;
+ a_x = v_remaining;
+ }
+ self->private_impl.f_state = (((v_s2 & 65535) << 16) | (v_s1 & 65535));
+ return wuffs_base__make_empty_struct();
+}
+#endif // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
#endif // !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__ADLER32)
#if !defined(WUFFS_CONFIG__MODULES) || defined(WUFFS_CONFIG__MODULE__BMP)
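
Aside (not part of the patch): in the generated C above, the shuffle
constants 177 and 78 are 0b1011_0001 and 0b0100_1110, the same immediates
spelled out in binary in the Wuffs comments further below. A minimal
standalone C sketch of that shuffle/add horizontal-sum pattern follows;
the helper name is illustrative, not something from the Wuffs codebase:

    // horizontal_sum_u32x4 sums the four u32 lanes of an SSE register using
    // the same shuffle/add/shuffle/add pattern as up_sse42:
    //   - shuffling [x0, x1, x2, x3] with 0b1011_0001 gives [x1, x0, x3, x2],
    //   - shuffling with 0b0100_1110 swaps the two 64-bit halves,
    // so after the two add steps, every lane holds x0+x1+x2+x3.
    #include <emmintrin.h>  // SSE2 is enough for these intrinsics.
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t horizontal_sum_u32x4(__m128i v) {
      v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0xB1));  // 0xB1 == 177 == 0b1011_0001.
      v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0x4E));  // 0x4E ==  78 == 0b0100_1110.
      return ((uint32_t)(_mm_cvtsi128_si32(v)));
    }

    int main(void) {
      __m128i v = _mm_set_epi32(4, 3, 2, 1);  // Lanes are [1, 2, 3, 4].
      printf("%u\n", horizontal_sum_u32x4(v));  // Prints 10.
      return 0;
    }
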
diff --git a/std/adler32/common_adler32.wuffs b/std/adler32/common_adler32.wuffs
index 0ad8054..6a5f2de 100644
--- a/std/adler32/common_adler32.wuffs
+++ b/std/adler32/common_adler32.wuffs
@@ -65,171 +65,3 @@
} endwhile
this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
}
-
-pri func hasher.up_sse42!(x: slice base.u8),
- choose cpu_arch >= x86_sse42,
-{
- // These variables are the same as the non-SIMD version.
- var s1 : base.u32
- var s2 : base.u32
- var remaining : slice base.u8
- var p : slice base.u8
-
- // The remaining variables are specific to the SIMD version.
-
- var zeroes : base.x86_m128i
- var ones : base.x86_m128i
- var weights__left : base.x86_m128i
- var weights_right : base.x86_m128i
- var p__left : base.x86_m128i
- var p_right : base.x86_m128i
- var v1 : base.x86_m128i
- var v2 : base.x86_m128i
- var v2j : base.x86_m128i
- var v2k : base.x86_m128i
-
- var num_iterate_bytes : base.u32
- var tail_index : base.u64
-
- // zeroes and ones are uniform u16×8 vectors.
- zeroes = zeroes.create_mm_set1_epi16(a: 0)
- ones = ones.create_mm_set1_epi16(a: 1)
-
- // weights__left and weights_right, little-endian, form the sequence 32,
- // 31, 30, ..., 1.
- weights__left = weights__left.create_mm_set_epi8(
- e15: 0x11, e14: 0x12, e13: 0x13, e12: 0x14,
- e11: 0x15, e10: 0x16, e9: 0x17, e8: 0x18,
- e7: 0x19, e6: 0x1A, e5: 0x1B, e4: 0x1C,
- e3: 0x1D, e2: 0x1E, e1: 0x1F, e0: 0x20)
- weights_right = weights_right.create_mm_set_epi8(
- e15: 0x01, e14: 0x02, e13: 0x03, e12: 0x04,
- e11: 0x05, e10: 0x06, e9: 0x07, e8: 0x08,
- e7: 0x09, e6: 0x0A, e5: 0x0B, e4: 0x0C,
- e3: 0x0D, e2: 0x0E, e1: 0x0F, e0: 0x10)
-
- // Decompose this.state.
- s1 = this.state.low_bits(n: 16)
- s2 = this.state.high_bits(n: 16)
-
- // Just like the non-SIMD version, loop over args.x up to almost-5552 bytes
- // at a time. The slightly smaller 5536 is the largest multiple of 32 less
- // than non-SIMD's 5552.
- while args.x.length() > 0 {
- remaining = args.x[.. 0]
- if args.x.length() > 5536 {
- remaining = args.x[5536 ..]
- args.x = args.x[.. 5536]
- }
-
- // The s1 state is the sum of the input bytes and the s2 state is the
- // sum of the s1 state at each 1-byte step. Inside the iterate loop
- // below, but starting fresh at each outer while loop iteration, s1
- // consists of three parts (called s1i, s1j and s1k):
- // - s1i: the initial value, before any 32-byte iterations.
- // - s1j: the total contribution from previous 32-byte iterations.
- // - s1k: the contribution due to the current 32-byte iteration.
- //
- // The upcoming iterate loop (at 32 bytes per iteration) encompasses
- // num_iterate_bytes 1-byte steps. We hoist the total s1i contribution,
- // (s1i * num_iterate_bytes) out here.
- num_iterate_bytes = (args.x.length() & 0xFFFF_FFE0) as base.u32
- s2 ~mod+= (s1 ~mod* num_iterate_bytes)
-
- // Zero-initialize some u32×4 vectors associated with the two state
- // variables s1 and s2. The iterate loop accumulates four parallel u32
- // sums in each vector. A post-iterate step merges the four u32 sums
- // into a single u32 sum.
- v1 = v1.create_mm_setzero_si128()
- v2j = v2j.create_mm_setzero_si128()
- v2k = v2k.create_mm_setzero_si128()
-
- // The inner loop.
- iterate (p = args.x)(length: 32, advance: 32, unroll: 1) {
- // Split the 32-byte p into left and right halves. SSE4.2 works
- // with 16-byte registers.
- //
- // Let p__left = [u8×16: p00, p01, p02, ..., p15]
- // Let p_right = [u8×16: p16, p17, p18, ..., p31]
- p__left.load_slice128!(a: p[.. 16])
- p_right.load_slice128!(a: p[16 .. 32])
-
- // For v2j, we need to calculate the sums of the s1j terms for each
- // of p's 32 elements. This is simply 32 times the same number,
- // that number being the sum of v1's four u32 accumulators. We add
- // v1 now and multiply by 32 later, outside the inner loop.
- v2j = v2j._mm_add_epi32(b: v1)
-
- // For v1, we need to add the elements of p. Computing the sum of
- // absolute differences (_mm_sad_epu8) with zero just sums the
- // elements. p__left._mm_sad_epu8(b: zeroes) equals
- // [u64×2: p00 + p01 + ... + p07, p08 + p09 + ... + p15]
- // This is equivalent (little-endian) to:
- // [u32×4: p00 + p01 + ... + p07, 0, p08 + p09 + ... + p15, 0]
- // We accumulate those "sum of p__left elements" in v1, and ditto
- // for the p_right elements.
- v1 = v1._mm_add_epi32(b: p__left._mm_sad_epu8(b: zeroes))
- v1 = v1._mm_add_epi32(b: p_right._mm_sad_epu8(b: zeroes))
-
- // For v2k, we need to calculate a weighted sum: ((32 * p00) + (31
- // * p01) + (30 * p02) + ... + (1 * p31)), which splits naturally
- // into weighted sums of the left half and of the right half.
- //
- // The _mm_maddubs_epi16 call (vertically multiply u8 columns and
- // then horizontally sum u16 pairs) with the left half produces:
- // [u16×8: ((32*p00)+(31*p01)),
- // ((30*p02)+(29*p03)),
- // ...
- // ((18*p14)+(17*p15))]
- //
- // The ones._mm_madd_epi16(b: etc) call is likewise a multiply-add
- // (note that it's "madd" not "add"). Multiplying by 1 is a no-op,
- // so this sums u16 pairs to produce u32 values:
- // [u32×4: ((32*p00)+(31*p01)+(30*p02)+(29*p03)),
- // ((28*p04)+(27*p05)+(26*p06)+(25*p07)),
- // ...
- // ((20*p12)+(19*p13)+(18*p14)+(17*p15))]
- //
- // Ditto again for the p_right elements.
- v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
- p__left._mm_maddubs_epi16(b: weights__left)))
- v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
- p_right._mm_maddubs_epi16(b: weights_right)))
- }
-
- // Merge the four parallel u32 sums (v1) into the single u32 sum (s1).
- // Starting with a u32×4 vector [x0, x1, x2, x3]:
- // - shuffling with 0b1011_0001 gives [x1, x0, x3, x2].
- // - adding gives [x0+x1, x0+x1, x2+x3, x2+x3].
- // - shuffling with 0b0100_1110 gives [x2+x3, x2+x3, x0+x1, x0+x1].
- // - adding gives [x0+x1+x2+x3, ditto, ditto, ditto].
- // The truncate_u32 call extracts the first u32: x0+x1+x2+x3.
- v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b1011_0001))
- v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b0100_1110))
- s1 ~mod+= v1.truncate_u32()
-
- // Combine v2j and v2k. The slli (shift logical left immediate) by 5
- // multiplies v2j's four u32 elements each by 32, alluded to earlier.
- v2 = v2k._mm_add_epi32(b: v2j._mm_slli_epi32(imm8: 5))
-
- // Similarly merge v2 (a u32×4 vector) into s2 (a u32 scalar).
- v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b1011_0001))
- v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b0100_1110))
- s2 ~mod+= v2.truncate_u32()
-
- // Handle the tail of args.x that wasn't a complete 32-byte chunk.
- tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFE0 // And-not 31.
- if tail_index < args.x.length() {
- iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
- s1 ~mod+= p[0] as base.u32
- s2 ~mod+= s1
- }
- }
-
- // The rest of this function is the same as the non-SIMD version.
- s1 %= 65521
- s2 %= 65521
- args.x = remaining
- } endwhile
- this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
-}
diff --git a/std/adler32/common_up_x86_sse42.wuffs b/std/adler32/common_up_x86_sse42.wuffs
new file mode 100644
index 0000000..a9f44c1
--- /dev/null
+++ b/std/adler32/common_up_x86_sse42.wuffs
@@ -0,0 +1,181 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func hasher.up_sse42!(x: slice base.u8),
+ choose cpu_arch >= x86_sse42,
+{
+ // These variables are the same as the non-SIMD version.
+ var s1 : base.u32
+ var s2 : base.u32
+ var remaining : slice base.u8
+ var p : slice base.u8
+
+ // The remaining variables are specific to the SIMD version.
+
+ var zeroes : base.x86_m128i
+ var ones : base.x86_m128i
+ var weights__left : base.x86_m128i
+ var weights_right : base.x86_m128i
+ var p__left : base.x86_m128i
+ var p_right : base.x86_m128i
+ var v1 : base.x86_m128i
+ var v2 : base.x86_m128i
+ var v2j : base.x86_m128i
+ var v2k : base.x86_m128i
+
+ var num_iterate_bytes : base.u32
+ var tail_index : base.u64
+
+ // zeroes and ones are uniform u16×8 vectors.
+ zeroes = zeroes.create_mm_set1_epi16(a: 0)
+ ones = ones.create_mm_set1_epi16(a: 1)
+
+ // weights__left and weights_right, little-endian, form the sequence 32,
+ // 31, 30, ..., 1.
+ weights__left = weights__left.create_mm_set_epi8(
+ e15: 0x11, e14: 0x12, e13: 0x13, e12: 0x14,
+ e11: 0x15, e10: 0x16, e9: 0x17, e8: 0x18,
+ e7: 0x19, e6: 0x1A, e5: 0x1B, e4: 0x1C,
+ e3: 0x1D, e2: 0x1E, e1: 0x1F, e0: 0x20)
+ weights_right = weights_right.create_mm_set_epi8(
+ e15: 0x01, e14: 0x02, e13: 0x03, e12: 0x04,
+ e11: 0x05, e10: 0x06, e9: 0x07, e8: 0x08,
+ e7: 0x09, e6: 0x0A, e5: 0x0B, e4: 0x0C,
+ e3: 0x0D, e2: 0x0E, e1: 0x0F, e0: 0x10)
+
+ // Decompose this.state.
+ s1 = this.state.low_bits(n: 16)
+ s2 = this.state.high_bits(n: 16)
+
+ // Just like the non-SIMD version, loop over args.x up to almost-5552 bytes
+ // at a time. The slightly smaller 5536 is the largest multiple of 32 less
+ // than non-SIMD's 5552.
+ while args.x.length() > 0 {
+ remaining = args.x[.. 0]
+ if args.x.length() > 5536 {
+ remaining = args.x[5536 ..]
+ args.x = args.x[.. 5536]
+ }
+
+ // The s1 state is the sum of the input bytes and the s2 state is the
+ // sum of the s1 state at each 1-byte step. Inside the iterate loop
+ // below, but starting fresh at each outer while loop iteration, s1
+ // consists of three parts (called s1i, s1j and s1k):
+ // - s1i: the initial value, before any 32-byte iterations.
+ // - s1j: the total contribution from previous 32-byte iterations.
+ // - s1k: the contribution due to the current 32-byte iteration.
+ //
+ // The upcoming iterate loop (at 32 bytes per iteration) encompasses
+ // num_iterate_bytes 1-byte steps. We hoist the total s1i contribution,
+ // (s1i * num_iterate_bytes) out here.
+ num_iterate_bytes = (args.x.length() & 0xFFFF_FFE0) as base.u32
+ s2 ~mod+= (s1 ~mod* num_iterate_bytes)
+
+ // Zero-initialize some u32×4 vectors associated with the two state
+ // variables s1 and s2. The iterate loop accumulates four parallel u32
+ // sums in each vector. A post-iterate step merges the four u32 sums
+ // into a single u32 sum.
+ v1 = v1.create_mm_setzero_si128()
+ v2j = v2j.create_mm_setzero_si128()
+ v2k = v2k.create_mm_setzero_si128()
+
+ // The inner loop.
+ iterate (p = args.x)(length: 32, advance: 32, unroll: 1) {
+ // Split the 32-byte p into left and right halves. SSE4.2 works
+ // with 16-byte registers.
+ //
+ // Let p__left = [u8×16: p00, p01, p02, ..., p15]
+ // Let p_right = [u8×16: p16, p17, p18, ..., p31]
+ p__left.load_slice128!(a: p[.. 16])
+ p_right.load_slice128!(a: p[16 .. 32])
+
+ // For v2j, we need to calculate the sums of the s1j terms for each
+ // of p's 32 elements. This is simply 32 times the same number,
+ // that number being the sum of v1's four u32 accumulators. We add
+ // v1 now and multiply by 32 later, outside the inner loop.
+ v2j = v2j._mm_add_epi32(b: v1)
+
+ // For v1, we need to add the elements of p. Computing the sum of
+ // absolute differences (_mm_sad_epu8) with zero just sums the
+ // elements. p__left._mm_sad_epu8(b: zeroes) equals
+ // [u64×2: p00 + p01 + ... + p07, p08 + p09 + ... + p15]
+ // This is equivalent (little-endian) to:
+ // [u32×4: p00 + p01 + ... + p07, 0, p08 + p09 + ... + p15, 0]
+ // We accumulate those "sum of p__left elements" in v1, and ditto
+ // for the p_right elements.
+ v1 = v1._mm_add_epi32(b: p__left._mm_sad_epu8(b: zeroes))
+ v1 = v1._mm_add_epi32(b: p_right._mm_sad_epu8(b: zeroes))
+
+ // For v2k, we need to calculate a weighted sum: ((32 * p00) + (31
+ // * p01) + (30 * p02) + ... + (1 * p31)), which splits naturally
+ // into weighted sums of the left half and of the right half.
+ //
+ // The _mm_maddubs_epi16 call (vertically multiply u8 columns and
+ // then horizontally sum u16 pairs) with the left half produces:
+ // [u16×8: ((32*p00)+(31*p01)),
+ // ((30*p02)+(29*p03)),
+ // ...
+ // ((18*p14)+(17*p15))]
+ //
+ // The ones._mm_madd_epi16(b: etc) call is likewise a multiply-add
+ // (note that it's "madd" not "add"). Multiplying by 1 is a no-op,
+ // so this sums u16 pairs to produce u32 values:
+ // [u32×4: ((32*p00)+(31*p01)+(30*p02)+(29*p03)),
+ // ((28*p04)+(27*p05)+(26*p06)+(25*p07)),
+ // ...
+ // ((20*p12)+(19*p13)+(18*p14)+(17*p15))]
+ //
+ // Ditto again for the p_right elements.
+ v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
+ p__left._mm_maddubs_epi16(b: weights__left)))
+ v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
+ p_right._mm_maddubs_epi16(b: weights_right)))
+ }
+
+ // Merge the four parallel u32 sums (v1) into the single u32 sum (s1).
+ // Starting with a u32×4 vector [x0, x1, x2, x3]:
+ // - shuffling with 0b1011_0001 gives [x1, x0, x3, x2].
+ // - adding gives [x0+x1, x0+x1, x2+x3, x2+x3].
+ // - shuffling with 0b0100_1110 gives [x2+x3, x2+x3, x0+x1, x0+x1].
+ // - adding gives [x0+x1+x2+x3, ditto, ditto, ditto].
+ // The truncate_u32 call extracts the first u32: x0+x1+x2+x3.
+ v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b1011_0001))
+ v1 = v1._mm_add_epi32(b: v1._mm_shuffle_epi32(imm8: 0b0100_1110))
+ s1 ~mod+= v1.truncate_u32()
+
+ // Combine v2j and v2k. The slli (shift logical left immediate) by 5
+ // multiplies v2j's four u32 elements each by 32, alluded to earlier.
+ v2 = v2k._mm_add_epi32(b: v2j._mm_slli_epi32(imm8: 5))
+
+ // Similarly merge v2 (a u32×4 vector) into s2 (a u32 scalar).
+ v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b1011_0001))
+ v2 = v2._mm_add_epi32(b: v2._mm_shuffle_epi32(imm8: 0b0100_1110))
+ s2 ~mod+= v2.truncate_u32()
+
+ // Handle the tail of args.x that wasn't a complete 32-byte chunk.
+ tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFE0 // And-not 31.
+ if tail_index < args.x.length() {
+ iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
+ s1 ~mod+= p[0] as base.u32
+ s2 ~mod+= s1
+ }
+ }
+
+ // The rest of this function is the same as the non-SIMD version.
+ s1 %= 65521
+ s2 %= 65521
+ args.x = remaining
+ } endwhile
+ this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
+}
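
Aside (not part of the patch): a small, hypothetical standalone C program
checking the per-32-byte-block identity behind the comments above.
Processing a block byte by byte (s1 += p[i]; s2 += s1) is equivalent to
adding the block's plain byte sum to s1 (what v1 accumulates via
_mm_sad_epu8) and adding 32 * s1_before_the_block plus the weighted sum
(32 * p[0]) + (31 * p[1]) + ... + (1 * p[31]) to s2 (what the v2j << 5 and
v2k terms compute, with the chunk-initial part of the 32 * s1 term hoisted
out as s1 * num_iterate_bytes). The program and its variable names are
illustrative only:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      uint8_t p[32];
      int i;
      for (i = 0; i < 32; i++) {
        p[i] = (uint8_t)((7 * i) + 3);  // Arbitrary test data.
      }

      // Byte at a time, like the non-SIMD hasher.up inner loop.
      uint32_t s1 = 123;  // Arbitrary starting state.
      uint32_t s2 = 456;
      for (i = 0; i < 32; i++) {
        s1 += p[i];
        s2 += s1;
      }

      // Block at a time, mirroring the SIMD decomposition.
      uint32_t t1 = 123;
      uint32_t t2 = 456;
      uint32_t byte_sum = 0;      // Plays the role of v1.
      uint32_t weighted_sum = 0;  // Plays the role of v2k.
      for (i = 0; i < 32; i++) {
        byte_sum += p[i];
        weighted_sum += ((uint32_t)(32 - i)) * p[i];
      }
      t2 += (t1 * 32) + weighted_sum;  // (t1 * 32) plays the role of v2j << 5.
      t1 += byte_sum;

      assert((s1 == t1) && (s2 == t2));
      printf("s1=%u s2=%u\n", s1, s2);
      return 0;
    }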