Add std/adler32 hasher.up_x86_avx2 While the std/adler32 benchmarks got better, the std/zlib and std/png benchmarks got worse. This commit will soon be followed by a rollback, but it is committed anyway so that we can refer to these numbers in the git log. Binary size, before: 4056 gen/lib/c/clang-9-dynamic/wuffs-std-adler32.lo 4104 gen/lib/c/clang-9-static/wuffs-std-adler32.o 4624 gen/lib/c/gcc-dynamic/wuffs-std-adler32.lo 4680 gen/lib/c/gcc-static/wuffs-std-adler32.o After: 4952 gen/lib/c/clang-9-dynamic/wuffs-std-adler32.lo 5016 gen/lib/c/clang-9-static/wuffs-std-adler32.o 5512 gen/lib/c/gcc-dynamic/wuffs-std-adler32.lo 5576 gen/lib/c/gcc-static/wuffs-std-adler32.o name old speed new speed delta wuffs_adler32_10k/clang9 11.5GB/s ± 1% 15.0GB/s ± 5% +30.73% (p=0.000 n=9+10) wuffs_adler32_100k/clang9 13.5GB/s ± 1% 15.6GB/s ±14% +16.06% (p=0.001 n=9+10) wuffs_adler32_10k/gcc10 11.3GB/s ± 2% 15.6GB/s ± 1% +38.27% (p=0.000 n=9+9) wuffs_adler32_100k/gcc10 11.3GB/s ± 1% 19.4GB/s ± 1% +71.20% (p=0.000 n=7+9) wuffs_zlib_decode_10k/clang9 223MB/s ± 1% 216MB/s ± 2% -3.45% (p=0.000 n=10+10) wuffs_zlib_decode_100k/clang9 285MB/s ± 1% 284MB/s ± 1% ~ (p=0.243 n=9+10) wuffs_zlib_decode_10k/gcc10 232MB/s ± 1% 216MB/s ± 1% -6.70% (p=0.000 n=10+9) wuffs_zlib_decode_100k/gcc10 291MB/s ± 1% 280MB/s ± 1% -3.70% (p=0.000 n=9+8) wuffs_png_decode_image_19k_8bpp/clang9 152MB/s ± 1% 143MB/s ± 3% -5.89% (p=0.000 n=8+9) wuffs_png_decode_image_40k_24bpp/clang9 171MB/s ± 1% 173MB/s ± 1% ~ (p=0.094 n=9+9) wuffs_png_decode_image_77k_8bpp/clang9 541MB/s ± 1% 509MB/s ± 1% -5.77% (p=0.000 n=8+9) wuffs_png_decode_image_552k_32bpp_ignore_checksum/clang9 479MB/s ± 1% 464MB/s ± 3% -3.20% (p=0.000 n=8+10) wuffs_png_decode_image_552k_32bpp_verify_checksum/clang9 461MB/s ± 1% 452MB/s ± 1% -2.02% (p=0.000 n=8+10) wuffs_png_decode_image_4002k_24bpp/clang9 177MB/s ± 1% 175MB/s ± 2% -1.14% (p=0.004 n=10+10) wuffs_png_decode_image_19k_8bpp/gcc10 159MB/s ± 1% 153MB/s ± 1% -3.54% (p=0.000 n=8+10) wuffs_png_decode_image_40k_24bpp/gcc10 188MB/s ± 1% 187MB/s ± 1% -0.47% (p=0.021 n=10+9) wuffs_png_decode_image_77k_8bpp/gcc10 557MB/s ± 0% 546MB/s ± 4% -1.94% (p=0.000 n=8+10) wuffs_png_decode_image_552k_32bpp_ignore_checksum/gcc10 510MB/s ± 0% 493MB/s ± 0% -3.33% (p=0.000 n=10+9) wuffs_png_decode_image_552k_32bpp_verify_checksum/gcc10 486MB/s ± 1% 478MB/s ± 0% -1.77% (p=0.000 n=8+9) wuffs_png_decode_image_4002k_24bpp/gcc10 189MB/s ± 1% 187MB/s ± 1% -1.26% (p=0.000 n=10+10)

commit: baec831fc88770cbfae8d4d0a14ccc99a42857b5 [log] [tgz]
author: Nigel Tao <nigeltao@golang.org> Tue Apr 27 23:54:31 2021 +1000
committer: Nigel Tao <nigeltao@golang.org> Wed Apr 28 00:18:20 2021 +1000
tree: 131812ce11587cda3ddcbbee05cda06032ffbf68
parent: 8ccaf81e965c287dd36930af7d681052eb4f24f2 [diff]
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index b0e4d1b..e4ee115 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go

@@ -452,7 +452,8 @@
 		return g.writeBuiltinCPUArchARMCRC32(b, recv, method, args, sideEffectsOnly, depth)
 	case id.IsBuiltInCPUArchARMNeon():
 		return g.writeBuiltinCPUArchARMNeon(b, recv, method, args, sideEffectsOnly, depth)
-	case id == t.IDX86SSE42Utility, id == t.IDX86M128I:
+	case id == t.IDX86SSE42Utility, id == t.IDX86M128I,
+		id == t.IDX86AVX2Utility, id == t.IDX86M256I:
 		return g.writeBuiltinCPUArchX86(b, recv, method, args, sideEffectsOnly, depth)
 	}
 	return fmt.Errorf("internal error: unsupported cpu_arch method %s.%s",
@@ -630,6 +631,27 @@
 			fName, tName, ptr = "_mm_lddqu_si128", "const __m128i*)(const void*", true
 		case "make_m128i_zeroes":
 			fName, tName = "_mm_setzero_si128", ""
+
+		case "make_m256i_multiple_u8":
+			fName, tName = "_mm256_set_epi8", "int8_t"
+		case "make_m256i_multiple_u16":
+			fName, tName = "_mm256_set_epi16", "int16_t"
+		case "make_m256i_multiple_u32":
+			fName, tName = "_mm256_set_epi32", "int32_t"
+		case "make_m256i_multiple_u64":
+			fName, tName = "_mm256_set_epi64x", "int64_t"
+		case "make_m256i_repeat_u8":
+			fName, tName = "_mm256_set1_epi8", "int8_t"
+		case "make_m256i_repeat_u16":
+			fName, tName = "_mm256_set1_epi16", "int16_t"
+		case "make_m256i_repeat_u32":
+			fName, tName = "_mm256_set1_epi32", "int32_t"
+		case "make_m256i_repeat_u64":
+			fName, tName = "_mm256_set1_epi64x", "int64_t"
+		case "make_m256i_slice256":
+			fName, tName, ptr = "_mm256_lddqu_si256", "const __m256i*)(const void*", true
+		case "make_m256i_zeroes":
+			fName, tName = "_mm256_setzero_si256", ""
 		default:
 			return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr)
 		}

diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index f90936c..f211c67 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go

@@ -587,6 +587,7 @@
 	t.IDARMNeonU32x4: "uint32x4_t",
 	t.IDARMNeonU64x2: "uint64x2_t",
 	t.IDX86M128I:     "__m128i",
+	t.IDX86M256I:     "__m256i",
 }
 
 const noSuchCOperator = " no_such_C_operator "

diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index dc4efdb..63fd153 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go

@@ -306,6 +306,9 @@
 
 	"x86_sse42_utility",
 	"x86_m128i",
+
+	"x86_avx2_utility",
+	"x86_m256i",
 }
 
 var Funcs = [][]string{
@@ -773,6 +776,58 @@
 	"x86_m128i._mm_unpacklo_epi64(b: x86_m128i) x86_m128i",
 	"x86_m128i._mm_unpacklo_epi8(b: x86_m128i) x86_m128i",
 	"x86_m128i._mm_xor_si128(b: x86_m128i) x86_m128i",
+
+	// ---- x86_avx2_utility
+
+	"x86_avx2_utility.make_m256i_multiple_u8(" +
+		"a00: u8, a01: u8, a02: u8, a03: u8," +
+		"a04: u8, a05: u8, a06: u8, a07: u8," +
+		"a08: u8, a09: u8, a10: u8, a11: u8," +
+		"a12: u8, a13: u8, a14: u8, a15: u8," +
+		"a16: u8, a17: u8, a18: u8, a19: u8," +
+		"a20: u8, a21: u8, a22: u8, a23: u8," +
+		"a24: u8, a25: u8, a26: u8, a27: u8," +
+		"a28: u8, a29: u8, a30: u8, a31: u8) x86_m256i",
+	"x86_avx2_utility.make_m256i_multiple_u16(" +
+		"a00: u16, a01: u16, a02: u16, a03: u16," +
+		"a04: u16, a05: u16, a06: u16, a07: u16," +
+		"a08: u16, a09: u16, a10: u16, a11: u16," +
+		"a12: u16, a13: u16, a14: u16, a15: u16) x86_m256i",
+	"x86_avx2_utility.make_m256i_multiple_u32(" +
+		"a00: u32, a01: u32, a02: u32, a03: u32," +
+		"a04: u32, a05: u32, a06: u32, a07: u32) x86_m256i",
+	"x86_avx2_utility.make_m256i_multiple_u64(" +
+		"a00: u64, a01: u64, a02: u64, a03: u64) x86_m256i",
+
+	"x86_avx2_utility.make_m256i_repeat_u8(a: u8) x86_m256i",
+	"x86_avx2_utility.make_m256i_repeat_u16(a: u16) x86_m256i",
+	"x86_avx2_utility.make_m256i_repeat_u32(a: u32) x86_m256i",
+	"x86_avx2_utility.make_m256i_repeat_u64(a: u64) x86_m256i",
+
+	"x86_avx2_utility.make_m256i_slice256(a: slice base.u8) x86_m256i",
+
+	"x86_avx2_utility.make_m256i_zeroes() x86_m256i",
+
+	// ---- x86_m256i
+
+	// TODO: generate these methods automatically?
+
+	"x86_m256i._mm256_add_epi16(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_add_epi32(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_add_epi64(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_add_epi8(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_extracti128_si256(imm8: u32) x86_m128i",
+	"x86_m256i._mm256_madd_epi16(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_maddubs_epi16(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_sad_epu8(b: x86_m256i) x86_m256i",
+	"x86_m256i._mm256_slli_epi16(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_slli_epi32(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_slli_epi64(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_slli_si256(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_srli_epi16(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_srli_epi32(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_srli_epi64(imm8: u32) x86_m256i",
+	"x86_m256i._mm256_srli_si256(imm8: u32) x86_m256i",
 }
 
 var Interfaces = []string{

diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index f0480a3..8b5ec7e 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go

@@ -91,6 +91,9 @@
 	typeExprX86SSE42Utility = a.NewTypeExpr(0, t.IDBase, t.IDX86SSE42Utility, nil, nil, nil)
 	typeExprX86M128I        = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
 
+	typeExprX86AVX2Utility = a.NewTypeExpr(0, t.IDBase, t.IDX86AVX2Utility, nil, nil, nil)
+	typeExprX86M256I       = a.NewTypeExpr(0, t.IDBase, t.IDX86M256I, nil, nil, nil)
+
 	typeExprSliceU8 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, typeExprU8)
 	typeExprTableU8 = a.NewTypeExpr(t.IDTable, 0, 0, nil, nil, typeExprU8)
 )
@@ -154,6 +157,9 @@
 
 	t.IDX86SSE42Utility: typeExprX86SSE42Utility,
 	t.IDX86M128I:        typeExprX86M128I,
+
+	t.IDX86AVX2Utility: typeExprX86AVX2Utility,
+	t.IDX86M256I:       typeExprX86M256I,
 }
 
 func (c *Checker) parseBuiltInFuncs(m map[t.QQID]*a.Func, ss []string) error {

diff --git a/lang/token/list.go b/lang/token/list.go
index b8f33e3..949f440 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go

@@ -713,6 +713,7 @@
 	IDX86BMI2         = ID(0x394)
 
 	IDX86M128I = ID(0x3A0)
+	IDX86M256I = ID(0x3A1)
 )
 
 var builtInsByID = [nBuiltInIDs]string{
@@ -1126,6 +1127,7 @@
 	IDX86BMI2:         "x86_bmi2",
 
 	IDX86M128I: "x86_m128i",
+	IDX86M256I: "x86_m256i",
 }
 
 var builtInsByName = map[string]ID{}

diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b77f038..b5e03c6 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c

@@ -20561,6 +20561,13 @@
 
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
 static wuffs_base__empty_struct
+wuffs_adler32__hasher__up_x86_avx2(
+    wuffs_adler32__hasher* self,
+    wuffs_base__slice_u8 a_x);
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+static wuffs_base__empty_struct
 wuffs_adler32__hasher__up_x86_sse42(
     wuffs_adler32__hasher* self,
     wuffs_base__slice_u8 a_x);
@@ -20681,6 +20688,9 @@
         wuffs_base__cpu_arch__have_arm_neon() ? &wuffs_adler32__hasher__up_arm_neon :
 #endif
 #if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+        wuffs_base__cpu_arch__have_x86_avx2() ? &wuffs_adler32__hasher__up_x86_avx2 :
+#endif
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
         wuffs_base__cpu_arch__have_x86_sse42() ? &wuffs_adler32__hasher__up_x86_sse42 :
 #endif
         self->private_impl.choosy_up);
@@ -20869,6 +20879,96 @@
 #endif  // defined(WUFFS_BASE__CPU_ARCH__ARM_NEON)
 // ‼ WUFFS MULTI-FILE SECTION -arm_neon
 
+// ‼ WUFFS MULTI-FILE SECTION +x86_avx2
+// -------- func adler32.hasher.up_x86_avx2
+
+#if defined(WUFFS_BASE__CPU_ARCH__X86_64)
+WUFFS_BASE__MAYBE_ATTRIBUTE_TARGET("pclmul,popcnt,sse4.2,avx2")
+static wuffs_base__empty_struct
+wuffs_adler32__hasher__up_x86_avx2(
+    wuffs_adler32__hasher* self,
+    wuffs_base__slice_u8 a_x) {
+  uint32_t v_s1 = 0;
+  uint32_t v_s2 = 0;
+  wuffs_base__slice_u8 v_remaining = {0};
+  wuffs_base__slice_u8 v_p = {0};
+  __m256i v_zeroes = {0};
+  __m256i v_ones = {0};
+  __m256i v_weights = {0};
+  __m256i v_q = {0};
+  __m256i v_v1 = {0};
+  __m256i v_v2 = {0};
+  __m256i v_v2j = {0};
+  __m256i v_v2k = {0};
+  __m128i v_h1 = {0};
+  __m128i v_h2 = {0};
+  uint32_t v_num_iterate_bytes = 0;
+  uint64_t v_tail_index = 0;
+
+  v_zeroes = _mm256_set1_epi16((int16_t)(0));
+  v_ones = _mm256_set1_epi16((int16_t)(1));
+  v_weights = _mm256_set_epi8((int8_t)(1), (int8_t)(2), (int8_t)(3), (int8_t)(4), (int8_t)(5), (int8_t)(6), (int8_t)(7), (int8_t)(8), (int8_t)(9), (int8_t)(10), (int8_t)(11), (int8_t)(12), (int8_t)(13), (int8_t)(14), (int8_t)(15), (int8_t)(16), (int8_t)(17), (int8_t)(18), (int8_t)(19), (int8_t)(20), (int8_t)(21), (int8_t)(22), (int8_t)(23), (int8_t)(24), (int8_t)(25), (int8_t)(26), (int8_t)(27), (int8_t)(28), (int8_t)(29), (int8_t)(30), (int8_t)(31), (int8_t)(32));
+  v_s1 = ((self->private_impl.f_state) & 0xFFFF);
+  v_s2 = ((self->private_impl.f_state) >> (32 - (16)));
+  while (((uint64_t)(a_x.len)) > 0) {
+    v_remaining = wuffs_base__slice_u8__subslice_j(a_x, 0);
+    if (((uint64_t)(a_x.len)) > 5536) {
+      v_remaining = wuffs_base__slice_u8__subslice_i(a_x, 5536);
+      a_x = wuffs_base__slice_u8__subslice_j(a_x, 5536);
+    }
+    v_num_iterate_bytes = ((uint32_t)((((uint64_t)(a_x.len)) & 4294967264)));
+    v_s2 += ((uint32_t)(v_s1 * v_num_iterate_bytes));
+    v_v1 = _mm256_setzero_si256();
+    v_v2j = _mm256_setzero_si256();
+    v_v2k = _mm256_setzero_si256();
+    {
+      wuffs_base__slice_u8 i_slice_p = a_x;
+      v_p.ptr = i_slice_p.ptr;
+      v_p.len = 32;
+      uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
+      while (v_p.ptr < i_end0_p) {
+        v_q = _mm256_lddqu_si256((const __m256i*)(const void*)(v_p.ptr));
+        v_v2j = _mm256_add_epi32(v_v2j, v_v1);
+        v_v1 = _mm256_add_epi32(v_v1, _mm256_sad_epu8(v_q, v_zeroes));
+        v_v2k = _mm256_add_epi32(v_v2k, _mm256_madd_epi16(v_ones, _mm256_maddubs_epi16(v_q, v_weights)));
+        v_p.ptr += 32;
+      }
+      v_p.len = 0;
+    }
+    v_h1 = _mm_add_epi32(_mm256_extracti128_si256(v_v1, (int32_t)(0)), _mm256_extracti128_si256(v_v1, (int32_t)(1)));
+    v_h1 = _mm_add_epi32(v_h1, _mm_shuffle_epi32(v_h1, (int32_t)(177)));
+    v_h1 = _mm_add_epi32(v_h1, _mm_shuffle_epi32(v_h1, (int32_t)(78)));
+    v_s1 += ((uint32_t)(_mm_cvtsi128_si32(v_h1)));
+    v_v2 = _mm256_add_epi32(v_v2k, _mm256_slli_epi32(v_v2j, (int32_t)(5)));
+    v_h2 = _mm_add_epi32(_mm256_extracti128_si256(v_v2, (int32_t)(0)), _mm256_extracti128_si256(v_v2, (int32_t)(1)));
+    v_h2 = _mm_add_epi32(v_h2, _mm_shuffle_epi32(v_h2, (int32_t)(177)));
+    v_h2 = _mm_add_epi32(v_h2, _mm_shuffle_epi32(v_h2, (int32_t)(78)));
+    v_s2 += ((uint32_t)(_mm_cvtsi128_si32(v_h2)));
+    v_tail_index = (((uint64_t)(a_x.len)) & 18446744073709551584u);
+    if (v_tail_index < ((uint64_t)(a_x.len))) {
+      {
+        wuffs_base__slice_u8 i_slice_p = wuffs_base__slice_u8__subslice_i(a_x, v_tail_index);
+        v_p.ptr = i_slice_p.ptr;
+        v_p.len = 1;
+        uint8_t* i_end0_p = i_slice_p.ptr + i_slice_p.len;
+        while (v_p.ptr < i_end0_p) {
+          v_s1 += ((uint32_t)(v_p.ptr[0]));
+          v_s2 += v_s1;
+          v_p.ptr += 1;
+        }
+        v_p.len = 0;
+      }
+    }
+    v_s1 %= 65521;
+    v_s2 %= 65521;
+    a_x = v_remaining;
+  }
+  self->private_impl.f_state = (((v_s2 & 65535) << 16) | (v_s1 & 65535));
+  return wuffs_base__make_empty_struct();
+}
+#endif  // defined(WUFFS_BASE__CPU_ARCH__X86_64)
+// ‼ WUFFS MULTI-FILE SECTION -x86_avx2
+
 // ‼ WUFFS MULTI-FILE SECTION +x86_sse42
 // -------- func adler32.hasher.up_x86_sse42
 

diff --git a/std/adler32/common_adler32.wuffs b/std/adler32/common_adler32.wuffs
index eb2a93b..c3b4c08 100644
--- a/std/adler32/common_adler32.wuffs
+++ b/std/adler32/common_adler32.wuffs

@@ -27,6 +27,7 @@
 		this.state = 1
 		choose up = [
 			up_arm_neon,
+			up_x86_avx2,
 			up_x86_sse42]
 	}
 	this.up!(x: args.x)

diff --git a/std/adler32/common_up_x86_avx2.wuffs b/std/adler32/common_up_x86_avx2.wuffs
new file mode 100644
index 0000000..a8209f2
--- /dev/null
+++ b/std/adler32/common_up_x86_avx2.wuffs

@@ -0,0 +1,179 @@
+// Copyright 2021 The Wuffs Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pri func hasher.up_x86_avx2!(x: slice base.u8),
+	choose cpu_arch >= x86_avx2,
+{
+	// These variables are the same as the non-SIMD version.
+	var s1        : base.u32
+	var s2        : base.u32
+	var remaining : slice base.u8
+	var p         : slice base.u8
+
+	// The remaining variables are specific to the SIMD version.
+
+	var util    : base.x86_avx2_utility
+	var zeroes  : base.x86_m256i
+	var ones    : base.x86_m256i
+	var weights : base.x86_m256i
+	var q       : base.x86_m256i
+	var v1      : base.x86_m256i
+	var v2      : base.x86_m256i
+	var v2j     : base.x86_m256i
+	var v2k     : base.x86_m256i
+
+	var h1 : base.x86_m128i
+	var h2 : base.x86_m128i
+
+	var num_iterate_bytes : base.u32
+	var tail_index        : base.u64
+
+	// zeroes and ones are uniform u16×8 vectors.
+	zeroes = util.make_m256i_repeat_u16(a: 0)
+	ones = util.make_m256i_repeat_u16(a: 1)
+
+	// weights form the sequence 32, 31, 30, ..., 1.
+	weights = util.make_m256i_multiple_u8(
+		a00: 0x20, a01: 0x1F, a02: 0x1E, a03: 0x1D,
+		a04: 0x1C, a05: 0x1B, a06: 0x1A, a07: 0x19,
+		a08: 0x18, a09: 0x17, a10: 0x16, a11: 0x15,
+		a12: 0x14, a13: 0x13, a14: 0x12, a15: 0x11,
+		a16: 0x10, a17: 0x0F, a18: 0x0E, a19: 0x0D,
+		a20: 0x0C, a21: 0x0B, a22: 0x0A, a23: 0x09,
+		a24: 0x08, a25: 0x07, a26: 0x06, a27: 0x05,
+		a28: 0x04, a29: 0x03, a30: 0x02, a31: 0x01)
+
+	// Decompose this.state.
+	s1 = this.state.low_bits(n: 16)
+	s2 = this.state.high_bits(n: 16)
+
+	// Just like the non-SIMD version, loop over args.x up to almost-5552 bytes
+	// at a time. The slightly smaller 5536 is the largest multiple of 32 less
+	// than non-SIMD's 5552.
+	while args.x.length() > 0 {
+		remaining = args.x[.. 0]
+		if args.x.length() > 5536 {
+			remaining = args.x[5536 ..]
+			args.x = args.x[.. 5536]
+		}
+
+		// The s1 state is the sum of the input bytes and the s2 state is the
+		// sum of the s1 state at each 1-byte step. Inside the iterate loop
+		// below, but starting fresh at each outer while loop iteration, s1
+		// consists of three parts (called s1i, s1j and s1k):
+		//  - s1i: the initial value, before any 32-byte iterations.
+		//  - s1j: the total contribution from previous 32-byte iterations.
+		//  - s1k: the contribution due to the current 32-byte iteration.
+		//
+		// The upcoming iterate loop (at 32 bytes per iteration) encompasses
+		// num_iterate_bytes 1-byte steps. We hoist the total s1i contribution,
+		// (s1i * num_iterate_bytes) out here.
+		num_iterate_bytes = (args.x.length() & 0xFFFF_FFE0) as base.u32
+		s2 ~mod+= (s1 ~mod* num_iterate_bytes)
+
+		// Zero-initialize some u32×8 vectors associated with the two state
+		// variables s1 and s2. The iterate loop accumulates eight parallel u32
+		// sums in each vector. A post-iterate step merges the eight u32 sums
+		// into a single u32 sum.
+		v1 = util.make_m256i_zeroes()
+		v2j = util.make_m256i_zeroes()
+		v2k = util.make_m256i_zeroes()
+
+		// The inner loop.
+		iterate (p = args.x)(length: 32, advance: 32, unroll: 1) {
+			// AVX2 works with 32-byte registers.
+			//
+			// Let q = [u8×32: p00, p01, p02, ..., p31]
+			q = util.make_m256i_slice256(a: p)
+
+			// For v2j, we need to calculate the sums of the s1j terms for each
+			// of p's 32 elements. This is simply 32 times the same number,
+			// that number being the sum of v1's eight u32 accumulators. We add
+			// v1 now and multiply by 32 later, outside the inner loop.
+			v2j = v2j._mm256_add_epi32(b: v1)
+
+			// For v1, we need to add the elements of p. Computing the sum of
+			// absolute differences (_mm256_sad_epu8) with zero just sums the
+			// elements. q._mm256_sad_epu8(b: zeroes) equals
+			//   [u64×4: p00 + p01 + ... + p07, p08 + p09 + ... + p15,
+			//           p16 + p17 + ... + p23, p24 + p25 + ... + p31]
+			// This is equivalent (little-endian) to:
+			//   [u32×8: p00 + p01 + ... + p07, 0, p08 + p09 + ... + p15, 0,
+			//           p16 + p17 + ... + p23, 0, p24 + p25 + ... + p31, 0]
+			// We accumulate those "sum of q's elements" in v1.
+			v1 = v1._mm256_add_epi32(b: q._mm256_sad_epu8(b: zeroes))
+
+			// For v2k, we need to calculate a weighted sum: ((32 * p00) + (31
+			// * p01) + (30 * p02) + ... + (1 * p31)).
+			//
+			// The _mm256_maddubs_epi16 call (vertically multiply u8 columns
+			// and then horizontally sum u16 pairs) produces:
+			//   [u16×16: ((32*p00)+(31*p01)),
+			//            ((30*p02)+(29*p03)),
+			//            ...
+			//            (( 2*p30)+( 1*p31))]
+			//
+			// The ones._mm256_madd_epi16(b: etc) call is a multiply-add (note
+			// that it's "madd" not "add"). Multiplying by 1 is a no-op, so
+			// this sums u16 pairs to produce u32 values:
+			//   [u32×8: (((32*p00)+(31*p01)+(30*p02)+(29*p03)),
+			//           (((28*p04)+(27*p05)+(26*p06)+(25*p07)),
+			//           ...
+			//           ((( 4*p28)+( 3*p29)+( 2*p30)+( 1*p31))]
+			v2k = v2k._mm256_add_epi32(b: ones._mm256_madd_epi16(
+				b: q._mm256_maddubs_epi16(b: weights)))
+		}
+
+		// Merge the eight parallel u32 sums (v1) into the single u32 sum (s1).
+		// First, merge the 256-bit (u32×8) v1 into the 128-bit (u32×4) h1.
+		h1 = v1._mm256_extracti128_si256(imm8: 0)._mm_add_epi32(
+			b: v1._mm256_extracti128_si256(imm8: 1))
+
+		// Starting with a u32×4 vector [x0, x1, x2, x3]:
+		//  - shuffling with 0b1011_0001 gives [x1, x0, x3, x2].
+		//  - adding gives [x0+x1, x0+x1, x2+x3, x2+x3].
+		//  - shuffling with 0b0100_1110 gives [x2+x3, x2+x3, x0+x1, x0+x1].
+		//  - adding gives [x0+x1+x2+x3, ditto, ditto, ditto].
+		// The truncate_u32 call extracts the first u32: x0+x1+x2+x3.
+		h1 = h1._mm_add_epi32(b: h1._mm_shuffle_epi32(imm8: 0b1011_0001))
+		h1 = h1._mm_add_epi32(b: h1._mm_shuffle_epi32(imm8: 0b0100_1110))
+		s1 ~mod+= h1.truncate_u32()
+
+		// Combine v2j and v2k. The slli (shift logical left immediate) by 5
+		// multiplies v2j's eight u32 elements each by 32, alluded to earlier.
+		v2 = v2k._mm256_add_epi32(b: v2j._mm256_slli_epi32(imm8: 5))
+
+		// Similarly merge v2 (a u32×8 vector) into s2 (a u32 scalar).
+		h2 = v2._mm256_extracti128_si256(imm8: 0)._mm_add_epi32(
+			b: v2._mm256_extracti128_si256(imm8: 1))
+		h2 = h2._mm_add_epi32(b: h2._mm_shuffle_epi32(imm8: 0b1011_0001))
+		h2 = h2._mm_add_epi32(b: h2._mm_shuffle_epi32(imm8: 0b0100_1110))
+		s2 ~mod+= h2.truncate_u32()
+
+		// Handle the tail of args.x that wasn't a complete 32-byte chunk.
+		tail_index = args.x.length() & 0xFFFF_FFFF_FFFF_FFE0  // And-not 32.
+		if tail_index < args.x.length() {
+			iterate (p = args.x[tail_index ..])(length: 1, advance: 1, unroll: 1) {
+				s1 ~mod+= p[0] as base.u32
+				s2 ~mod+= s1
+			}
+		}
+
+		// The rest of this function is the same as the non-SIMD version.
+		s1 %= 65521
+		s2 %= 65521
+		args.x = remaining
+	} endwhile
+	this.state = ((s2 & 0xFFFF) << 16) | (s1 & 0xFFFF)
+}
commit	baec831fc88770cbfae8d4d0a14ccc99a42857b5	[log] [tgz]
author	Nigel Tao <nigeltao@golang.org>	Tue Apr 27 23:54:31 2021 +1000
committer	Nigel Tao <nigeltao@golang.org>	Wed Apr 28 00:18:20 2021 +1000
tree	131812ce11587cda3ddcbbee05cda06032ffbf68
parent	8ccaf81e965c287dd36930af7d681052eb4f24f2 [diff]