Rename adler32 hasher.up_x86_sse42 vars p__left/p_right to q__left/q_right
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index b77f038..3449b11 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -20886,8 +20886,8 @@
__m128i v_ones = {0};
__m128i v_weights__left = {0};
__m128i v_weights_right = {0};
- __m128i v_p__left = {0};
- __m128i v_p_right = {0};
+ __m128i v_q__left = {0};
+ __m128i v_q_right = {0};
__m128i v_v1 = {0};
__m128i v_v2 = {0};
__m128i v_v2j = {0};
@@ -20918,13 +20918,13 @@
v_p.len = 32;
uint8_t* i_end0_p = v_p.ptr + (((i_slice_p.len - (size_t)(v_p.ptr - i_slice_p.ptr)) / 32) * 32);
while (v_p.ptr < i_end0_p) {
- v_p__left = _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr));
- v_p_right = _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16));
+ v_q__left = _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr));
+ v_q_right = _mm_lddqu_si128((const __m128i*)(const void*)(v_p.ptr + 16));
v_v2j = _mm_add_epi32(v_v2j, v_v1);
- v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p__left, v_zeroes));
- v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_p_right, v_zeroes));
- v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p__left, v_weights__left)));
- v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_p_right, v_weights_right)));
+ v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_q__left, v_zeroes));
+ v_v1 = _mm_add_epi32(v_v1, _mm_sad_epu8(v_q_right, v_zeroes));
+ v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_q__left, v_weights__left)));
+ v_v2k = _mm_add_epi32(v_v2k, _mm_madd_epi16(v_ones, _mm_maddubs_epi16(v_q_right, v_weights_right)));
v_p.ptr += 32;
}
v_p.len = 0;
diff --git a/std/adler32/common_up_x86_sse42.wuffs b/std/adler32/common_up_x86_sse42.wuffs
index 93c5c1b..9599f2f 100644
--- a/std/adler32/common_up_x86_sse42.wuffs
+++ b/std/adler32/common_up_x86_sse42.wuffs
@@ -28,8 +28,8 @@
var ones : base.x86_m128i
var weights__left : base.x86_m128i
var weights_right : base.x86_m128i
- var p__left : base.x86_m128i
- var p_right : base.x86_m128i
+ var q__left : base.x86_m128i
+ var q_right : base.x86_m128i
var v1 : base.x86_m128i
var v2 : base.x86_m128i
var v2j : base.x86_m128i
@@ -92,13 +92,13 @@
// The inner loop.
iterate (p = args.x)(length: 32, advance: 32, unroll: 1) {
- // Split the 32-byte p into left and right halves. SSE4.2 works
- // with 16-byte registers.
+ // SSE4.2 works with 16-byte registers. Split the 32-byte p into
+ // left and right halves.
//
- // Let p__left = [u8×16: p00, p01, p02, ..., p15]
- // Let p_right = [u8×16: p16, p17, p18, ..., p31]
- p__left = util.make_m128i_slice128(a: p[.. 16])
- p_right = util.make_m128i_slice128(a: p[16 .. 32])
+ // Let q__left = [u8×16: p00, p01, p02, ..., p15]
+ // Let q_right = [u8×16: p16, p17, p18, ..., p31]
+ q__left = util.make_m128i_slice128(a: p[.. 16])
+ q_right = util.make_m128i_slice128(a: p[16 .. 32])
// For v2j, we need to calculate the sums of the s1j terms for each
// of p's 32 elements. This is simply 32 times the same number,
@@ -108,14 +108,14 @@
// For v1, we need to add the elements of p. Computing the sum of
// absolute differences (_mm_sad_epu8) with zero just sums the
- // elements. p__left._mm_sad_epu8(b: zeroes) equals
+ // elements. q__left._mm_sad_epu8(b: zeroes) equals
// [u64×2: p00 + p01 + ... + p07, p08 + p09 + ... + p15]
// This is equivalent (little-endian) to:
// [u32×4: p00 + p01 + ... + p07, 0, p08 + p09 + ... + p15, 0]
- // We accumulate those "sum of p__left elements" in v1, and ditto
- // for the p_right elements.
- v1 = v1._mm_add_epi32(b: p__left._mm_sad_epu8(b: zeroes))
- v1 = v1._mm_add_epi32(b: p_right._mm_sad_epu8(b: zeroes))
+ // We accumulate those "sum of q__left's elements" in v1, and ditto
+ // for q_right's elements.
+ v1 = v1._mm_add_epi32(b: q__left._mm_sad_epu8(b: zeroes))
+ v1 = v1._mm_add_epi32(b: q_right._mm_sad_epu8(b: zeroes))
// For v2k, we need to calculate a weighted sum: ((32 * p00) + (31
// * p01) + (30 * p02) + ... + (1 * p31)), which splits naturally
@@ -128,19 +128,19 @@
// ...
// ((18*p14)+(17*p15))]
//
- // The ones._mm_madd_epi16(b: etc) call is likewise a multiply-add
- // (note that it's "madd" not "add"). Multiplying by 1 is a no-op,
- // so this sums u16 pairs to produce u32 values:
+ // The ones._mm_madd_epi16(b: etc) call is a multiply-add (note
+ // that it's "madd" not "add"). Multiplying by 1 is a no-op, so
+ // this sums u16 pairs to produce u32 values:
// [u32×4: ((32*p00)+(31*p01)+(30*p02)+(29*p03)),
// ((28*p04)+(27*p05)+(26*p06)+(25*p07)),
// ...
// ((20*p12)+(19*p13)+(18*p14)+(17*p15))]
//
- // Ditto again for the p_right elements.
+ // Ditto again for q_right's elements.
v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
- p__left._mm_maddubs_epi16(b: weights__left)))
+ q__left._mm_maddubs_epi16(b: weights__left)))
v2k = v2k._mm_add_epi32(b: ones._mm_madd_epi16(b:
- p_right._mm_maddubs_epi16(b: weights_right)))
+ q_right._mm_maddubs_epi16(b: weights_right)))
}
// Merge the four parallel u32 sums (v1) into the single u32 sum (s1).