Cast the _mm_cvtsi32_si128 argument to int32_t instead of int
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index f3e22d2..90101d9 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -460,7 +460,7 @@
 
 		switch method {
 		case t.IDLoadU32:
-			b.writes(" = _mm_cvtsi32_si128((int)(")
+			b.writes(" = _mm_cvtsi32_si128((int32_t)(")
 		case t.IDLoadU64:
 			b.writes(" = _mm_cvtsi64_si128((int64_t)(")
 		case t.IDLoadSlice128:
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 708040f..482178c 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -31161,12 +31161,12 @@
     v_c.len = 4;
     uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
     while (v_c.ptr < i_end0_c) {
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_add_epi8(v_x128, v_a128);
       v_a128 = v_x128;
       wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
       v_c.ptr += 4;
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_add_epi8(v_x128, v_a128);
       v_a128 = v_x128;
       wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
@@ -31175,7 +31175,7 @@
     v_c.len = 4;
     uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
     while (v_c.ptr < i_end1_c) {
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_add_epi8(v_x128, v_a128);
       v_a128 = v_x128;
       wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
@@ -31215,13 +31215,13 @@
       uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
       while (v_c.ptr < i_end0_c) {
         v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
-        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
         v_x128 = _mm_add_epi8(v_x128, v_p128);
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
         v_c.ptr += 4;
         v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
-        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
         v_x128 = _mm_add_epi8(v_x128, v_p128);
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
@@ -31231,7 +31231,7 @@
       uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
       while (v_c.ptr < i_end1_c) {
         v_p128 = _mm_avg_epu8(_mm_and_si128(v_a128, v_k128), v_b128);
-        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
         v_x128 = _mm_add_epi8(v_x128, v_p128);
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
@@ -31251,19 +31251,19 @@
       v_p.len = 4;
       uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
       while (v_c.ptr < i_end0_c) {
-        v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
         v_p128 = _mm_avg_epu8(v_a128, v_b128);
         v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
-        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
         v_x128 = _mm_add_epi8(v_x128, v_p128);
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
         v_c.ptr += 4;
         v_p.ptr += 4;
-        v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
         v_p128 = _mm_avg_epu8(v_a128, v_b128);
         v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
-        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
         v_x128 = _mm_add_epi8(v_x128, v_p128);
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
@@ -31274,10 +31274,10 @@
       v_p.len = 4;
       uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
       while (v_c.ptr < i_end1_c) {
-        v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+        v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
         v_p128 = _mm_avg_epu8(v_a128, v_b128);
         v_p128 = _mm_sub_epi8(v_p128, _mm_and_si128(v_k128, _mm_xor_si128(v_a128, v_b128)));
-        v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+        v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
         v_x128 = _mm_add_epi8(v_x128, v_p128);
         v_a128 = v_x128;
         wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
@@ -31326,7 +31326,7 @@
     v_p.len = 4;
     uint8_t* i_end0_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 7, 6);
     while (v_c.ptr < i_end0_c) {
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31336,7 +31336,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_a128 = v_x128;
@@ -31345,7 +31345,7 @@
       wuffs_base__poke_u24le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
       v_c.ptr += 3;
       v_p.ptr += 3;
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31355,7 +31355,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_a128 = v_x128;
@@ -31369,7 +31369,7 @@
     v_p.len = 4;
     uint8_t* i_end1_c = v_c.ptr + wuffs_base__iterate_total_advance((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)), 4, 3);
     while (v_c.ptr < i_end1_c) {
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31379,7 +31379,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_a128 = v_x128;
@@ -31393,7 +31393,7 @@
     v_p.len = 3;
     uint8_t* i_end2_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 3) * 3);
     while (v_c.ptr < i_end2_c) {
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u24le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31403,7 +31403,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u24le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_x128 = _mm_packus_epi16(v_x128, v_x128);
@@ -31452,7 +31452,7 @@
     v_p.len = 4;
     uint8_t* i_end0_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 8) * 8);
     while (v_c.ptr < i_end0_c) {
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31462,7 +31462,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_a128 = v_x128;
@@ -31471,7 +31471,7 @@
       wuffs_base__poke_u32le__no_bounds_check(v_c.ptr, ((uint32_t)(_mm_cvtsi128_si32(v_x128))));
       v_c.ptr += 4;
       v_p.ptr += 4;
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31481,7 +31481,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_a128 = v_x128;
@@ -31495,7 +31495,7 @@
     v_p.len = 4;
     uint8_t* i_end1_c = v_c.ptr + (((i_slice_c.len - (size_t)(v_c.ptr - i_slice_c.ptr)) / 4) * 4);
     while (v_c.ptr < i_end1_c) {
-      v_b128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
+      v_b128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_p.ptr)));
       v_b128 = _mm_unpacklo_epi8(v_b128, v_z128);
       v_pa128 = _mm_sub_epi16(v_b128, v_c128);
       v_pb128 = _mm_sub_epi16(v_a128, v_c128);
@@ -31505,7 +31505,7 @@
       v_pc128 = _mm_abs_epi16(v_pc128);
       v_smallest128 = _mm_min_epi16(v_pc128, _mm_min_epi16(v_pb128, v_pa128));
       v_p128 = _mm_blendv_epi8(_mm_blendv_epi8(v_c128, v_b128, _mm_cmpeq_epi16(v_smallest128, v_pb128)), v_a128, _mm_cmpeq_epi16(v_smallest128, v_pa128));
-      v_x128 = _mm_cvtsi32_si128((int)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
+      v_x128 = _mm_cvtsi32_si128((int32_t)(wuffs_base__peek_u32le__no_bounds_check(v_c.ptr)));
       v_x128 = _mm_unpacklo_epi8(v_x128, v_z128);
       v_x128 = _mm_add_epi8(v_x128, v_p128);
       v_a128 = v_x128;