Add base.sse128_i
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 6e24158..6ba7062 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -141,6 +141,8 @@
 				b.writes("&empty_io_buffer")
 				return nil
 			}
+		case t.IDSSE128I:
+			return g.writeBuiltinCPUArch(b, recv, method.Ident(), n.Args(), depth)
 		}
 	}
 	return errNoSuchBuiltin
@@ -424,6 +426,38 @@
 	return g.writeBuiltinIO(b, recv, method, args, depth)
 }
 
+func (g *gen) writeBuiltinCPUArch(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, depth uint32) error {
+	switch method {
+	case t.IDLoadU32:
+		// TODO: ensure that the receiver is a variable, not an arbitrary expression.
+		//
+		// Emit a two-part C expression joined by the comma operator: "(recv =
+		// _mm_cvtsi32_si128((int)(arg)), wuffs_base__make_empty_struct())". The
+		// final part is a function call (to a static inline function) instead of
+		// a struct literal, to avoid an "expression result unused" compiler error.
+		b.writes("(")
+		if err := g.writeExpr(b, recv, depth); err != nil {
+			return err
+		}
+		b.writes(" = _mm_cvtsi32_si128((int)(")
+		if err := g.writeExpr(b, args[0].AsArg().Value(), depth); err != nil {
+			return err
+		}
+		b.writes(")), wuffs_base__make_empty_struct())")
+		return nil
+
+	case t.IDTruncateU32:
+		b.writes("((uint32_t)(_mm_cvtsi128_si32(") // Emits "((uint32_t)(_mm_cvtsi128_si32(recv)))".
+		if err := g.writeExpr(b, recv, depth); err != nil {
+			return err
+		}
+		b.writes(")))")
+		return nil
+	}
+
+	return errNoSuchBuiltin // Other methods, e.g. load_slice and store_slice, are not generated here.
+}
+
 func (g *gen) writeBuiltinNumType(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, depth uint32) error {
 	switch method {
 	case t.IDLowBits:
diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index a49e8f1..cd9096b 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go
@@ -572,6 +572,8 @@
 	t.IDIOWriter:    "wuffs_base__io_buffer*",
 	t.IDTokenReader: "wuffs_base__token_buffer*",
 	t.IDTokenWriter: "wuffs_base__token_buffer*",
+
+	t.IDSSE128I: "__m128i",
 }
 
 const noSuchCOperator = " no_such_C_operator "
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 4a4387e..84253fe 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -236,6 +236,10 @@
 	"pixel_swizzler",
 
 	"decode_frame_options",
+
+	// ----
+
+	"sse128_i",
 }
 
 var Funcs = []string{
@@ -523,6 +527,13 @@
 		"dst: slice u8, dst_palette: slice u8, src: slice u8) u64",
 	"pixel_swizzler.swizzle_interleaved_transparent_black!(" +
 		"dst: slice u8, dst_palette: slice u8, num_pixels: u64) u64",
+
+	// ---- sse128_i
+
+	"sse128_i.load_slice!(a: slice base.u8)",
+	"sse128_i.load_u32!(a: u32)",
+	"sse128_i.store_slice!(a: slice base.u8)",
+	"sse128_i.truncate_u32() u32",
 }
 
 var Interfaces = []string{
diff --git a/lang/check/check.go b/lang/check/check.go
index 94be556..610f9d6 100644
--- a/lang/check/check.go
+++ b/lang/check/check.go
@@ -681,7 +681,7 @@
 	}
 
 	// Fill in the TypeMap with all local variables.
-	if err := q.tcheckVars(n.Body()); err != nil {
+	if err := q.tcheckVars(calcCPUArchBits(q.astFunc), n.Body()); err != nil {
 		return &Error{
 			Err:      err,
 			Filename: q.errFilename,
diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index 2ae723a..de615b2 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go
@@ -75,6 +75,8 @@
 
 	typeExprDecodeFrameOptions = a.NewTypeExpr(0, t.IDBase, t.IDDecodeFrameOptions, nil, nil, nil)
 
+	typeExprSSE128I = a.NewTypeExpr(0, t.IDBase, t.IDSSE128I, nil, nil, nil)
+
 	typeExprSliceU8 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, typeExprU8)
 	typeExprTableU8 = a.NewTypeExpr(t.IDTable, 0, 0, nil, nil, typeExprU8)
 )
@@ -122,6 +124,8 @@
 	t.IDPixelSwizzler: typeExprPixelSwizzler,
 
 	t.IDDecodeFrameOptions: typeExprDecodeFrameOptions,
+
+	t.IDSSE128I: typeExprSSE128I,
 }
 
 func (c *Checker) parseBuiltInFuncs(m map[t.QQID]*a.Func, ss []string) error {
diff --git a/lang/check/type.go b/lang/check/type.go
index c6e8229..caedc7b 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -22,7 +22,41 @@
 	t "github.com/google/wuffs/lang/token"
 )
 
-func (q *checker) tcheckVars(block []*a.Node) error {
+type cpuArchBits uint32 // A bit set of CPU architectures; see calcCPUArchBits.
+
+const (
+	cpuArchBitsSSE128 = cpuArchBits(0x00000001) // Set by a cpu_arch assertion naming sse128.
+)
+
+func calcCPUArchBits(n *a.Func) (ret cpuArchBits) {
+	for _, o := range n.Asserts() {
+		o := o.AsAssert()
+		if !o.IsChooseCPUArch() {
+			continue // Not a choose-cpu-arch assertion, so it contributes no bits.
+		}
+		switch o.Condition().RHS().AsExpr().Ident() { // The RHS names the arch, e.g. sse128.
+		case t.IDSSE128:
+			ret |= cpuArchBitsSSE128
+		}
+	}
+	return ret // Zero when n has no recognized cpu_arch assertion.
+}
+
+func (q *checker) tcheckCPUArchBits(cab cpuArchBits, typ *a.TypeExpr) error {
+	if qid := typ.Innermost().QID(); qid[0] == t.IDBase { // Only base types can need CPU arch bits.
+		need := cpuArchBits(0)
+		switch qid[1] {
+		case t.IDSSE128I:
+			need = cpuArchBitsSSE128 // base.sse128_i needs the SSE128 bit.
+		}
+		if (cab & need) != need { // A needed bit is absent from cab.
+			return fmt.Errorf("check: missing cpu_arch for %q", typ.Innermost().Str(q.tm))
+		}
+	}
+	return nil
+}
+
+func (q *checker) tcheckVars(cab cpuArchBits, block []*a.Node) error {
 	for _, o := range block {
 		if o.Kind() != a.KVar {
 			break
@@ -44,6 +78,9 @@
 		if err := q.tcheckTypeExpr(o.XType(), 0); err != nil {
 			return err
 		}
+		if err := q.tcheckCPUArchBits(cab, o.XType()); err != nil {
+			return err
+		}
 		q.localVars[name] = o.XType()
 	}
 	return nil
@@ -549,15 +586,22 @@
 
 	genericType1 := (*a.TypeExpr)(nil)
 	genericType2 := (*a.TypeExpr)(nil)
-	switch f.Receiver() {
-	case t.QID{t.IDBase, t.IDDagger1}:
-		genericType1 = lhs.MType().Receiver()
-	case t.QID{t.IDBase, t.IDDagger2}:
-		genericType2 = lhs.MType().Receiver()
-		if genericType2.Decorator() != t.IDTable {
-			return fmt.Errorf("check: internal error: %q is not a generic table", genericType2.Str(q.tm))
+	if recv := f.Receiver(); recv[0] == t.IDBase {
+		switch recv[1] {
+		case t.IDDagger1:
+			genericType1 = lhs.MType().Receiver()
+		case t.IDDagger2:
+			genericType2 = lhs.MType().Receiver()
+			if genericType2.Decorator() != t.IDTable {
+				return fmt.Errorf("check: internal error: %q is not a generic table", genericType2.Str(q.tm))
+			}
+			genericType1 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, genericType2.Inner())
 		}
-		genericType1 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, genericType2.Inner())
+
+		if f.FuncName().IsBuiltInLoad() && (lhs.LHS().AsExpr().Operator() != 0) {
+			return fmt.Errorf(`check: %q receiver %q must be a local variable`,
+				f.QQID().Str(q.tm), lhs.LHS().AsExpr().Str(q.tm))
+		}
 	}
 
 	// Check that the func's in type matches the arguments.
diff --git a/lang/token/list.go b/lang/token/list.go
index a1fe302..c216ebf 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -114,6 +114,7 @@
 func (x ID) IsTightRight() bool { return x < ID(len(isTightRight)) && isTightRight[x] }
 
 func (x ID) IsAssign() bool         { return minAssign <= x && x <= maxAssign }
+func (x ID) IsBuiltInLoad() bool    { return minBuiltInLoad <= x && x <= maxBuiltInLoad }
 func (x ID) IsCannotAssignTo() bool { return minCannotAssignTo <= x && x <= maxCannotAssignTo }
 func (x ID) IsClose() bool          { return minClose <= x && x <= maxClose }
 func (x ID) IsKeyword() bool        { return minKeyword <= x && x <= maxKeyword }
@@ -646,6 +647,19 @@
 	IDSSE128I = ID(0x311)
 
 	// 0x32? are reserved for AVX256.
+
+	// --------
+
+	minBuiltInLoad = 0x380
+	maxBuiltInLoad = 0x387
+
+	IDLoadSlice = ID(0x380)
+	IDLoadU32   = ID(0x381)
+	IDLoadU64   = ID(0x382)
+
+	IDStoreSlice  = ID(0x388)
+	IDTruncateU32 = ID(0x389)
+	IDTruncateU64 = ID(0x38A)
 )
 
 var builtInsByID = [nBuiltInIDs]string{
@@ -1014,6 +1028,14 @@
 
 	IDSSE128:  "sse128",
 	IDSSE128I: "sse128_i",
+
+	IDLoadSlice: "load_slice",
+	IDLoadU32:   "load_u32",
+	IDLoadU64:   "load_u64",
+
+	IDStoreSlice:  "store_slice",
+	IDTruncateU32: "truncate_u32",
+	IDTruncateU64: "truncate_u64",
 }
 
 var builtInsByName = map[string]ID{}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 58cd01f..fb45879 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30976,6 +30976,8 @@
   uint8_t v_fa1 = 0;
   uint8_t v_fa2 = 0;
   uint8_t v_fa3 = 0;
+  uint32_t v_x32 = 0;
+  __m128i v_x128 = {0};
 
   {
     wuffs_base__slice_u8 i_slice_c = a_curr;
@@ -30983,6 +30985,10 @@
     v_c.len = 4;
     uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
     while (v_c.ptr < i_end0_c) {
+      (v_x128 = _mm_cvtsi32_si128((int)(v_x32)), wuffs_base__make_empty_struct());
+      v_x32 = ((uint32_t)(_mm_cvtsi128_si32(v_x128)));
+      if (v_x32 == 0) {
+      }
       v_fa0 += v_c.ptr[0];
       v_c.ptr[0] = v_fa0;
       v_fa1 += v_c.ptr[1];
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index aaabb58..c6f0b5e 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -19,13 +19,19 @@
 pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8),
 	choose cpu_arch >= sse128,
 {
-	var c   : slice base.u8
-	var fa0 : base.u8
-	var fa1 : base.u8
-	var fa2 : base.u8
-	var fa3 : base.u8
+	var c    : slice base.u8
+	var fa0  : base.u8
+	var fa1  : base.u8
+	var fa2  : base.u8
+	var fa3  : base.u8
+	var x32  : base.u32
+	var x128 : base.sse128_i
 
 	iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+		x128.load_u32!(a: x32)
+		x32 = x128.truncate_u32()
+		if x32 == 0 {
+		}
 		fa0 ~mod+= c[0]
 		c[0] = fa0
 		fa1 ~mod+= c[1]