Add sse128_i.load_slice128
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index b525474..f3e22d2 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -446,8 +446,7 @@
 
 func (g *gen) writeBuiltinCPUArch(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
 	switch method {
-	case t.IDLoadU32:
-		// TODO: ensure that the receiver is a variable, not an arbitrary expression.
+	case t.IDLoadU32, t.IDLoadU64, t.IDLoadSlice128:
 		if !sideEffectsOnly {
 			// Generate a two part expression using the comma operator: "(etc,
 			// return_empty_struct call)". The final part is a function call
@@ -458,22 +457,66 @@
 		if err := g.writeExpr(b, recv, false, depth); err != nil {
 			return err
 		}
-		b.writes(" = _mm_cvtsi32_si128((int)(")
+
+		switch method {
+		case t.IDLoadU32:
+			b.writes(" = _mm_cvtsi32_si128((int)(")
+		case t.IDLoadU64:
+			b.writes(" = _mm_cvtsi64_si128((int64_t)(")
+		case t.IDLoadSlice128:
+			b.writes(" = _mm_lddqu_si128((const __m128i*)(const void*)(")
+		}
+
 		if err := g.writeExpr(b, args[0].AsArg().Value(), false, depth); err != nil {
 			return err
 		}
-		b.writes("))")
+
+		switch method {
+		case t.IDLoadSlice128:
+			b.writes(".ptr))")
+		default:
+			b.writes("))")
+		}
+
 		if !sideEffectsOnly {
 			b.writes(", wuffs_base__make_empty_struct())")
 		}
 		return nil
 
-	case t.IDTruncateU32:
-		b.writes("((uint32_t)(_mm_cvtsi128_si32(")
+	case t.IDTruncateU32, t.IDTruncateU64, t.IDStoreSlice128:
+		switch method {
+		case t.IDTruncateU32:
+			b.writes("((uint32_t)(_mm_cvtsi128_si32(")
+		case t.IDTruncateU64:
+			b.writes("((uint64_t)(_mm_cvtsi128_si64(")
+		case t.IDStoreSlice128:
+			if !sideEffectsOnly {
+				// Generate a two part expression using the comma operator: "(etc,
+				// return_empty_struct call)". The final part is a function call
+				// (to a static inline function) instead of a struct literal, to
+				// avoid an "expression result unused" compiler error.
+				b.writes("(")
+			}
+			b.writes("_mm_storeu_si128((__m128i*)(void*)(")
+			if err := g.writeExpr(b, args[0].AsArg().Value(), false, depth); err != nil {
+				return err
+			}
+			b.writes(".ptr), ")
+		}
+
 		if err := g.writeExpr(b, recv, false, depth); err != nil {
 			return err
 		}
-		b.writes(")))")
+
+		switch method {
+		case t.IDStoreSlice128:
+			b.writes(")")
+			if !sideEffectsOnly {
+				b.writes(", wuffs_base__make_empty_struct())")
+			}
+		default:
+			b.writes(")))")
+		}
 		return nil
 	}
 
@@ -503,9 +546,16 @@
 		}
 		for _, o := range args {
 			b.writes(", ")
-			if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
+			after := ""
+			v := o.AsArg().Value()
+			if !v.MType().IsCPUArchType() {
+				b.writes("(int32_t)(")
+				after = ")"
+			}
+			if err := g.writeExpr(b, v, false, depth); err != nil {
 				return err
 			}
+			b.writes(after)
 		}
 	}
 	b.writes(")")
diff --git a/lang/ast/ast.go b/lang/ast/ast.go
index fd462f5..a6a5b24 100644
--- a/lang/ast/ast.go
+++ b/lang/ast/ast.go
@@ -797,6 +797,10 @@
 	return n.id0 == 0 && n.id1 == t.IDBase && n.id2 == t.IDBool
 }
 
+func (n *TypeExpr) IsCPUArchType() bool {
+	return n.id0 == 0 && n.id1 == t.IDBase && n.id2.IsBuiltInCPUArch()
+}
+
 func (n *TypeExpr) IsIdeal() bool {
 	return n.id0 == 0 && n.id1 == t.IDBase && n.id2 == t.IDQIdeal
 }
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index a21803f..1e13710 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -534,10 +534,12 @@
 
 	// ---- sse128_i
 
-	"sse128_i.load_slice!(a: slice base.u8)",
 	"sse128_i.load_u32!(a: u32)",
-	"sse128_i.store_slice!(a: slice base.u8)",
+	"sse128_i.load_u64!(a: u64)",
+	"sse128_i.load_slice128!(a: slice base.u8)",
 	"sse128_i.truncate_u32() u32",
+	"sse128_i.truncate_u64() u64",
+	"sse128_i.store_slice128!(a: slice base.u8)",
 
 	// TODO: generate these methods automatically?
 
@@ -552,6 +554,8 @@
 	"sse128_i._mm_cmpeq_epi16(b: sse128_i) sse128_i",
 	"sse128_i._mm_min_epi16(b: sse128_i) sse128_i",
 	"sse128_i._mm_packus_epi16(b: sse128_i) sse128_i",
+	"sse128_i._mm_slli_si128(imm8: u32) sse128_i",
+	"sse128_i._mm_srli_si128(imm8: u32) sse128_i",
 	"sse128_i._mm_sub_epi16(b: sse128_i) sse128_i",
 	"sse128_i._mm_sub_epi8(b: sse128_i) sse128_i",
 	"sse128_i._mm_unpacklo_epi8(b: sse128_i) sse128_i",
diff --git a/lang/check/bounds.go b/lang/check/bounds.go
index ffcd7a8..423860d 100644
--- a/lang/check/bounds.go
+++ b/lang/check/bounds.go
@@ -1153,10 +1153,21 @@
 				advance, update = au.advance, au.update
 			}
 		}
+
+	} else if recvTyp.IsCPUArchType() {
+		if method >= t.IDLoadSlice128 {
+			if m := method - t.IDLoadSlice128; m < t.ID(len(lsMethodAdvances)) {
+				advance, update = lsMethodAdvances[m], false
+			}
+		}
 	}
 
 	if (advance != nil) || (advanceExpr != nil) {
-		if ok, err := q.optimizeIOMethodAdvance(recv, advance, advanceExpr, update); err != nil {
+		subject := recv
+		if recv.MType().IsCPUArchType() {
+			subject = n.Args()[0].AsArg().Value()
+		}
+		if ok, err := q.optimizeIOMethodAdvance(subject, advance, advanceExpr, update); err != nil {
 			return bounds{}, err
 		} else if !ok {
 			adv := ""
@@ -1166,9 +1177,9 @@
 				adv = advanceExpr.Str(q.tm)
 			}
 			return bounds{}, fmt.Errorf("check: could not prove %s pre-condition: %s.length() >= %s",
-				method.Str(q.tm), recv.Str(q.tm), adv)
+				method.Str(q.tm), subject.Str(q.tm), adv)
 		}
-		// TODO: drop other recv-related facts?
+		// TODO: drop other subject-related facts?
 	}
 
 	return bounds{}, errNotASpecialCase
@@ -1363,6 +1374,18 @@
 	t.IDWriteExtendedTokenFast - t.IDPeekU8: {one, true},
 }
 
+var lsMethodAdvances = [...]*big.Int{
+	// 128 bits is 16 bytes. 256 bits is 32 bytes. 512 bits is 64 bytes.
+	// Indexed by (method - t.IDLoadSlice128). Unlisted indexes are nil, meaning no length pre-condition.
+	t.IDLoadSlice128 - t.IDLoadSlice128: sixteen,
+	t.IDLoadSlice256 - t.IDLoadSlice128: thirtyTwo,
+	t.IDLoadSlice512 - t.IDLoadSlice128: sixtyFour,
+
+	t.IDStoreSlice128 - t.IDLoadSlice128: sixteen,
+	t.IDStoreSlice256 - t.IDLoadSlice128: thirtyTwo,
+	t.IDStoreSlice512 - t.IDLoadSlice128: sixtyFour,
+}
+
 func makeConstValueExpr(tm *t.Map, cv *big.Int) (*a.Expr, error) {
 	id, err := tm.Insert(cv.String())
 	if err != nil {
diff --git a/lang/token/list.go b/lang/token/list.go
index ae05d9c..0ab1da5 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -114,6 +114,7 @@
 func (x ID) IsTightRight() bool { return x < ID(len(isTightRight)) && isTightRight[x] }
 
 func (x ID) IsAssign() bool         { return minAssign <= x && x <= maxAssign }
+func (x ID) IsBuiltInCPUArch() bool { return minBuiltInCPUArch <= x && x <= maxBuiltInCPUArch }
 func (x ID) IsBuiltInLoad() bool    { return minBuiltInLoad <= x && x <= maxBuiltInLoad }
 func (x ID) IsCannotAssignTo() bool { return minCannotAssignTo <= x && x <= maxCannotAssignTo }
 func (x ID) IsClose() bool          { return minClose <= x && x <= maxClose }
@@ -659,6 +660,9 @@
 
 	// -------- 0x300 block.
 
+	minBuiltInCPUArch = 0x300
+	maxBuiltInCPUArch = 0x32F
+
 	// 0x30? are reserved for NEON.
 
 	IDSSE128  = ID(0x310)
@@ -671,13 +675,17 @@
 	minBuiltInLoad = 0x380
 	maxBuiltInLoad = 0x387
 
-	IDLoadSlice = ID(0x380)
-	IDLoadU32   = ID(0x381)
-	IDLoadU64   = ID(0x382)
+	IDLoadU32      = ID(0x380)
+	IDLoadU64      = ID(0x381)
+	IDLoadSlice128 = ID(0x382)
+	IDLoadSlice256 = ID(0x383)
+	IDLoadSlice512 = ID(0x384)
 
-	IDStoreSlice  = ID(0x388)
-	IDTruncateU32 = ID(0x389)
-	IDTruncateU64 = ID(0x38A)
+	IDTruncateU32   = ID(0x388)
+	IDTruncateU64   = ID(0x389)
+	IDStoreSlice128 = ID(0x38A)
+	IDStoreSlice256 = ID(0x38B)
+	IDStoreSlice512 = ID(0x38C)
 
 	IDCreateMMSet1EPI8 = ID(0x390)
 )
@@ -1067,13 +1075,17 @@
 	IDSSE128:  "sse128",
 	IDSSE128I: "sse128_i",
 
-	IDLoadSlice: "load_slice",
-	IDLoadU32:   "load_u32",
-	IDLoadU64:   "load_u64",
+	IDLoadU32:      "load_u32",
+	IDLoadU64:      "load_u64",
+	IDLoadSlice128: "load_slice128",
+	IDLoadSlice256: "load_slice256",
+	IDLoadSlice512: "load_slice512",
 
-	IDStoreSlice:  "store_slice",
-	IDTruncateU32: "truncate_u32",
-	IDTruncateU64: "truncate_u64",
+	IDTruncateU32:   "truncate_u32",
+	IDTruncateU64:   "truncate_u64",
+	IDStoreSlice128: "store_slice128",
+	IDStoreSlice256: "store_slice256",
+	IDStoreSlice512: "store_slice512",
 
 	IDCreateMMSet1EPI8: "create_mm_set1_epi8",
 }