Add arm_neon u8x8, u16x4, etc types
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index d7424aa..3d3a24f 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -448,7 +448,11 @@
 }
 
 func (g *gen) writeBuiltinCPUArch(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
-	switch recv.MType().QID()[1] {
+	id := recv.MType().QID()[1]
+	if id.IsBuiltInCPUArchARMNeon() {
+		return g.writeBuiltinCPUArchARMNeon(b, recv, method, args, sideEffectsOnly, depth)
+	}
+	switch id {
 	case t.IDX86SSE42Utility, t.IDX86M128I:
 		return g.writeBuiltinCPUArchX86(b, recv, method, args, sideEffectsOnly, depth)
 	}
@@ -768,6 +772,84 @@
 	return nil
 }
 
+// writeBuiltinCPUArchARMNeon writes the C expression for calling method on an
+// ARM Neon built-in receiver. A "make_etc" method becomes a vdup_n_etc call on
+// the args alone, an "as_etc" method becomes a vreinterpret{,q}_etc call on the
+// receiver and any other method name is written verbatim as the C function
+// name, with the receiver as its first argument. It returns an internal error
+// for "make_etc" or "as_etc" methods that have no known C counterpart.
+func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
+	methodStr := method.Str(g.tm)
+	if strings.HasPrefix(methodStr, "make_") {
+		fName := ""
+		switch methodStr {
+		case "make_u8x8_repeat":
+			fName = "vdup_n_u8"
+		case "make_u16x4_repeat":
+			fName = "vdup_n_u16"
+		case "make_u32x2_repeat":
+			fName = "vdup_n_u32"
+		case "make_u64x1_repeat":
+			fName = "vdup_n_u64"
+		default:
+			return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr)
+		}
+		b.printf("%s(", fName)
+		for i, o := range args {
+			if i > 0 {
+				b.writes(", ")
+			}
+			if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
+				return err
+			}
+		}
+		b.writes(")")
+		return nil
+	}
+
+	if strings.HasPrefix(methodStr, "as_") {
+		// Only reinterpreting between a u8 vector and a same-sized wider-lane
+		// vector is supported. Any other receiver and method pair is an error.
+		type asKey struct {
+			recv   t.ID
+			method string
+		}
+		cName, ok := map[asKey]string{
+			{t.IDARMNeonU8x8, "as_u16x4"}: "vreinterpret_u16_u8",
+			{t.IDARMNeonU8x8, "as_u32x2"}: "vreinterpret_u32_u8",
+			{t.IDARMNeonU8x8, "as_u64x1"}: "vreinterpret_u64_u8",
+			{t.IDARMNeonU16x4, "as_u8x8"}: "vreinterpret_u8_u16",
+			{t.IDARMNeonU32x2, "as_u8x8"}: "vreinterpret_u8_u32",
+			{t.IDARMNeonU64x1, "as_u8x8"}: "vreinterpret_u8_u64",
+
+			{t.IDARMNeonU8x16, "as_u16x8"}: "vreinterpretq_u16_u8",
+			{t.IDARMNeonU8x16, "as_u32x4"}: "vreinterpretq_u32_u8",
+			{t.IDARMNeonU8x16, "as_u64x2"}: "vreinterpretq_u64_u8",
+			{t.IDARMNeonU16x8, "as_u8x16"}: "vreinterpretq_u8_u16",
+			{t.IDARMNeonU32x4, "as_u8x16"}: "vreinterpretq_u8_u32",
+			{t.IDARMNeonU64x2, "as_u8x16"}: "vreinterpretq_u8_u64",
+		}[asKey{recv.MType().QID()[1], methodStr}]
+		if !ok {
+			return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr)
+		}
+		methodStr = cName
+	}
+
+	b.writes(methodStr)
+	b.writes("(")
+	if err := g.writeExpr(b, recv, false, depth); err != nil {
+		return err
+	}
+	for _, o := range args {
+		b.writes(", ")
+		if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
+			return err
+		}
+	}
+	b.writes(")")
+	return nil
+}
+
 func (g *gen) writeBuiltinCPUArchX86(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
 	methodStr := method.Str(g.tm)
 	if strings.HasPrefix(methodStr, "make_") {
diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index 945a2bc..21d9457 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go
@@ -577,10 +577,18 @@
 	t.IDTokenReader: "wuffs_base__token_buffer*",
 	t.IDTokenWriter: "wuffs_base__token_buffer*",
 
-	t.IDARMCRC32U32: "uint32_t",
-	t.IDARMNeon64:   "uint8x8_t",
-	t.IDARMNeon128:  "uint8x16_t",
-	t.IDX86M128I:    "__m128i",
+	t.IDARMCRC32U32:  "uint32_t",
+	t.IDARMNeon64:    "uint8x8_t",
+	t.IDARMNeon128:   "uint8x16_t",
+	t.IDARMNeonU8x8:  "uint8x8_t",
+	t.IDARMNeonU16x4: "uint16x4_t",
+	t.IDARMNeonU32x2: "uint32x2_t",
+	t.IDARMNeonU64x1: "uint64x1_t",
+	t.IDARMNeonU8x16: "uint8x16_t",
+	t.IDARMNeonU16x8: "uint16x8_t",
+	t.IDARMNeonU32x4: "uint32x4_t",
+	t.IDARMNeonU64x2: "uint64x2_t",
+	t.IDX86M128I:     "__m128i",
 }
 
 const noSuchCOperator = " no_such_C_operator "
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index cdb089b..a607a47 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -251,12 +251,30 @@
 	"arm_neon_utility",
 	"arm_neon_64",
 	"arm_neon_128",
+	"arm_neon_u8x8",
+	"arm_neon_u16x4",
+	"arm_neon_u32x2",
+	"arm_neon_u64x1",
+	"arm_neon_u8x16",
+	"arm_neon_u16x8",
+	"arm_neon_u32x4",
+	"arm_neon_u64x2",
 
 	"x86_sse42_utility",
 	"x86_m128i",
 }
 
-var Funcs = []string{
+// Funcs holds the built-in function signatures, one slice per group. Each
+// group is a slice of the correspondingly named array.
+var Funcs = [][]string{funcsOther[:], funcsARMNeon[:]}
+
+// funcsARMNeon lists methods on the typed (arm_neon_uAxB) ARM Neon vectors.
+var funcsARMNeon = [...]string{
+	"arm_neon_u8x8.vadd_u8(b: arm_neon_u8x8) arm_neon_u8x8",
+	"arm_neon_u32x2.vget_lane_u32(b: u32[..= 1]) u32",
+}
+
+var funcsOther = [...]string{
 	"u8.high_bits(n: u32[..= 8]) u8",
 	"u8.low_bits(n: u32[..= 8]) u8",
 	"u8.max(a: u8) u8",
@@ -553,6 +571,29 @@
 
 	// ---- arm_neon_utility
 
+	"arm_neon_utility.make_u8x8_repeat(a: u8) arm_neon_u8x8",
+	"arm_neon_utility.make_u16x4_repeat(a: u16) arm_neon_u16x4",
+	"arm_neon_utility.make_u32x2_repeat(a: u32) arm_neon_u32x2",
+	"arm_neon_utility.make_u64x1_repeat(a: u64) arm_neon_u64x1",
+
+	// ---- arm_neon_uAxB.as_uCxD
+
+	"arm_neon_u8x8.as_u16x4() arm_neon_u16x4",
+	"arm_neon_u8x8.as_u32x2() arm_neon_u32x2",
+	"arm_neon_u8x8.as_u64x1() arm_neon_u64x1",
+
+	"arm_neon_u16x4.as_u8x8() arm_neon_u8x8",
+	"arm_neon_u32x2.as_u8x8() arm_neon_u8x8",
+	"arm_neon_u64x1.as_u8x8() arm_neon_u8x8",
+
+	"arm_neon_u8x16.as_u16x8() arm_neon_u16x8",
+	"arm_neon_u8x16.as_u32x4() arm_neon_u32x4",
+	"arm_neon_u8x16.as_u64x2() arm_neon_u64x2",
+
+	"arm_neon_u16x8.as_u8x16() arm_neon_u8x16",
+	"arm_neon_u32x4.as_u8x16() arm_neon_u8x16",
+	"arm_neon_u64x2.as_u8x16() arm_neon_u8x16",
+
 	// ---- arm_neon_64
 
 	// TODO: generate these methods automatically?
diff --git a/lang/check/check.go b/lang/check/check.go
index f549d1c..64e6cfd 100644
--- a/lang/check/check.go
+++ b/lang/check/check.go
@@ -100,8 +100,10 @@
 		unseenInterfaceImpls:  map[t.QQID]*a.Func{},
 	}
 
-	if err := c.parseBuiltInFuncs(nil, builtin.Funcs); err != nil {
-		return nil, err
+	for _, funcs := range builtin.Funcs {
+		if err := c.parseBuiltInFuncs(nil, funcs); err != nil {
+			return nil, err
+		}
 	}
 	if err := c.parseBuiltInFuncs(c.builtInSliceFuncs, builtin.SliceFuncs); err != nil {
 		return nil, err
diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index 0502cd8..9605b25 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go
@@ -81,6 +81,14 @@
 	typeExprARMNeonUtility = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonUtility, nil, nil, nil)
 	typeExprARMNeon64      = a.NewTypeExpr(0, t.IDBase, t.IDARMNeon64, nil, nil, nil)
 	typeExprARMNeon128     = a.NewTypeExpr(0, t.IDBase, t.IDARMNeon128, nil, nil, nil)
+	typeExprARMNeonU8x8    = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU8x8, nil, nil, nil)
+	typeExprARMNeonU16x4   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU16x4, nil, nil, nil)
+	typeExprARMNeonU32x2   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU32x2, nil, nil, nil)
+	typeExprARMNeonU64x1   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU64x1, nil, nil, nil)
+	typeExprARMNeonU8x16   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU8x16, nil, nil, nil)
+	typeExprARMNeonU16x8   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU16x8, nil, nil, nil)
+	typeExprARMNeonU32x4   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU32x4, nil, nil, nil)
+	typeExprARMNeonU64x2   = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU64x2, nil, nil, nil)
 
 	typeExprX86SSE42Utility = a.NewTypeExpr(0, t.IDBase, t.IDX86SSE42Utility, nil, nil, nil)
 	typeExprX86M128I        = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
@@ -139,6 +147,14 @@
 	t.IDARMNeonUtility: typeExprARMNeonUtility,
 	t.IDARMNeon64:      typeExprARMNeon64,
 	t.IDARMNeon128:     typeExprARMNeon128,
+	t.IDARMNeonU8x8:    typeExprARMNeonU8x8,
+	t.IDARMNeonU16x4:   typeExprARMNeonU16x4,
+	t.IDARMNeonU32x2:   typeExprARMNeonU32x2,
+	t.IDARMNeonU64x1:   typeExprARMNeonU64x1,
+	t.IDARMNeonU8x16:   typeExprARMNeonU8x16,
+	t.IDARMNeonU16x8:   typeExprARMNeonU16x8,
+	t.IDARMNeonU32x4:   typeExprARMNeonU32x4,
+	t.IDARMNeonU64x2:   typeExprARMNeonU64x2,
 
 	t.IDX86SSE42Utility: typeExprX86SSE42Utility,
 	t.IDX86M128I:        typeExprX86M128I,
diff --git a/lang/check/type.go b/lang/check/type.go
index 8b53343..ff3370b 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -54,7 +54,9 @@
 		switch qid[1] {
 		case t.IDARMCRC32Utility, t.IDARMCRC32U32:
 			need = cpuArchBitsARMCRC32
-		case t.IDARMNeonUtility, t.IDARMNeon64, t.IDARMNeon128:
+		case t.IDARMNeonUtility, t.IDARMNeon64, t.IDARMNeon128,
+			t.IDARMNeonU8x8, t.IDARMNeonU16x4, t.IDARMNeonU32x2, t.IDARMNeonU64x1,
+			t.IDARMNeonU8x16, t.IDARMNeonU16x8, t.IDARMNeonU32x4, t.IDARMNeonU64x2:
 			need = cpuArchBitsARMNeon
 		case t.IDX86SSE42Utility, t.IDX86M128I:
 			need = cpuArchBitsX86SSE42
diff --git a/lang/token/list.go b/lang/token/list.go
index 539f3c0..acd6825 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -115,6 +115,9 @@
 
 func (x ID) IsAssign() bool         { return minAssign <= x && x <= maxAssign }
 func (x ID) IsBuiltInCPUArch() bool { return minBuiltInCPUArch <= x && x <= maxBuiltInCPUArch }
+func (x ID) IsBuiltInCPUArchARMNeon() bool {
+	return x >= minBuiltInCPUArchARMNeon && x <= maxBuiltInCPUArchARMNeon
+}
 func (x ID) IsCannotAssignTo() bool { return minCannotAssignTo <= x && x <= maxCannotAssignTo }
 func (x ID) IsClose() bool          { return minClose <= x && x <= maxClose }
 func (x ID) IsKeyword() bool        { return minKeyword <= x && x <= maxKeyword }
@@ -674,32 +677,48 @@
 
 	// -------- 0x300 block.
 
-	minBuiltInCPUArch = 0x300
-	maxBuiltInCPUArch = 0x33F
+	minBuiltInCPUArch        = 0x300
+	minBuiltInCPUArchARMNeon = 0x30E
+	maxBuiltInCPUArchARMNeon = 0x38F
+	maxBuiltInCPUArch        = 0x3AF
 
 	// If adding more CPUArch utility types, also update IsEtcUtility.
 
 	IDARMCRC32        = ID(0x300)
 	IDARMCRC32Utility = ID(0x301)
-	IDARMNeon         = ID(0x302)
-	IDARMNeonUtility  = ID(0x303)
 
-	IDARMCRC32U32 = ID(0x308)
+	IDARMCRC32U32 = ID(0x302)
 
-	IDARMNeon64  = ID(0x310) //  64-bit D (double-word) register
-	IDARMNeon128 = ID(0x311) // 128-bit Q (  quad-word) register
+	// Deprecated.
+	IDARMNeon64  = ID(0x308)
+	IDARMNeon128 = ID(0x309)
 
-	IDX86SSE42        = ID(0x320)
-	IDX86SSE42Utility = ID(0x321)
-	IDX86AVX2         = ID(0x322)
-	IDX86AVX2Utility  = ID(0x323)
+	IDARMNeon        = ID(0x30E)
+	IDARMNeonUtility = ID(0x30F)
 
-	IDX86M128I = ID(0x330)
+	// ARM Neon D register (64-bit double-word) types.
+	IDARMNeonU8x8  = ID(0x310)
+	IDARMNeonU16x4 = ID(0x311)
+	IDARMNeonU32x2 = ID(0x312)
+	IDARMNeonU64x1 = ID(0x313)
+
+	// ARM Neon Q register (128-bit quad-word) types.
+	IDARMNeonU8x16 = ID(0x320)
+	IDARMNeonU16x8 = ID(0x321)
+	IDARMNeonU32x4 = ID(0x322)
+	IDARMNeonU64x2 = ID(0x323)
+
+	IDX86SSE42        = ID(0x390)
+	IDX86SSE42Utility = ID(0x391)
+	IDX86AVX2         = ID(0x392)
+	IDX86AVX2Utility  = ID(0x393)
+
+	IDX86M128I = ID(0x3A0)
 
 	// --------
 
-	IDCreateSlice64  = ID(0x390)
-	IDCreateSlice128 = ID(0x391)
+	IDCreateSlice64  = ID(0x3AE)
+	IDCreateSlice128 = ID(0x3AF)
 )
 
 var builtInsByID = [nBuiltInIDs]string{
@@ -1088,14 +1107,25 @@
 
 	IDARMCRC32:        "arm_crc32",
 	IDARMCRC32Utility: "arm_crc32_utility",
-	IDARMNeon:         "arm_neon",
-	IDARMNeonUtility:  "arm_neon_utility",
 
 	IDARMCRC32U32: "arm_crc32_u32",
 
 	IDARMNeon64:  "arm_neon_64",
 	IDARMNeon128: "arm_neon_128",
 
+	IDARMNeon:        "arm_neon",
+	IDARMNeonUtility: "arm_neon_utility",
+
+	IDARMNeonU8x8:  "arm_neon_u8x8",
+	IDARMNeonU16x4: "arm_neon_u16x4",
+	IDARMNeonU32x2: "arm_neon_u32x2",
+	IDARMNeonU64x1: "arm_neon_u64x1",
+
+	IDARMNeonU8x16: "arm_neon_u8x16",
+	IDARMNeonU16x8: "arm_neon_u16x8",
+	IDARMNeonU32x4: "arm_neon_u32x4",
+	IDARMNeonU64x2: "arm_neon_u64x2",
+
 	IDX86SSE42:        "x86_sse42",
 	IDX86SSE42Utility: "x86_sse42_utility",
 	IDX86AVX2:         "x86_avx2",
diff --git a/std/png/decode_filter_arm_neon.wuffs b/std/png/decode_filter_arm_neon.wuffs
index 4fe7e07..0885839 100644
--- a/std/png/decode_filter_arm_neon.wuffs
+++ b/std/png/decode_filter_arm_neon.wuffs
@@ -33,13 +33,14 @@
 {
 	var curr : slice base.u8
 
-	var fa : base.arm_neon_64
-	var fx : base.arm_neon_64
+	var util : base.arm_neon_utility
+	var fa   : base.arm_neon_u8x8
+	var fx   : base.arm_neon_u8x8
 
 	iterate (curr = args.curr)(length: 4, advance: 4, unroll: 2) {
-		fx = fx.create_vdup_n_u32(a: curr.peek_u32le())
+		fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
 		fx = fx.vadd_u8(b: fa)
-		curr.poke_u32le!(a: fx.vget_lane_u32(b: 0))
+		curr.poke_u32le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
 		fa = fx
 	}
 }