Add arm_neon u8x8, u16x4, etc. types
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index d7424aa..3d3a24f 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -448,7 +448,11 @@
}
func (g *gen) writeBuiltinCPUArch(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
- switch recv.MType().QID()[1] {
+ id := recv.MType().QID()[1]
+ if id.IsBuiltInCPUArchARMNeon() {
+ return g.writeBuiltinCPUArchARMNeon(b, recv, method, args, sideEffectsOnly, depth)
+ }
+ switch id {
case t.IDX86SSE42Utility, t.IDX86M128I:
return g.writeBuiltinCPUArchX86(b, recv, method, args, sideEffectsOnly, depth)
}
@@ -768,6 +772,120 @@
return nil
}
+// writeBuiltinCPUArchARMNeon writes, to b, the C expression for a method call
+// on an ARM Neon receiver. writeBuiltinCPUArch dispatches here when the
+// receiver's type ID satisfies IsBuiltInCPUArchARMNeon.
+//
+// There are three cases:
+//   - A "make_etc" method becomes a vdup_n_etc call on args alone. The
+//     receiver is not written.
+//   - An "as_etc" method becomes the vreinterpret_etc (or vreinterpretq_etc)
+//     call selected by the receiver type and the method name.
+//   - Any other method is written under its own name, with recv as the
+//     first C argument, followed by args.
+//
+// It returns an error for a "make_etc" or "as_etc" method that has no known C
+// equivalent, or if writing a sub-expression fails.
+func (g *gen) writeBuiltinCPUArchARMNeon(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
+	methodStr := method.Str(g.tm)
+	if strings.HasPrefix(methodStr, "make_") {
+		fName := ""
+		switch methodStr {
+		case "make_u8x8_repeat":
+			fName = "vdup_n_u8"
+		case "make_u16x4_repeat":
+			fName = "vdup_n_u16"
+		case "make_u32x2_repeat":
+			fName = "vdup_n_u32"
+		case "make_u64x1_repeat":
+			fName = "vdup_n_u64"
+		default:
+			return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr)
+		}
+		b.printf("%s(", fName)
+		for i, o := range args {
+			if i > 0 {
+				b.writes(", ")
+			}
+			if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
+				return err
+			}
+		}
+		b.writes(")")
+		return nil
+	}
+
+	if strings.HasPrefix(methodStr, "as_") {
+		// Pick the C intrinsic from both the receiver type and the method
+		// name. An unrecognized pairing is an error, instead of writing a
+		// call to a C function that does not exist.
+		fName := ""
+		switch recv.MType().QID()[1] {
+		case t.IDARMNeonU8x8:
+			switch methodStr {
+			case "as_u16x4":
+				fName = "vreinterpret_u16_u8"
+			case "as_u32x2":
+				fName = "vreinterpret_u32_u8"
+			case "as_u64x1":
+				fName = "vreinterpret_u64_u8"
+			}
+		case t.IDARMNeonU16x4:
+			if methodStr == "as_u8x8" {
+				fName = "vreinterpret_u8_u16"
+			}
+		case t.IDARMNeonU32x2:
+			if methodStr == "as_u8x8" {
+				fName = "vreinterpret_u8_u32"
+			}
+		case t.IDARMNeonU64x1:
+			if methodStr == "as_u8x8" {
+				fName = "vreinterpret_u8_u64"
+			}
+		case t.IDARMNeonU8x16:
+			switch methodStr {
+			case "as_u16x8":
+				fName = "vreinterpretq_u16_u8"
+			case "as_u32x4":
+				fName = "vreinterpretq_u32_u8"
+			case "as_u64x2":
+				fName = "vreinterpretq_u64_u8"
+			}
+		case t.IDARMNeonU16x8:
+			if methodStr == "as_u8x16" {
+				fName = "vreinterpretq_u8_u16"
+			}
+		case t.IDARMNeonU32x4:
+			if methodStr == "as_u8x16" {
+				fName = "vreinterpretq_u8_u32"
+			}
+		case t.IDARMNeonU64x2:
+			if methodStr == "as_u8x16" {
+				fName = "vreinterpretq_u8_u64"
+			}
+		}
+		if fName == "" {
+			return fmt.Errorf("internal error: unsupported cpu_arch method %q", methodStr)
+		}
+		methodStr = fName
+	}
+
+	// The receiver is the first C argument; any args follow it.
+	b.writes(methodStr)
+	b.writes("(")
+	if err := g.writeExpr(b, recv, false, depth); err != nil {
+		return err
+	}
+	for _, o := range args {
+		b.writes(", ")
+		if err := g.writeExpr(b, o.AsArg().Value(), false, depth); err != nil {
+			return err
+		}
+	}
+	b.writes(")")
+	return nil
+}
+
func (g *gen) writeBuiltinCPUArchX86(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, sideEffectsOnly bool, depth uint32) error {
methodStr := method.Str(g.tm)
if strings.HasPrefix(methodStr, "make_") {
diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index 945a2bc..21d9457 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go
@@ -577,10 +577,18 @@
t.IDTokenReader: "wuffs_base__token_buffer*",
t.IDTokenWriter: "wuffs_base__token_buffer*",
- t.IDARMCRC32U32: "uint32_t",
- t.IDARMNeon64: "uint8x8_t",
- t.IDARMNeon128: "uint8x16_t",
- t.IDX86M128I: "__m128i",
+ t.IDARMCRC32U32: "uint32_t",
+ t.IDARMNeon64: "uint8x8_t",
+ t.IDARMNeon128: "uint8x16_t",
+ t.IDARMNeonU8x8: "uint8x8_t",
+ t.IDARMNeonU16x4: "uint16x4_t",
+ t.IDARMNeonU32x2: "uint32x2_t",
+ t.IDARMNeonU64x1: "uint64x1_t",
+ t.IDARMNeonU8x16: "uint8x16_t",
+ t.IDARMNeonU16x8: "uint16x8_t",
+ t.IDARMNeonU32x4: "uint32x4_t",
+ t.IDARMNeonU64x2: "uint64x2_t",
+ t.IDX86M128I: "__m128i",
}
const noSuchCOperator = " no_such_C_operator "
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index cdb089b..a607a47 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -251,12 +251,30 @@
"arm_neon_utility",
"arm_neon_64",
"arm_neon_128",
+ "arm_neon_u8x8",
+ "arm_neon_u16x4",
+ "arm_neon_u32x2",
+ "arm_neon_u64x1",
+ "arm_neon_u8x16",
+ "arm_neon_u16x8",
+ "arm_neon_u32x4",
+ "arm_neon_u64x2",
"x86_sse42_utility",
"x86_m128i",
}
-var Funcs = []string{
+var Funcs = [][]string{
+ funcsOther[:],
+ funcsARMNeon[:],
+}
+
+var funcsARMNeon = [...]string{
+ "arm_neon_u8x8.vadd_u8(b: arm_neon_u8x8) arm_neon_u8x8",
+ "arm_neon_u32x2.vget_lane_u32(b: u32[..= 1]) u32",
+}
+
+var funcsOther = [...]string{
"u8.high_bits(n: u32[..= 8]) u8",
"u8.low_bits(n: u32[..= 8]) u8",
"u8.max(a: u8) u8",
@@ -553,6 +571,29 @@
// ---- arm_neon_utility
+ "arm_neon_utility.make_u8x8_repeat(a: u8) arm_neon_u8x8",
+ "arm_neon_utility.make_u16x4_repeat(a: u16) arm_neon_u16x4",
+ "arm_neon_utility.make_u32x2_repeat(a: u32) arm_neon_u32x2",
+ "arm_neon_utility.make_u64x1_repeat(a: u64) arm_neon_u64x1",
+
+ // ---- arm_neon_uAxB.as_uCxD
+
+ "arm_neon_u8x8.as_u16x4() arm_neon_u16x4",
+ "arm_neon_u8x8.as_u32x2() arm_neon_u32x2",
+ "arm_neon_u8x8.as_u64x1() arm_neon_u64x1",
+
+ "arm_neon_u16x4.as_u8x8() arm_neon_u8x8",
+ "arm_neon_u32x2.as_u8x8() arm_neon_u8x8",
+ "arm_neon_u64x1.as_u8x8() arm_neon_u8x8",
+
+ "arm_neon_u8x16.as_u16x8() arm_neon_u16x8",
+ "arm_neon_u8x16.as_u32x4() arm_neon_u32x4",
+ "arm_neon_u8x16.as_u64x2() arm_neon_u64x2",
+
+ "arm_neon_u16x8.as_u8x16() arm_neon_u8x16",
+ "arm_neon_u32x4.as_u8x16() arm_neon_u8x16",
+ "arm_neon_u64x2.as_u8x16() arm_neon_u8x16",
+
// ---- arm_neon_64
// TODO: generate these methods automatically?
diff --git a/lang/check/check.go b/lang/check/check.go
index f549d1c..64e6cfd 100644
--- a/lang/check/check.go
+++ b/lang/check/check.go
@@ -100,8 +100,10 @@
unseenInterfaceImpls: map[t.QQID]*a.Func{},
}
- if err := c.parseBuiltInFuncs(nil, builtin.Funcs); err != nil {
- return nil, err
+ for _, funcs := range builtin.Funcs {
+ if err := c.parseBuiltInFuncs(nil, funcs); err != nil {
+ return nil, err
+ }
}
if err := c.parseBuiltInFuncs(c.builtInSliceFuncs, builtin.SliceFuncs); err != nil {
return nil, err
diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index 0502cd8..9605b25 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go
@@ -81,6 +81,14 @@
typeExprARMNeonUtility = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonUtility, nil, nil, nil)
typeExprARMNeon64 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeon64, nil, nil, nil)
typeExprARMNeon128 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeon128, nil, nil, nil)
+ typeExprARMNeonU8x8 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU8x8, nil, nil, nil)
+ typeExprARMNeonU16x4 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU16x4, nil, nil, nil)
+ typeExprARMNeonU32x2 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU32x2, nil, nil, nil)
+ typeExprARMNeonU64x1 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU64x1, nil, nil, nil)
+ typeExprARMNeonU8x16 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU8x16, nil, nil, nil)
+ typeExprARMNeonU16x8 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU16x8, nil, nil, nil)
+ typeExprARMNeonU32x4 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU32x4, nil, nil, nil)
+ typeExprARMNeonU64x2 = a.NewTypeExpr(0, t.IDBase, t.IDARMNeonU64x2, nil, nil, nil)
typeExprX86SSE42Utility = a.NewTypeExpr(0, t.IDBase, t.IDX86SSE42Utility, nil, nil, nil)
typeExprX86M128I = a.NewTypeExpr(0, t.IDBase, t.IDX86M128I, nil, nil, nil)
@@ -139,6 +147,14 @@
t.IDARMNeonUtility: typeExprARMNeonUtility,
t.IDARMNeon64: typeExprARMNeon64,
t.IDARMNeon128: typeExprARMNeon128,
+ t.IDARMNeonU8x8: typeExprARMNeonU8x8,
+ t.IDARMNeonU16x4: typeExprARMNeonU16x4,
+ t.IDARMNeonU32x2: typeExprARMNeonU32x2,
+ t.IDARMNeonU64x1: typeExprARMNeonU64x1,
+ t.IDARMNeonU8x16: typeExprARMNeonU8x16,
+ t.IDARMNeonU16x8: typeExprARMNeonU16x8,
+ t.IDARMNeonU32x4: typeExprARMNeonU32x4,
+ t.IDARMNeonU64x2: typeExprARMNeonU64x2,
t.IDX86SSE42Utility: typeExprX86SSE42Utility,
t.IDX86M128I: typeExprX86M128I,
diff --git a/lang/check/type.go b/lang/check/type.go
index 8b53343..ff3370b 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -54,7 +54,9 @@
switch qid[1] {
case t.IDARMCRC32Utility, t.IDARMCRC32U32:
need = cpuArchBitsARMCRC32
- case t.IDARMNeonUtility, t.IDARMNeon64, t.IDARMNeon128:
+ case t.IDARMNeonUtility, t.IDARMNeon64, t.IDARMNeon128,
+ t.IDARMNeonU8x8, t.IDARMNeonU16x4, t.IDARMNeonU32x2, t.IDARMNeonU64x1,
+ t.IDARMNeonU8x16, t.IDARMNeonU16x8, t.IDARMNeonU32x4, t.IDARMNeonU64x2:
need = cpuArchBitsARMNeon
case t.IDX86SSE42Utility, t.IDX86M128I:
need = cpuArchBitsX86SSE42
diff --git a/lang/token/list.go b/lang/token/list.go
index 539f3c0..acd6825 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -115,6 +115,9 @@
func (x ID) IsAssign() bool { return minAssign <= x && x <= maxAssign }
func (x ID) IsBuiltInCPUArch() bool { return minBuiltInCPUArch <= x && x <= maxBuiltInCPUArch }
+func (x ID) IsBuiltInCPUArchARMNeon() bool {
+ return minBuiltInCPUArchARMNeon <= x && x <= maxBuiltInCPUArchARMNeon // Inclusive range, nested within [minBuiltInCPUArch, maxBuiltInCPUArch].
+}
func (x ID) IsCannotAssignTo() bool { return minCannotAssignTo <= x && x <= maxCannotAssignTo }
func (x ID) IsClose() bool { return minClose <= x && x <= maxClose }
func (x ID) IsKeyword() bool { return minKeyword <= x && x <= maxKeyword }
@@ -674,32 +677,48 @@
// -------- 0x300 block.
- minBuiltInCPUArch = 0x300
- maxBuiltInCPUArch = 0x33F
+ minBuiltInCPUArch = 0x300
+ minBuiltInCPUArchARMNeon = 0x30E
+ maxBuiltInCPUArchARMNeon = 0x38F
+ maxBuiltInCPUArch = 0x3AF
// If adding more CPUArch utility types, also update IsEtcUtility.
IDARMCRC32 = ID(0x300)
IDARMCRC32Utility = ID(0x301)
- IDARMNeon = ID(0x302)
- IDARMNeonUtility = ID(0x303)
- IDARMCRC32U32 = ID(0x308)
+ IDARMCRC32U32 = ID(0x302)
- IDARMNeon64 = ID(0x310) // 64-bit D (double-word) register
- IDARMNeon128 = ID(0x311) // 128-bit Q ( quad-word) register
+ // Deprecated: superseded by the lane-typed IDARMNeonU8x8, IDARMNeonU8x16, etc. below.
+ IDARMNeon64 = ID(0x308)
+ IDARMNeon128 = ID(0x309)
- IDX86SSE42 = ID(0x320)
- IDX86SSE42Utility = ID(0x321)
- IDX86AVX2 = ID(0x322)
- IDX86AVX2Utility = ID(0x323)
+ IDARMNeon = ID(0x30E)
+ IDARMNeonUtility = ID(0x30F)
- IDX86M128I = ID(0x330)
+ // ARM Neon D register (64-bit double-word) types.
+ IDARMNeonU8x8 = ID(0x310)
+ IDARMNeonU16x4 = ID(0x311)
+ IDARMNeonU32x2 = ID(0x312)
+ IDARMNeonU64x1 = ID(0x313)
+
+ // ARM Neon Q register (128-bit quad-word) types.
+ IDARMNeonU8x16 = ID(0x320)
+ IDARMNeonU16x8 = ID(0x321)
+ IDARMNeonU32x4 = ID(0x322)
+ IDARMNeonU64x2 = ID(0x323)
+
+ IDX86SSE42 = ID(0x390)
+ IDX86SSE42Utility = ID(0x391)
+ IDX86AVX2 = ID(0x392)
+ IDX86AVX2Utility = ID(0x393)
+
+ IDX86M128I = ID(0x3A0)
// --------
- IDCreateSlice64 = ID(0x390)
- IDCreateSlice128 = ID(0x391)
+ IDCreateSlice64 = ID(0x3AE)
+ IDCreateSlice128 = ID(0x3AF)
)
var builtInsByID = [nBuiltInIDs]string{
@@ -1088,14 +1107,25 @@
IDARMCRC32: "arm_crc32",
IDARMCRC32Utility: "arm_crc32_utility",
- IDARMNeon: "arm_neon",
- IDARMNeonUtility: "arm_neon_utility",
IDARMCRC32U32: "arm_crc32_u32",
IDARMNeon64: "arm_neon_64",
IDARMNeon128: "arm_neon_128",
+ IDARMNeon: "arm_neon",
+ IDARMNeonUtility: "arm_neon_utility",
+
+ IDARMNeonU8x8: "arm_neon_u8x8",
+ IDARMNeonU16x4: "arm_neon_u16x4",
+ IDARMNeonU32x2: "arm_neon_u32x2",
+ IDARMNeonU64x1: "arm_neon_u64x1",
+
+ IDARMNeonU8x16: "arm_neon_u8x16",
+ IDARMNeonU16x8: "arm_neon_u16x8",
+ IDARMNeonU32x4: "arm_neon_u32x4",
+ IDARMNeonU64x2: "arm_neon_u64x2",
+
IDX86SSE42: "x86_sse42",
IDX86SSE42Utility: "x86_sse42_utility",
IDX86AVX2: "x86_avx2",
diff --git a/std/png/decode_filter_arm_neon.wuffs b/std/png/decode_filter_arm_neon.wuffs
index 4fe7e07..0885839 100644
--- a/std/png/decode_filter_arm_neon.wuffs
+++ b/std/png/decode_filter_arm_neon.wuffs
@@ -33,13 +33,14 @@
{
var curr : slice base.u8
- var fa : base.arm_neon_64
- var fx : base.arm_neon_64
+ var util : base.arm_neon_utility
+ var fa : base.arm_neon_u8x8
+ var fx : base.arm_neon_u8x8
iterate (curr = args.curr)(length: 4, advance: 4, unroll: 2) {
- fx = fx.create_vdup_n_u32(a: curr.peek_u32le())
+ fx = util.make_u32x2_repeat(a: curr.peek_u32le()).as_u8x8()
fx = fx.vadd_u8(b: fa)
- curr.poke_u32le!(a: fx.vget_lane_u32(b: 0))
+ curr.poke_u32le!(a: fx.as_u32x2().vget_lane_u32(b: 0))
fa = fx
}
}