Add base.sse128_i
diff --git a/internal/cgen/builtin.go b/internal/cgen/builtin.go
index 6e24158..6ba7062 100644
--- a/internal/cgen/builtin.go
+++ b/internal/cgen/builtin.go
@@ -141,6 +141,8 @@
b.writes("&empty_io_buffer")
return nil
}
+ case t.IDSSE128I:
+ return g.writeBuiltinCPUArch(b, recv, method.Ident(), n.Args(), depth)
}
}
return errNoSuchBuiltin
@@ -424,6 +426,38 @@
return g.writeBuiltinIO(b, recv, method, args, depth)
}
+func (g *gen) writeBuiltinCPUArch(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, depth uint32) error { // Writes to b the C expression for a method call on a base.sse128_i receiver; returns errNoSuchBuiltin for any other method.
+ switch method {
+ case t.IDLoadU32:
+ // Writes "(recv = _mm_cvtsi32_si128((int)(arg0)), wuffs_base__make_empty_struct())",
+ // a two part expression using the comma operator. The final part is a function
+ // call (to a static inline function) instead of a struct literal, to avoid a
+ // "expression result unused" compiler error.
+ // TODO: ensure that the receiver is a variable, not an arbitrary expression.
+ // NOTE(review): lang/check's IsBuiltInLoad receiver test may already do this; confirm.
+ b.writes("(")
+ if err := g.writeExpr(b, recv, depth); err != nil {
+ return err
+ }
+ b.writes(" = _mm_cvtsi32_si128((int)(")
+ if err := g.writeExpr(b, args[0].AsArg().Value(), depth); err != nil {
+ return err
+ }
+ b.writes(")), wuffs_base__make_empty_struct())")
+ return nil
+
+ case t.IDTruncateU32: // Writes "((uint32_t)(_mm_cvtsi128_si32(recv)))".
+ b.writes("((uint32_t)(_mm_cvtsi128_si32(")
+ if err := g.writeExpr(b, recv, depth); err != nil {
+ return err
+ }
+ b.writes(")))")
+ return nil
+ }
+
+ return errNoSuchBuiltin
+}
+
func (g *gen) writeBuiltinNumType(b *buffer, recv *a.Expr, method t.ID, args []*a.Node, depth uint32) error {
switch method {
case t.IDLowBits:
diff --git a/internal/cgen/expr.go b/internal/cgen/expr.go
index a49e8f1..cd9096b 100644
--- a/internal/cgen/expr.go
+++ b/internal/cgen/expr.go
@@ -572,6 +572,8 @@
t.IDIOWriter: "wuffs_base__io_buffer*",
t.IDTokenReader: "wuffs_base__token_buffer*",
t.IDTokenWriter: "wuffs_base__token_buffer*",
+
+ t.IDSSE128I: "__m128i",
}
const noSuchCOperator = " no_such_C_operator "
diff --git a/lang/builtin/builtin.go b/lang/builtin/builtin.go
index 4a4387e..84253fe 100644
--- a/lang/builtin/builtin.go
+++ b/lang/builtin/builtin.go
@@ -236,6 +236,10 @@
"pixel_swizzler",
"decode_frame_options",
+
+ // ----
+
+ "sse128_i",
}
var Funcs = []string{
@@ -523,6 +527,13 @@
"dst: slice u8, dst_palette: slice u8, src: slice u8) u64",
"pixel_swizzler.swizzle_interleaved_transparent_black!(" +
"dst: slice u8, dst_palette: slice u8, num_pixels: u64) u64",
+
+ // ---- sse128_i
+
+ "sse128_i.load_slice!(a: slice base.u8)",
+ "sse128_i.load_u32!(a: u32)",
+ "sse128_i.store_slice!(a: slice base.u8)",
+ "sse128_i.truncate_u32() u32",
}
var Interfaces = []string{
diff --git a/lang/check/check.go b/lang/check/check.go
index 94be556..610f9d6 100644
--- a/lang/check/check.go
+++ b/lang/check/check.go
@@ -681,7 +681,7 @@
}
// Fill in the TypeMap with all local variables.
- if err := q.tcheckVars(n.Body()); err != nil {
+ if err := q.tcheckVars(calcCPUArchBits(q.astFunc), n.Body()); err != nil {
return &Error{
Err: err,
Filename: q.errFilename,
diff --git a/lang/check/resolve.go b/lang/check/resolve.go
index 2ae723a..de615b2 100644
--- a/lang/check/resolve.go
+++ b/lang/check/resolve.go
@@ -75,6 +75,8 @@
typeExprDecodeFrameOptions = a.NewTypeExpr(0, t.IDBase, t.IDDecodeFrameOptions, nil, nil, nil)
+ typeExprSSE128I = a.NewTypeExpr(0, t.IDBase, t.IDSSE128I, nil, nil, nil)
+
typeExprSliceU8 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, typeExprU8)
typeExprTableU8 = a.NewTypeExpr(t.IDTable, 0, 0, nil, nil, typeExprU8)
)
@@ -122,6 +124,8 @@
t.IDPixelSwizzler: typeExprPixelSwizzler,
t.IDDecodeFrameOptions: typeExprDecodeFrameOptions,
+
+ t.IDSSE128I: typeExprSSE128I,
}
func (c *Checker) parseBuiltInFuncs(m map[t.QQID]*a.Func, ss []string) error {
diff --git a/lang/check/type.go b/lang/check/type.go
index c6e8229..caedc7b 100644
--- a/lang/check/type.go
+++ b/lang/check/type.go
@@ -22,7 +22,41 @@
t "github.com/google/wuffs/lang/token"
)
-func (q *checker) tcheckVars(block []*a.Node) error {
+type cpuArchBits uint32 // cpuArchBits is a bit set of the CPU architectures named by a func's "choose cpu_arch" assertions.
+
+const (
+ cpuArchBitsSSE128 = cpuArchBits(0x00000001) // Set by calcCPUArchBits when such an assertion's right hand side is sse128.
+)
+
+func calcCPUArchBits(n *a.Func) (ret cpuArchBits) { // Returns the union of the cpuArchBits selected by n's "choose cpu_arch" assertions; zero if there are none.
+ for _, o := range n.Asserts() {
+ o := o.AsAssert() // Shadows the loop variable with its assert-typed view.
+ if !o.IsChooseCPUArch() {
+ continue // Every other kind of assertion contributes no bits.
+ }
+ switch o.Condition().RHS().AsExpr().Ident() { // The arch name, e.g. the sse128 in "choose cpu_arch >= sse128".
+ case t.IDSSE128:
+ ret |= cpuArchBitsSSE128
+ }
+ }
+ return ret
+}
+
+func (q *checker) tcheckCPUArchBits(cab cpuArchBits, typ *a.TypeExpr) error { // Returns an error if typ's innermost type is a base type that needs cpu_arch bits missing from cab.
+ if qid := typ.Innermost().QID(); qid[0] == t.IDBase {
+ need := cpuArchBits(0) // Stays zero, so the test below always passes, unless a case matches.
+ switch qid[1] {
+ case t.IDSSE128I:
+ need = cpuArchBitsSSE128
+ }
+ if (cab & need) != need { // At least one needed bit is absent from cab.
+ return fmt.Errorf("check: missing cpu_arch for %q", typ.Innermost().Str(q.tm))
+ }
+ }
+ return nil
+}
+
+func (q *checker) tcheckVars(cab cpuArchBits, block []*a.Node) error {
for _, o := range block {
if o.Kind() != a.KVar {
break
@@ -44,6 +78,9 @@
if err := q.tcheckTypeExpr(o.XType(), 0); err != nil {
return err
}
+ if err := q.tcheckCPUArchBits(cab, o.XType()); err != nil {
+ return err
+ }
q.localVars[name] = o.XType()
}
return nil
@@ -549,15 +586,22 @@
genericType1 := (*a.TypeExpr)(nil)
genericType2 := (*a.TypeExpr)(nil)
- switch f.Receiver() {
- case t.QID{t.IDBase, t.IDDagger1}:
- genericType1 = lhs.MType().Receiver()
- case t.QID{t.IDBase, t.IDDagger2}:
- genericType2 = lhs.MType().Receiver()
- if genericType2.Decorator() != t.IDTable {
- return fmt.Errorf("check: internal error: %q is not a generic table", genericType2.Str(q.tm))
+ if recv := f.Receiver(); recv[0] == t.IDBase {
+ switch recv[1] {
+ case t.IDDagger1:
+ genericType1 = lhs.MType().Receiver()
+ case t.IDDagger2:
+ genericType2 = lhs.MType().Receiver()
+ if genericType2.Decorator() != t.IDTable {
+ return fmt.Errorf("check: internal error: %q is not a generic table", genericType2.Str(q.tm))
+ }
+ genericType1 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, genericType2.Inner())
}
- genericType1 = a.NewTypeExpr(t.IDSlice, 0, 0, nil, nil, genericType2.Inner())
+
+ if f.FuncName().IsBuiltInLoad() && (lhs.LHS().AsExpr().Operator() != 0) {
+ return fmt.Errorf(`check: %q receiver %q must be a local variable`,
+ f.QQID().Str(q.tm), lhs.LHS().AsExpr().Str(q.tm))
+ }
}
// Check that the func's in type matches the arguments.
diff --git a/lang/token/list.go b/lang/token/list.go
index a1fe302..c216ebf 100644
--- a/lang/token/list.go
+++ b/lang/token/list.go
@@ -114,6 +114,7 @@
func (x ID) IsTightRight() bool { return x < ID(len(isTightRight)) && isTightRight[x] }
func (x ID) IsAssign() bool { return minAssign <= x && x <= maxAssign }
+func (x ID) IsBuiltInLoad() bool { return minBuiltInLoad <= x && x <= maxBuiltInLoad } // Range holds IDLoadSlice, IDLoadU32 and IDLoadU64.
func (x ID) IsCannotAssignTo() bool { return minCannotAssignTo <= x && x <= maxCannotAssignTo }
func (x ID) IsClose() bool { return minClose <= x && x <= maxClose }
func (x ID) IsKeyword() bool { return minKeyword <= x && x <= maxKeyword }
@@ -646,6 +647,19 @@
IDSSE128I = ID(0x311)
// 0x32? are reserved for AVX256.
+
+ // --------
+
+ minBuiltInLoad = 0x380
+ maxBuiltInLoad = 0x387
+
+ IDLoadSlice = ID(0x380)
+ IDLoadU32 = ID(0x381)
+ IDLoadU64 = ID(0x382)
+
+ IDStoreSlice = ID(0x388)
+ IDTruncateU32 = ID(0x389)
+ IDTruncateU64 = ID(0x38A)
)
var builtInsByID = [nBuiltInIDs]string{
@@ -1014,6 +1028,14 @@
IDSSE128: "sse128",
IDSSE128I: "sse128_i",
+
+ IDLoadSlice: "load_slice",
+ IDLoadU32: "load_u32",
+ IDLoadU64: "load_u64",
+
+ IDStoreSlice: "store_slice",
+ IDTruncateU32: "truncate_u32",
+ IDTruncateU64: "truncate_u64",
}
var builtInsByName = map[string]ID{}
diff --git a/release/c/wuffs-unsupported-snapshot.c b/release/c/wuffs-unsupported-snapshot.c
index 58cd01f..fb45879 100644
--- a/release/c/wuffs-unsupported-snapshot.c
+++ b/release/c/wuffs-unsupported-snapshot.c
@@ -30976,6 +30976,8 @@
uint8_t v_fa1 = 0;
uint8_t v_fa2 = 0;
uint8_t v_fa3 = 0;
+ uint32_t v_x32 = 0;
+ __m128i v_x128 = {0};
{
wuffs_base__slice_u8 i_slice_c = a_curr;
@@ -30983,6 +30985,10 @@
v_c.len = 4;
uint8_t* i_end0_c = i_slice_c.ptr + ((i_slice_c.len / 4) * 4);
while (v_c.ptr < i_end0_c) {
+ (v_x128 = _mm_cvtsi32_si128((int)(v_x32)), wuffs_base__make_empty_struct());
+ v_x32 = ((uint32_t)(_mm_cvtsi128_si32(v_x128)));
+ if (v_x32 == 0) {
+ }
v_fa0 += v_c.ptr[0];
v_c.ptr[0] = v_fa0;
v_fa1 += v_c.ptr[1];
diff --git a/std/png/decode_filter_sse128.wuffs b/std/png/decode_filter_sse128.wuffs
index aaabb58..c6f0b5e 100644
--- a/std/png/decode_filter_sse128.wuffs
+++ b/std/png/decode_filter_sse128.wuffs
@@ -19,13 +19,19 @@
pri func decoder.filter_1_distance_4_sse128!(curr: slice base.u8),
choose cpu_arch >= sse128,
{
- var c : slice base.u8
- var fa0 : base.u8
- var fa1 : base.u8
- var fa2 : base.u8
- var fa3 : base.u8
+ var c : slice base.u8
+ var fa0 : base.u8
+ var fa1 : base.u8
+ var fa2 : base.u8
+ var fa3 : base.u8
+ var x32 : base.u32
+ var x128 : base.sse128_i
iterate (c = args.curr)(length: 4, advance: 4, unroll: 1) {
+ x128.load_u32!(a: x32)
+ x32 = x128.truncate_u32()
+ if x32 == 0 {
+ }
fa0 ~mod+= c[0]
c[0] = fa0
fa1 ~mod+= c[1]