bottom out clut recursion at dim=1

This is a modest (~10%) speedup at the cost of a couple hundred extra bytes.

Change-Id: I80ffb0b13c8c6cfc5b03ff2b58d3a28bc876d769
Reviewed-on: https://skia-review.googlesource.com/c/162104
Auto-Submit: Mike Klein <mtklein@google.com>
Commit-Queue: Brian Osman <brianosman@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/src/Transform_inl.h b/src/Transform_inl.h
index 978a157..5baf5ef 100644
--- a/src/Transform_inl.h
+++ b/src/Transform_inl.h
@@ -549,10 +549,7 @@
 template <int kBitDepth>
 MAYBE_NOINLINE
 static void clut(const skcms_A2B* a2b, int dim, I32 ix, I32 stride, F* r, F* g, F* b, F a) {
-    if (dim == 0) {
-        sample_clut<kBitDepth>(a2b,ix, r,g,b);
-        return;
-    }
+    assert (0 < dim && dim <= 4);
 
     I32 limit = cast<I32>(F0);
     limit += a2b->grid_points[dim-1];
@@ -566,8 +563,14 @@
         hi = cast<I32>(minus_1_ulp(x+1.0f));
     F lr = *r, lg = *g, lb = *b,
       hr = *r, hg = *g, hb = *b;
-    clut<kBitDepth>(a2b, dim-1, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);
-    clut<kBitDepth>(a2b, dim-1, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);
+
+    if (dim == 1) {
+        sample_clut<kBitDepth>(a2b, stride*lo + ix, &lr,&lg,&lb);
+        sample_clut<kBitDepth>(a2b, stride*hi + ix, &hr,&hg,&hb);
+    } else {
+        clut<kBitDepth>(a2b, dim-1, stride*lo + ix, stride*limit, &lr,&lg,&lb,a);
+        clut<kBitDepth>(a2b, dim-1, stride*hi + ix, stride*limit, &hr,&hg,&hb,a);
+    }
 
     F t = x - cast<F>(lo);
     *r = lr + (hr-lr)*t;