Plumb through the out_row byte length so we can assert that we stay within it.

Sadly, the new assert is not firing for me yet, but adding it can't hurt.

BUG=chromium:491660

Review URL: https://codereview.chromium.org/1187173005
diff --git a/src/core/SkConvolver.cpp b/src/core/SkConvolver.cpp
index 49688db..bf00956 100644
--- a/src/core/SkConvolver.cpp
+++ b/src/core/SkConvolver.cpp
@@ -436,7 +436,7 @@
                     src[i] = &sourceData[(uint64_t)(nextXRow + i) * sourceByteRowStride];
                     outRow[i] = rowBuffer.advanceRow();
                 }
-                convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow);
+                convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow, 4*rowBufferWidth);
                 nextXRow += 4;
             } else {
                 // Check if we need to avoid SSE2 for this row.
diff --git a/src/core/SkConvolver.h b/src/core/SkConvolver.h
index 8e53da2..21419a9 100644
--- a/src/core/SkConvolver.h
+++ b/src/core/SkConvolver.h
@@ -153,7 +153,8 @@
 typedef void (*SkConvolve4RowsHorizontally_pointer)(
     const unsigned char* srcData[4],
     const SkConvolutionFilter1D& filter,
-    unsigned char* outRow[4]);
+    unsigned char* outRow[4],
+    size_t outRowBytes);
 typedef void (*SkConvolveHorizontally_pointer)(
     const unsigned char* srcData,
     const SkConvolutionFilter1D& filter,
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.cpp b/src/opts/SkBitmapFilter_opts_SSE2.cpp
index de3dd3b..ecaad23 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.cpp
+++ b/src/opts/SkBitmapFilter_opts_SSE2.cpp
@@ -174,7 +174,10 @@
 // refer to that function for detailed comments.
 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                     const SkConvolutionFilter1D& filter,
-                                    unsigned char* out_row[4]) {
+                                    unsigned char* out_row[4],
+                                    size_t outRowBytes) {
+    SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];)
+
     int num_values = filter.numValues();
 
     int filter_offset, filter_length;
@@ -275,6 +278,9 @@
         accum3 = _mm_packs_epi32(accum3, zero);
         accum3 = _mm_packus_epi16(accum3, zero);
 
+        // We seem to be running off the edge here (chromium:491660).
+        SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes);
+
         *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
         *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
         *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
diff --git a/src/opts/SkBitmapFilter_opts_SSE2.h b/src/opts/SkBitmapFilter_opts_SSE2.h
index 115c846..46ab5c8 100644
--- a/src/opts/SkBitmapFilter_opts_SSE2.h
+++ b/src/opts/SkBitmapFilter_opts_SSE2.h
@@ -19,7 +19,8 @@
                              bool has_alpha);
 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],
                                     const SkConvolutionFilter1D& filter,
-                                    unsigned char* out_row[4]);
+                                    unsigned char* out_row[4],
+                                    size_t outRowBytes);
 void convolveHorizontally_SSE2(const unsigned char* src_data,
                                const SkConvolutionFilter1D& filter,
                                unsigned char* out_row,
diff --git a/src/opts/SkBitmapProcState_arm_neon.cpp b/src/opts/SkBitmapProcState_arm_neon.cpp
index 08b83ea..d8a17d8 100644
--- a/src/opts/SkBitmapProcState_arm_neon.cpp
+++ b/src/opts/SkBitmapProcState_arm_neon.cpp
@@ -389,7 +389,8 @@
 // refer to that function for detailed comments.
 void convolve4RowsHorizontally_neon(const unsigned char* srcData[4],
                                     const SkConvolutionFilter1D& filter,
-                                    unsigned char* outRow[4]) {
+                                    unsigned char* outRow[4],
+                                    size_t outRowBytes) {
 
     uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
     uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);