renderer/src/intersection_board.cpp - external/github.com/rive-app/rive-cpp - Git at Google

 /*
  * Copyright 2024 Rive
  */

 #include "intersection_board.hpp"

 #include "rive/math/math_types.hpp"

 #if !SIMD_NATIVE_GVEC &&                                                       \
     (defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64))
 // MSVC doesn't get codegen for the inner loop. Provide direct SSE intrinsics.
 #include <emmintrin.h>
 #define FALLBACK_ON_SSE2_INTRINSICS
 #else
 #endif

 namespace rive::gpu
 {
 void IntersectionTile::reset(int left,
                              int top,
                              int16_t baselineGroupIndex,
                              uint16_t baselineOverlapBits)
 {
     // Since we mask non-intersecting groupIndices to zero, the "mask and max"
     // algorithm is only correct for positive values. (baselineGroupIndex is
     // only signed because SSE doesn't have an unsigned max instruction.)
     assert(baselineGroupIndex >= 0);
     m_topLeft = {left, top, left, top};
     m_baselineGroupIndex = baselineGroupIndex;
     m_maxGroupIndex = baselineGroupIndex;
     m_baselineOverlapBits = baselineOverlapBits;
     m_overlapBitsForMaxGroup = baselineOverlapBits;
     m_edges.clear();
     m_groupIndices.clear();
     m_overlapBits.clear();
     m_rectangleCount = 0;
 }

 template <GroupingType Type>
 void IntersectionTile::addRectangle(int4 ltrb,
                                     int16_t groupIndex,
                                     uint16_t currentRectangleOverlapBits)
 {
     assert(simd::all(ltrb.xy < ltrb.zw)); // Ensure ltrb isn't zero or negative.
     assert(groupIndex >= 0);

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         if (m_overlapBits.size() != m_groupIndices.size())
         {
             // We did not previously have any overlap bits and now we're about
             // to - initialize the array with zeros (everything already in the
             // intersection tile has zeros as its overlap bits, or else the
             // boardContainsNonzeroOverlapBits flag would have already been
             // set).
             assert(m_overlapBits.empty());
             m_overlapBits.resize(m_groupIndices.size());
         }
     }
     else
     {
         static_assert(Type == GroupingType::disjoint);

         // Shouldn't pass any overlap bits in if we indicated they weren't
         // relevant
         assert(currentRectangleOverlapBits == 0);
     }

 #if !defined(NDEBUG)
     // Ensure this rectangle preserves the integrity of our list.
     {
         auto r = findMaxIntersectingGroupIndex<Type>(ltrb, {});
         auto maxGroup = simd::reduce_max(r.maxGroupIndices);
         if constexpr (Type == GroupingType::overlapAllowed)
         {
             // If overlapping is allowed, this rectangle can be in the same
             // group as another it overlaps, but cannot be below it.
             assert(groupIndex >= maxGroup);
             assert(groupIndex >= m_baselineGroupIndex);
         }
         else
         {
             // If no overlapping is allowed, this rectangle cannot be at or
             // below a group it overlaps.
             assert(groupIndex > maxGroup);
             assert(groupIndex > m_baselineGroupIndex);
         }
     }
 #endif

     // Translate ltrb to our tile and negate the right and bottom sides.
     ltrb -= m_topLeft;
     // right = TILE_DIM - right, bottom = TILE_DIM- bottom
     ltrb.zw = TILE_DIM - ltrb.zw;
     // Ensure ltrb isn't completely outside the tile.
     assert(simd::all(ltrb < TILE_DIM));
     ltrb = simd::max(ltrb, int4(0)); // Clamp ltrb to our tile.

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         if (groupIndex == m_baselineGroupIndex &&
             (currentRectangleOverlapBits | m_baselineOverlapBits) ==
                 m_baselineOverlapBits)
         {
             // Nothing needs to be done here, this is going into the baseline
             // group and it adds no new overlap bits that are not already in the
             // baseline.
             return;
         }
     }

     if (simd::all(ltrb == 0))
     {
         // ltrb covers the entire tile
         if constexpr (Type == GroupingType::overlapAllowed)
         {
             // This rectangle is allowed to be at or higher than the max group
             // (rather than strictly above it, in the no-overlap case)
             assert(groupIndex >= m_maxGroupIndex);

             if (groupIndex == m_maxGroupIndex)
             {
                 // This rectangle is within the existing max group, so do a
                 // heavier-handed update (some rectangles in the max group
                 // may still need to stay, so we can't do a reset)
                 updateBaselineToMaxGroupIndex<Type>(
                     currentRectangleOverlapBits);
                 return;
             }
         }
         else
         {
             // If no overlapping is allowed, a full-tile rectangle had
             // better be above the current max group.
             assert(groupIndex > m_maxGroupIndex);
         }

         // Setting a new baseline, so reset everything, update the group index
         // and baseline overlap
         reset(m_topLeft.x,
               m_topLeft.y,
               groupIndex,
               currentRectangleOverlapBits);
         return;
     }

     // Append a chunk if needed, to make room for this new rectangle.
     uint32_t subIdx = m_rectangleCount % CHUNK_SIZE;
     if (subIdx == 0)
     {
         // Push back maximally negative rectangles so they always fail
         // intersection tests.
         assert(m_edges.size() * CHUNK_SIZE == m_rectangleCount);
         m_edges.push_back(int8x32(TILE_DIM + TILE_EDGE_BIAS));

         // Uninitialized since the corresponding rectangles never pass an
         // intersection test.
         assert(m_groupIndices.size() * CHUNK_SIZE == m_rectangleCount);
         m_groupIndices.emplace_back();

         if constexpr (Type == GroupingType::overlapAllowed)
         {
             assert(m_overlapBits.size() * CHUNK_SIZE == m_rectangleCount);
             m_overlapBits.emplace_back();
         }
     }

     // m_edges is a list of 8 rectangles encoded as
     // [L, T, TILE_DIM - R, TILE_DIM - B], relative to m_topLeft. The data is
     // also transposed: [L0..L7, T0..T7, -R0..R7, -B0..B7]. Bias ltrb so
     // we can use int8_t. SSE doesn't have an unsigned byte compare.
     int4 biased = ltrb + TILE_EDGE_BIAS;
     m_edges.back()[subIdx] = biased.x;
     m_edges.back()[subIdx + 1 * CHUNK_SIZE] = biased.y;
     m_edges.back()[subIdx + 2 * CHUNK_SIZE] =
         biased.z; // Already converted to "TILE_DIM - right" above.
     m_edges.back()[subIdx + 3 * CHUNK_SIZE] =
         biased.w; // Already converted to "TILE_DIM - bottom" above.

     m_groupIndices.back()[subIdx] = groupIndex;

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         m_overlapBits.back()[subIdx] = currentRectangleOverlapBits;

         if (groupIndex > m_maxGroupIndex)
         {
             // There's a new maximum group, so set it and restart its overlap
             // bits.
             m_maxGroupIndex = groupIndex;
             m_overlapBitsForMaxGroup = currentRectangleOverlapBits;
         }
         else if (groupIndex == m_maxGroupIndex)
         {
             // This is going into the existing max group so add its overlap bits
             // to the max group's tracked overlap bits
             m_overlapBitsForMaxGroup |= currentRectangleOverlapBits;
         }
     }
     else
     {
         // If we aren't tracking overlap bits then simply ensuring the max group
         // index is correct is sufficient.
         m_maxGroupIndex = std::max(m_maxGroupIndex, groupIndex);
     }

     ++m_rectangleCount;
 }

 #ifdef WITH_RIVE_TOOLS
 template <GroupingType Type>
 void IntersectionTile::testingOnly_addRectangleAndValidate(
     int4 ltrb,
     int16_t groupIndex,
     uint16_t currentRectangleOverlapBits)
 {
     addRectangle<Type>(ltrb, groupIndex, currentRectangleOverlapBits);

     // Baseline should not be higher than the inserted rectangle,
     // max group should not be lower.
     assert(m_baselineGroupIndex <= groupIndex);
     assert(m_maxGroupIndex >= groupIndex);

     // Should be no baseline overlap bits if there's no baseline
     // (ditto for max group)
     assert(m_baselineGroupIndex != 0 || m_baselineOverlapBits == 0);
     assert(m_maxGroupIndex != 0 || m_overlapBitsForMaxGroup == 0);

     uint16_t foundOverlapBitsForMaxGroup;

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         foundOverlapBitsForMaxGroup = (m_baselineGroupIndex == m_maxGroupIndex)
                                           ? m_baselineOverlapBits
                                           : 0u;
     }
     else
     {
         // Ignore this value (not using std::ignore here because it
         // would require initializing the value first, which would
         // mean the compiler won't catch a mistake where this gets
         // used elsewhere when it shouldn't be)
         (void)foundOverlapBitsForMaxGroup;

         // If overlap isn't allowed this should never be allocated
         assert(m_overlapBits.empty());
     }

     // Exhaustively check to ensure that the overlap bits and max
     // group indices match our tracked values.
     auto maxGroupIndexFound = m_baselineGroupIndex;
     for (auto i = 0u; i < m_rectangleCount; i++)
     {
         auto outerI = i / CHUNK_SIZE;
         auto innerI = i % CHUNK_SIZE;

         assert(m_groupIndices[outerI][innerI] >= m_baselineGroupIndex);
         assert(m_groupIndices[outerI][innerI] <= m_maxGroupIndex);
         if constexpr (Type == GroupingType::overlapAllowed)
         {
             if (m_groupIndices[outerI][innerI] == m_maxGroupIndex)
             {
                 foundOverlapBitsForMaxGroup |= m_overlapBits[outerI][innerI];
             }
         }
         maxGroupIndexFound =
             std::max(maxGroupIndexFound, m_groupIndices[outerI][innerI]);
     }

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         assert(foundOverlapBitsForMaxGroup == m_overlapBitsForMaxGroup);
         if (m_baselineGroupIndex == m_maxGroupIndex)
         {
             // max group is allowed to have more set bits than the
             // baseline (as there might be non-full-tile rectangles
             // at that level), but there should never be bits in
             // baseline that aren't part of the max group set if
             // their group indices match.
             assert((m_overlapBitsForMaxGroup | m_baselineOverlapBits) ==
                    m_overlapBitsForMaxGroup);
         }
     }
     else
     {
         assert(m_overlapBitsForMaxGroup == 0);
         assert(m_baselineOverlapBits == 0);
     }

     assert(maxGroupIndexFound == m_maxGroupIndex);
 }
 #endif

 template <GroupingType Type>
 void IntersectionTile::updateBaselineToMaxGroupIndex(
     uint16_t additionalBaselineOverlapBits)
 {
     // This should only be called when overlap testing is enabled. The template
     // argument is really just here to make it hard to accidentally call this
     // from a place where it will be wrong
     static_assert(Type == GroupingType::overlapAllowed);

     // We've added a full-tile rectangle at m_maxGroupIndex, which means we can
     // update the baseline and its overlap bits.
     if ((additionalBaselineOverlapBits | m_overlapBitsForMaxGroup) ==
             additionalBaselineOverlapBits ||
         m_rectangleCount == 0)
     {
         // This rectangle both covers the whole tile and either it contains all
         // of the existing overlap bits in this group, or there are no
         // existing rectangles (the max group is the already just the baseline),
         // which means we can do a simple reset, as we don't need to keep any
         // rectangles - even ones that are in this max group.
         reset(m_topLeft.x,
               m_topLeft.y,
               m_maxGroupIndex,
               m_overlapBitsForMaxGroup | additionalBaselineOverlapBits);
         return;
     }

     m_overlapBitsForMaxGroup |= additionalBaselineOverlapBits;

     if (m_maxGroupIndex == m_baselineGroupIndex)
     {
         if (m_overlapBitsForMaxGroup == m_baselineOverlapBits)
         {
             // This rectangle does not change the baseline or its bits in
             // any way, so we're done early.
             return;
         }

         m_baselineOverlapBits |= additionalBaselineOverlapBits;
     }
     else
     {
         m_baselineGroupIndex = m_maxGroupIndex;
         m_baselineOverlapBits = additionalBaselineOverlapBits;
     }

     // Do a pass through the array and zero out the group indices of any
     // rectangles that should no longer remain in the list (this makes the test
     // in the compaction loop more efficient).
     {
 #if !defined(FALLBACK_ON_SSE2_INTRINSICS)
         auto baselineGroupIndexVector = int16x8(m_baselineGroupIndex);
         auto baselineOverlapBitsVector = uint16x8(m_baselineOverlapBits);
 #else
         auto baselineGroupIndexVector = _mm_set1_epi16(m_baselineGroupIndex);
         auto baselineOverlapBitsVector = _mm_set1_epi16(m_baselineOverlapBits);
 #endif
         for (size_t i = 0; i < m_groupIndices.size(); i++)
         {
 #if !defined(FALLBACK_ON_SSE2_INTRINSICS)
             // Groups under the baseline will go away
             auto mask = m_groupIndices[i] >= baselineGroupIndexVector;

             // Of those that remained, only keep the ones where the overlap bits
             // have bits that aren't in the baseline's bits.
             mask &= ((baselineOverlapBitsVector | m_overlapBits[i]) !=
                      baselineOverlapBitsVector);

             m_groupIndices[i] &= mask;
 #else
             // The SSE2 code has to build the mask backwards from the above
             // code, as there is neither a ">=" or "!=" operator.
             auto groupIndices = math::bit_cast<__m128i>(m_groupIndices[i]);
             auto overlapBits = math::bit_cast<__m128i>(m_overlapBits[i]);
             auto mask = _mm_cmpgt_epi16(baselineGroupIndexVector, groupIndices);

             mask = _mm_or_si128(
                 mask,
                 _mm_cmpeq_epi16(
                     baselineOverlapBitsVector,
                     _mm_or_si128(baselineOverlapBitsVector, overlapBits)));

             // Conveniently, there is a (~mask & groupIndices) single operation
             // to allow us to use the inverted mask directly.
             groupIndices = _mm_andnot_si128(mask, groupIndices);
             m_groupIndices[i] = math::bit_cast<int16x8>(groupIndices);
 #endif
         }
     }

     // Now, remove all of the rectangles that are no longer above the line.
     // (Thankfully, ordering doesn't matter, so we don't need the partitioning
     // to be stable)

     // Start by scaning the first index forward until we find a rectangle that
     // doesn't need to stay (or we hit the end)
     size_t firstIdx = 0u;
     while (m_groupIndices[firstIdx / CHUNK_SIZE][firstIdx % CHUNK_SIZE] != 0)
     {
         firstIdx++;
         if (firstIdx == m_rectangleCount)
         {
             // Every rectangle stays! We don't have to do any additional
             // work.
             return;
         }
     }

     size_t lastIdx = m_rectangleCount - 1;
     while (lastIdx > firstIdx &&
            m_groupIndices[lastIdx / CHUNK_SIZE][lastIdx % CHUNK_SIZE] == 0)
     {
         --lastIdx;
     }

     while (firstIdx < lastIdx)
     {
         // Now first index points to something that should go and last index
         // points to something that should stay, and they're out of order. Move
         // the latter element to the beginning to put it into place.
         {
             size_t firstOuterIdx = firstIdx / CHUNK_SIZE;
             size_t firstInnerIdx = firstIdx % CHUNK_SIZE;
             size_t lastOuterIdx = lastIdx / CHUNK_SIZE;
             size_t lastInnerIdx = lastIdx % CHUNK_SIZE;

             // Put the one that wants to stay at the first index.

             m_edges[firstOuterIdx][firstInnerIdx + 0] =
                 m_edges[lastOuterIdx][lastInnerIdx + 0];
             m_edges[firstOuterIdx][firstInnerIdx + CHUNK_SIZE] =
                 m_edges[lastOuterIdx][lastInnerIdx + CHUNK_SIZE];
             m_edges[firstOuterIdx][firstInnerIdx + 2 * CHUNK_SIZE] =
                 m_edges[lastOuterIdx][lastInnerIdx + 2 * CHUNK_SIZE];
             m_edges[firstOuterIdx][firstInnerIdx + 3 * CHUNK_SIZE] =
                 m_edges[lastOuterIdx][lastInnerIdx + 3 * CHUNK_SIZE];

             m_groupIndices[firstOuterIdx][firstInnerIdx] =
                 m_groupIndices[lastOuterIdx][lastInnerIdx];
             m_overlapBits[firstOuterIdx][firstInnerIdx] =
                 m_overlapBits[lastOuterIdx][lastInnerIdx];

             // Update the group index at the last index to be a 0, which now
             // marks it as a "this doesn't need to stay" rectangle. This allows
             // it to act as a sentinel value for the loop where we increment
             // firstRectangleIndex so it can avoid an extra bounds check.
             m_groupIndices[lastOuterIdx][lastInnerIdx] = 0;
         }

         // Because of that zero, we're now guaranteed that there
         // is a rectangle that should be removed to the right of the first index
         // and one that should stay to the left of the last index (minimally,
         // the ones we just adjusted), so these inner loops do not need to
         // bounds check.
         do
         {
             ++firstIdx;
             assert(firstIdx < m_rectangleCount);
         } while (m_groupIndices[firstIdx / CHUNK_SIZE][firstIdx % CHUNK_SIZE] !=
                  0);

         do
         {
             --lastIdx;
             // NOTE: This assert seems backwards, but it's an unsigned variable
             // and will wrap past zero, and thus go *very* positive.
             assert(lastIdx < m_rectangleCount);
         } while (m_groupIndices[lastIdx / CHUNK_SIZE][lastIdx % CHUNK_SIZE] ==
                  0);
     }

     // firstRectangleIndex now points to the first rectangle that needs to be
     // removed, which means it is also our new rectangle count.
     m_rectangleCount = firstIdx;
     auto vectorElementCount = (m_rectangleCount + CHUNK_SIZE - 1) / CHUNK_SIZE;
     m_edges.resize(vectorElementCount);
     m_groupIndices.resize(vectorElementCount);
     m_overlapBits.resize(vectorElementCount);

     if (vectorElementCount > 0 && (m_rectangleCount % CHUNK_SIZE) != 0)
     {
         auto& lastEdges = m_edges.back();
         auto& lastGroupIndices = m_groupIndices.back();
         auto& lastOverlapBits = m_overlapBits.back();

         constexpr auto DEFAULT_EDGE_EXTENT = TILE_DIM + TILE_EDGE_BIAS;

         // We have a chunk with potentially some boxes that had entries that
         // shouldn't anymore, so clear them.
         for (auto i = (m_rectangleCount % CHUNK_SIZE); i < CHUNK_SIZE; i++)
         {
             lastEdges[i + 0 * CHUNK_SIZE] = DEFAULT_EDGE_EXTENT;
             lastEdges[i + 1 * CHUNK_SIZE] = DEFAULT_EDGE_EXTENT;
             lastEdges[i + 2 * CHUNK_SIZE] = DEFAULT_EDGE_EXTENT;
             lastEdges[i + 3 * CHUNK_SIZE] = DEFAULT_EDGE_EXTENT;
             lastGroupIndices[i] = 0;
             lastOverlapBits[i] = 0;
         }
     }
 }

 template <GroupingType Type>
 IntersectionTile::FindResult IntersectionTile::findMaxIntersectingGroupIndex(
     int4 ltrb,
     FindResult running) const
 {
     assert(simd::all(ltrb.xy < ltrb.zw)); // Ensure ltrb isn't zero or negative.

     // Since we mask non-intersecting groupIndices to zero, the "mask and max"
     // algorithm is only correct for positive values. (running.maxGroupIndices
     // is only signed because SSE pre 4.1 doesn't have an unsigned max
     // instruction.)
     assert(simd::all(running.maxGroupIndices >= 0));
     assert(m_baselineGroupIndex >= 0);
     assert(m_maxGroupIndex >= m_baselineGroupIndex);

     // Translate ltrb to our tile and negate the left and top sides.
     ltrb -= m_topLeft;
     // left = TILE_DIM - left, top = TILE_DIM - top
     ltrb.xy = TILE_DIM - ltrb.xy;
     assert(
         simd::all(ltrb > 0)); // Ensure ltrb isn't completely outside the tile.
     ltrb = simd::min(ltrb, int4(TILE_DIM)); // Clamp ltrb to our tile.

     if (simd::all(ltrb == TILE_DIM))
     {
         // ltrb covers the tile -- we know it intersects with every rectangle.
         if constexpr (Type == GroupingType::overlapAllowed)
         {
             auto currentMax = running.maxGroupIndices[0];
             if (currentMax < m_maxGroupIndex)
             {
                 // The first element's max group index is changing, so restart
                 // its overlap bits with the new set.
                 running.maxGroupIndices[0] = m_maxGroupIndex;
                 running.overlapBits[0] = m_overlapBitsForMaxGroup;
             }
             else if (currentMax == m_maxGroupIndex)
             {
                 // The group index of this element is the same, so just add in
                 // any new overlap bits that might exist.
                 running.overlapBits[0] |= m_overlapBitsForMaxGroup;
             }
         }
         else
         {
             static_assert(Type == GroupingType::disjoint);

             running.maxGroupIndices[0] =
                 std::max(running.maxGroupIndices[0], m_maxGroupIndex);
         }

         return running;
     }

     // Intersection test: l0 < r1 &&
     //                    t0 < b1 &&
     //                    r0 > l1 &&
     //                    b0 > t1
     //
     // Or, to make them all use the same operator: +l0 < +r1 &&
     //                                             +t0 < +b1 &&
     //                                             -r0 < -l1 &&
     //                                             -b0 < -t1
     //
     // m_edges are already encoded like the left column, so encode "ltrb" like
     // the right.
     //
     // Bias ltrb so we can use int8_t. SSE (pre-4.1) doesn't have an unsigned
     // byte compare.
     int4 biased = ltrb + TILE_EDGE_BIAS;
     int8x8 r = biased.z;
     int8x8 b = biased.w;
     int8x8 _l = biased.x; // Already converted to "TILE_DIM - left" above.
     int8x8 _t = biased.y; // Already converted to "TILE_DIM - top" above.

     assert(m_edges.size() == m_groupIndices.size());
     if constexpr (Type == GroupingType::overlapAllowed)
     {
         assert(m_edges.size() == m_overlapBits.size());
     }

 #if !defined(FALLBACK_ON_SSE2_INTRINSICS)
     auto edges = m_edges.begin();
     auto groupIndices = m_groupIndices.begin();
     const uint16x8* groupOverlapBits = nullptr;

     // We only care about groupOverlapBits if this rectangle overlap testing is
     // enabled *and* there are non-zero overlap bits in the mix
     if constexpr (Type == GroupingType::overlapAllowed)
     {
         groupOverlapBits = m_overlapBits.data();
     }
     PUSH_DISABLE_CLANG_SIMD_ABI_WARNING()
     int8x32 complement = simd::join(r, b, _l, _t);
     for (; edges != m_edges.end(); ++edges, ++groupIndices)
     {
         // Test 32 edges!
         auto edgeMasks = *edges < complement;
         // Since the transposed L,T,R,B rows are a each 64-bit vectors,
         // "and-reducing" them returns the intersection test (l0 < r1 && t0 < b1
         // && r0 > l1 && b0 > t1) in each byte.
         int64_t isectMask =
             simd::reduce_and(math::bit_cast<int64x4>(edgeMasks));
         // Each element of isectMasks8 is 0xff if we intersect with the
         // corresponding rectangle, otherwise 0.
         int8x8 isectMasks8 = math::bit_cast<int8x8>(isectMask);
         // Widen isectMasks8 to 16 bits per mask, where each element of
         // isectMasks16 is 0xffff if we intersect with the rectangle, otherwise
         // 0.
         int16x8 isectMasks16 =
             math::bit_cast<int16x8>(simd::zip(isectMasks8, isectMasks8));
         // Mask out any groupIndices we don't intersect with so they don't
         // participate in the test for maximum groupIndex.
         int16x8 maskedGroupIndices = isectMasks16 & *groupIndices;

         // Doing this as a compile-time check to not have to do the test in the
         // loop at runtime.
         if constexpr (Type == GroupingType::overlapAllowed)
         {
             // Clear any bits from the running overlap where the new index is
             // covering it (i.e. there's a new closest group index)
             auto keepRunningBits =
                 (maskedGroupIndices <= running.maxGroupIndices);
             running.overlapBits &= keepRunningBits;

             // Keep any bits from the current group where the running index is
             // not closer (i.e. where current is not covered).
             // Note: this will additionally keep bits where there has not yet
             // been an intersection (index is 0), and the current rectangle was
             // not intersected. This is fine - these bits will get cleared by
             // the above code when a rectangle intersection finally *does*
             // happen, so we can save on an additional mask with isectMask16.
             auto keepCurrentBits =
                 (running.maxGroupIndices <= maskedGroupIndices);
             auto maskedOverlapBits = *groupOverlapBits & keepCurrentBits;

             // Combine the two sets, ultimately this will add bits from
             // overlapping rectangles that are in the new-highest draw group and
             // clear any bits that are not.
             running.overlapBits |= maskedOverlapBits;

             ++groupOverlapBits;
         }

         running.maxGroupIndices =
             simd::max(maskedGroupIndices, running.maxGroupIndices);
     }
     POP_DISABLE_CLANG_SIMD_ABI_WARNING()

 #else
     // MSVC doesn't get good codegen for the above loop. Provide direct SSE
     // intrinsics.
     const __m128i* edgeData = reinterpret_cast<const __m128i*>(m_edges.data());
     const __m128i* groupIndices =
         reinterpret_cast<const __m128i*>(m_groupIndices.data());
     const __m128i* groupOverlapBits = nullptr;
     if constexpr (Type == GroupingType::overlapAllowed)
     {
         groupOverlapBits =
             reinterpret_cast<const __m128i*>(m_overlapBits.data());
     }
     __m128i complementLO = math::bit_cast<__m128i>(simd::join(r, b));
     __m128i complementHI = math::bit_cast<__m128i>(simd::join(_l, _t));
     __m128i localMaxGroupIndices =
         math::bit_cast<__m128i>(running.maxGroupIndices);
     auto localOverlapBits = math::bit_cast<__m128i>(running.overlapBits);
     for (size_t i = 0; i < m_groupIndices.size(); ++i)
     {
         __m128i edgesLO = edgeData[i * 2];
         __m128i edgesHI = edgeData[i * 2 + 1];
         // Test 32 edges!
         __m128i edgeMasksLO = _mm_cmpgt_epi8(complementLO, edgesLO);
         __m128i edgeMasksHI = _mm_cmpgt_epi8(complementHI, edgesHI);
         // AND L & R masks (bits 0:63) and T & B masks (bits 63:127).
         __m128i partialIsectMasks = _mm_and_si128(edgeMasksLO, edgeMasksHI);
         // Widen partial edge masks from 8 bits to 16.
         __m128i partialIsectMasksTB16 =
             _mm_unpackhi_epi8(partialIsectMasks, partialIsectMasks);
         __m128i partialIsectMasksLR16 =
             _mm_unpacklo_epi8(partialIsectMasks, partialIsectMasks);
         // AND LR masks with TB masks for a full LTRB intersection mask.
         __m128i isectMasks16 =
             _mm_and_si128(partialIsectMasksLR16, partialIsectMasksTB16);
         // Mask out the groupIndices that don't intersect.
         __m128i maskedGroupIndices =
             _mm_and_si128(isectMasks16, groupIndices[i]);

         // Doing this as a compile-time check to not have to do the test in the
         // loop at runtime.
         if constexpr (Type == GroupingType::overlapAllowed)
         {
             // Clear any bits from the running overlap where the new index is
             // covering it (i.e. there's a new closest group index)
             // NOTE: that masks and the "and"s are reversed from the non-SSE2
             // code above: SSE2 does not have integral >= or <= operations, so
             // the formulation here uses "!(a > b)" as equivalent to (a <= b),
             // thankfully using no additional instructions thanks to the
             // existence of "andnot" instructions.
             auto clearRunningBits =
                 _mm_cmpgt_epi16(maskedGroupIndices, localMaxGroupIndices);

             // andnot is (~a & b) so clearRunningBits needs to come first as
             // it's what we intend to negate
             localOverlapBits =
                 _mm_andnot_si128(clearRunningBits, localOverlapBits);

             // Keep any bits from the current group where the running index is
             // not closer (i.e. where current is not covered).
             // Note: this will additionally keep bits where there has not yet
             // been an intersection (index is 0), and the current rectangle was
             // not intersected. This is fine - these bits will get cleared by
             // the above code when a rectangle intersection finally *does*
             // happen, so we can save on an additional mask with isectMask16.
             auto clearCurrentBits =
                 _mm_cmpgt_epi16(localMaxGroupIndices, maskedGroupIndices);
             auto maskedOverlapBits =
                 _mm_andnot_si128(clearCurrentBits, groupOverlapBits[i]);

             // Combine the two sets, ultimately this will add bits from
             // overlapping rectangles that are in the new-highest draw group and
             // clear any bits that are not.
             localOverlapBits =
                 _mm_or_si128(localOverlapBits, maskedOverlapBits);
         }

         // Accumulate max intersecting groupIndices.
         localMaxGroupIndices =
             _mm_max_epi16(maskedGroupIndices, localMaxGroupIndices);
     }
     running.maxGroupIndices = math::bit_cast<int16x8>(localMaxGroupIndices);

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         running.overlapBits = math::bit_cast<uint16x8>(localOverlapBits);
     }

 #endif // !FALLBACK_ON_SSE2_INTRINSICS

     // Ensure we never drop below our baseline index.
     if constexpr (Type == GroupingType::overlapAllowed)
     {
         if (running.maxGroupIndices[0] < m_baselineGroupIndex)
         {
             // We're bumping this entry up so the overlap bits need to
             // be cleared.
             running.maxGroupIndices[0] = m_baselineGroupIndex;
             running.overlapBits[0] = m_baselineOverlapBits;
         }
         else if (running.maxGroupIndices[0] == m_baselineGroupIndex)
         {
             running.overlapBits[0] |= m_baselineOverlapBits;
         }
     }
     else
     {
         running.maxGroupIndices[0] =
             std::max(running.maxGroupIndices[0], m_baselineGroupIndex);
     }
     return running;
 }

 void IntersectionBoard::resizeAndReset(uint32_t viewportWidth,
                                        uint32_t viewportHeight)
 {
     m_viewportSize =
         int2{static_cast<int>(viewportWidth), static_cast<int>(viewportHeight)};

     // Divide the board into TILE_DIM x TILE_DIM tiles.
     int2 dims = (m_viewportSize + TILE_DIM - 1) / TILE_DIM;
     m_cols = dims.x;
     m_rows = dims.y;
     if (m_tiles.size() < m_cols * m_rows)
     {
         m_tiles.resize(m_cols * m_rows);
     }
     auto tileIter = m_tiles.begin();
     for (int y = 0; y < m_rows; ++y)
     {
         for (int x = 0; x < m_cols; ++x)
         {
             tileIter->reset(x * TILE_DIM, y * TILE_DIM);
             ++tileIter;
         }
     }
 }

 int16_t IntersectionBoard::addRectangle(int4 ltrb,
                                         uint16_t currentRectangleOverlapBits,
                                         uint16_t disallowOverlapMask,
                                         int16_t layerCount)
 {
     switch (m_groupingType)
     {
         case GroupingType::overlapAllowed:
             return addRectangle<GroupingType::overlapAllowed>(
                 ltrb,
                 currentRectangleOverlapBits,
                 disallowOverlapMask,
                 layerCount);
         case GroupingType::disjoint:
             return addRectangle<GroupingType::disjoint>(
                 ltrb,
                 currentRectangleOverlapBits,
                 disallowOverlapMask,
                 layerCount);
     }

     RIVE_UNREACHABLE();
 }

 template <GroupingType Type>
 int16_t IntersectionBoard::addRectangle(int4 ltrb,
                                         uint16_t currentRectangleOverlapBits,
                                         uint16_t disallowedOverlapBitsMask,
                                         int16_t layerCount)
 {
     if constexpr (Type == GroupingType::overlapAllowed)
     {
         assert(
             layerCount == 1 &&
             "rectangles that are allowed to overlap should only be a single layer");
     }

     // Discard empty, negative, or offscreen rectangles.
     if (simd::any(ltrb.xy >= m_viewportSize || ltrb.zw <= 0 ||
                   ltrb.xy >= ltrb.zw))
     {
         return 0;
     }

     // Clamp ltrb to the viewport to avoid integer overflows.
     ltrb.xy = simd::max(ltrb.xy, int2(0));
     ltrb.zw = simd::min(ltrb.zw, m_viewportSize);

     // Find the tiled row and column that each corner of the rectangle falls on.
     int4 span = (ltrb - int4{0, 0, 1, 1}) / TILE_DIM;
     span = simd::clamp(span, int4(0), int4{m_cols, m_rows, m_cols, m_rows} - 1);
     assert(simd::all(span.xy <= span.zw));

     // Accumulate the max groupIndex from each tile the rectangle touches.
     IntersectionTile::FindResult results;
     for (int y = span.y; y <= span.w; ++y)
     {
         auto tileIter = m_tiles.begin() + y * m_cols + span.x;
         for (int x = span.x; x <= span.z; ++x)
         {
             results =
                 tileIter->findMaxIntersectingGroupIndex<Type>(ltrb, results);
             ++tileIter;
         }
     }

     // Find the absolute max group index this rectangle intersects with.
     int16_t bottomGroupIndex = simd::reduce_max(results.maxGroupIndices);
     // It is the caller's responsibility to not insert more rectangles than can
     // fit in a signed 16-bit integer.
     assert(bottomGroupIndex <=
            std::numeric_limits<int16_t>::max() - layerCount);

     if constexpr (Type == GroupingType::overlapAllowed)
     {
         // Clear bits of anything that wasn't in our max draw group, they don't
         // matter (there's already a barrier between them)
         auto overlapMask = (results.maxGroupIndices == bottomGroupIndex);
         results.overlapBits &= overlapMask;

         // Now doing this reduce will give just the bits related to things we
         // overlapped.
         int16_t allOverlapBits = simd::reduce_or(results.overlapBits);
         if ((allOverlapBits & disallowedOverlapBitsMask) != 0)
         {
             // Bits were set in any overlapping rectangle that prevent an
             // overlap here, so we cannot stay in the same group.
             bottomGroupIndex++;
         }
         else
         {
             bottomGroupIndex = std::max(bottomGroupIndex, int16_t(1));
         }
     }
     else
     {
         // With no overlapping allowed, always start in the next group up from
         // the lowest one we intersected (or 1, if no intersection occurred)
         bottomGroupIndex++;
     }

     // Add the rectangle and its newly-found groupIndex to each tile it touches.
     int16_t topGroupIndex = bottomGroupIndex + layerCount - 1;
     for (int y = span.y; y <= span.w; ++y)
     {
         auto tileIter = m_tiles.begin() + y * m_cols + span.x;
         for (int x = span.x; x <= span.z; ++x)
         {
             tileIter->addRectangle<Type>(ltrb,
                                          topGroupIndex,
                                          currentRectangleOverlapBits);
             ++tileIter;
         }
     }

     return bottomGroupIndex;
 }

 // Explicitly instantiate the public template methods to ensure they get linked

 template IntersectionTile::FindResult IntersectionTile::
     findMaxIntersectingGroupIndex<GroupingType::disjoint>(
         int4 ltrb,
         FindResult running) const;
 template IntersectionTile::FindResult IntersectionTile::
     findMaxIntersectingGroupIndex<GroupingType::overlapAllowed>(
         int4 ltrb,
         FindResult running) const;

 template void IntersectionTile::addRectangle<GroupingType::disjoint>(
     int4 ltrb,
     int16_t groupIndex,
     uint16_t overlapBits);
 template void IntersectionTile::addRectangle<GroupingType::overlapAllowed>(
     int4 ltrb,
     int16_t groupIndex,
     uint16_t overlapBits);

 #ifdef WITH_RIVE_TOOLS
 template void IntersectionTile::testingOnly_addRectangleAndValidate<
     GroupingType::disjoint>(int4 ltrb,
                             int16_t groupIndex,
                             uint16_t overlapBits);
 template void IntersectionTile::testingOnly_addRectangleAndValidate<
     GroupingType::overlapAllowed>(int4 ltrb,
                                   int16_t groupIndex,
                                   uint16_t overlapBits);
 #endif
 } // namespace rive::gpu