// blob: deb2d94b7d3c1a6e5a220f7c06dd37a46daa986e [file] [log] [blame]
//
// Copyright 2016 Google Inc.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
#ifdef __cplusplus
extern "C"
{
#endif
#include "hs_cuda.h"
#ifdef __cplusplus
}
#endif
#include "hs_cuda_config.h"
#include "../hs_cuda_macros.h"
//
//
//
// Kernel declared through HS_OFFSET_BS_KERNEL_PROTO(1, 0). The macro itself is
// not defined in this file.
// NOTE(review): the two arguments look like "slabs per block" and its log2
// (1, 0 here) -- confirm against the macro definition.
//
// Visible structure of the body:
//   1. load 16 keys into r1..r16 with HS_SLAB_GLOBAL_LOAD(vin, 0..15)
//   2. a run of 60 HS_CMP_XCHG steps over those 16 registers
//   3. five rounds, each made of
//        - one HS_CMP_FLIP stage (preamble argument 1, 3, 7, 15, 31)
//        - zero or more HS_CMP_HALF stages (preamble argument halving down to 1)
//        - a fixed run of 32 HS_CMP_XCHG steps
//   4. store r1..r16 with HS_SLAB_GLOBAL_STORE(0..15, ...)
// This kernel declares no block-local memory and uses no block barrier.
// The statement order is significant; do not reorder.
HS_OFFSET_BS_KERNEL_PROTO(1, 0)
{
// Sets up whatever state the global load/store macros below rely on
// (defined outside this file).
HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
// Load the 16 keys handled by this thread, one per index 0..15.
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
// 60 compare-exchange steps over r1..r16.
// NOTE(review): this looks like a 16-input sorting network applied to the
// thread's own registers -- confirm against the generator.
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r4, r13);
HS_CMP_XCHG(r14, r15);
HS_CMP_XCHG(r8, r12);
HS_CMP_XCHG(r2, r3);
HS_CMP_XCHG(r5, r9);
HS_CMP_XCHG(r2, r5);
HS_CMP_XCHG(r8, r14);
HS_CMP_XCHG(r3, r9);
HS_CMP_XCHG(r12, r15);
HS_CMP_XCHG(r3, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r14);
HS_CMP_XCHG(r4, r9);
HS_CMP_XCHG(r8, r13);
HS_CMP_XCHG(r7, r9);
HS_CMP_XCHG(r11, r13);
HS_CMP_XCHG(r4, r6);
HS_CMP_XCHG(r8, r10);
HS_CMP_XCHG(r4, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r13);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
// Round 1: flip stage with preamble argument 1. Each HS_CMP_FLIP pairs
// register k with register 17-k (r1/r16, r2/r15, ...).
// NOTE(review): presumably a cross-lane exchange within the slab -- confirm
// against HS_SLAB_FLIP_PREAMBLE / HS_CMP_FLIP.
{
HS_SLAB_FLIP_PREAMBLE(1);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
// Fixed 32-step compare-exchange run over r1..r16 (register distances
// 8, 4, 2 on the odd registers, then on the even registers, then adjacent
// pairs). The same run closes every round below.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 2: flip stage with preamble argument 3, then one half stage (1).
{
HS_SLAB_FLIP_PREAMBLE(3);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
// Half stage: HS_CMP_HALF is applied to every register r1..r16 in turn.
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 2.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 3: flip stage with preamble argument 7, then half stages 2 and 1.
{
HS_SLAB_FLIP_PREAMBLE(7);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 3.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 4: flip stage with preamble argument 15, then half stages 4, 2, 1.
{
HS_SLAB_FLIP_PREAMBLE(15);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 4.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 5: flip stage with preamble argument 31, then half stages 8, 4, 2, 1.
{
HS_SLAB_FLIP_PREAMBLE(31);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 5.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Write the 16 registers back out, r1..r16 to indices 0..15.
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
// Kernel declared through HS_OFFSET_BS_KERNEL_PROTO(2, 1). The macro itself is
// not defined in this file.
// NOTE(review): the two arguments look like "slabs per block" and its log2
// (2, 1 here) -- confirm against the macro definition.
//
// Visible structure of the body:
//   1. declare block-local memory with HS_BLOCK_LOCAL_MEM_DECL(64, 16)
//   2. load 16 keys into r1..r16 with HS_SLAB_GLOBAL_LOAD(vin, 0..15)
//   3. a run of 60 HS_CMP_XCHG steps, then five flip/half/compare-exchange
//      rounds (flip preamble arguments 1, 3, 7, 15, 31)
//   4. an exchange through block-local memory: write all 16 registers,
//      barrier, eight compare-exchanges on local-memory pairs, barrier,
//      read all 16 registers back
//   5. half stages 16, 8, 4, 2, 1 and a final 32-step HS_CMP_XCHG run
//   6. store r1..r16 with HS_SLAB_GLOBAL_STORE(0..15, ...)
// The statement order, and the position of the two barriers in particular,
// is significant; do not reorder.
HS_OFFSET_BS_KERNEL_PROTO(2, 1)
{
// Block-local scratch storage used by the exchange phase further down.
// NOTE(review): the arguments 64 and 16 are presumably its width and height
// in keys -- confirm against the macro definition.
HS_BLOCK_LOCAL_MEM_DECL(64, 16);
// Sets up whatever state the global load/store macros below rely on
// (defined outside this file).
HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
// Load the 16 keys handled by this thread, one per index 0..15.
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
// 60 compare-exchange steps over r1..r16.
// NOTE(review): this looks like a 16-input sorting network applied to the
// thread's own registers -- confirm against the generator.
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r4, r13);
HS_CMP_XCHG(r14, r15);
HS_CMP_XCHG(r8, r12);
HS_CMP_XCHG(r2, r3);
HS_CMP_XCHG(r5, r9);
HS_CMP_XCHG(r2, r5);
HS_CMP_XCHG(r8, r14);
HS_CMP_XCHG(r3, r9);
HS_CMP_XCHG(r12, r15);
HS_CMP_XCHG(r3, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r14);
HS_CMP_XCHG(r4, r9);
HS_CMP_XCHG(r8, r13);
HS_CMP_XCHG(r7, r9);
HS_CMP_XCHG(r11, r13);
HS_CMP_XCHG(r4, r6);
HS_CMP_XCHG(r8, r10);
HS_CMP_XCHG(r4, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r13);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
// Round 1: flip stage with preamble argument 1. Each HS_CMP_FLIP pairs
// register k with register 17-k (r1/r16, r2/r15, ...).
// NOTE(review): presumably a cross-lane exchange within the slab -- confirm
// against HS_SLAB_FLIP_PREAMBLE / HS_CMP_FLIP.
{
HS_SLAB_FLIP_PREAMBLE(1);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
// Fixed 32-step compare-exchange run over r1..r16 (register distances
// 8, 4, 2 on the odd registers, then on the even registers, then adjacent
// pairs). The same run closes every round below.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 2: flip stage with preamble argument 3, then one half stage (1).
{
HS_SLAB_FLIP_PREAMBLE(3);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
// Half stage: HS_CMP_HALF is applied to every register r1..r16 in turn.
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 2.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 3: flip stage with preamble argument 7, then half stages 2 and 1.
{
HS_SLAB_FLIP_PREAMBLE(7);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 3.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 4: flip stage with preamble argument 15, then half stages 4, 2, 1.
{
HS_SLAB_FLIP_PREAMBLE(15);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 4.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Round 5: flip stage with preamble argument 31, then half stages 8, 4, 2, 1.
{
HS_SLAB_FLIP_PREAMBLE(31);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Closing 32-step compare-exchange run for round 5.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
// Exchange phase through block-local memory.
// NOTE(review): presumably this merges the block's two slabs with each
// other -- confirm against HS_BS_MERGE_H_PREAMBLE and the local-memory macros.
HS_BS_MERGE_H_PREAMBLE(2);
// Write all 16 registers to local memory at offsets
// 2 * HS_SLAB_THREADS * k for k = 0..15. The registers are interleaved from
// both ends: r1, r16, r2, r15, ..., r8, r9.
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15) = r9;
// Barrier between the local-memory writes above and the reads below.
HS_BLOCK_BARRIER();
// Eight compare-exchanges on local-memory pairs. Each one reads an "L"
// element at offset 128 * j and an "R" element at offset 128 * j + 32
// (j = 0..7), compare-exchanges them, and writes both back in place.
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_R(32) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(160);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(128) = r0_1;
HS_SLAB_LOCAL_R(160) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(288);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(256) = r0_1;
HS_SLAB_LOCAL_R(288) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(416);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(384) = r0_1;
HS_SLAB_LOCAL_R(416) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(544);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(512) = r0_1;
HS_SLAB_LOCAL_R(544) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(640);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(672);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(640) = r0_1;
HS_SLAB_LOCAL_R(672) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(768);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(800);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(768) = r0_1;
HS_SLAB_LOCAL_R(800) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(896);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(928);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(896) = r0_1;
HS_SLAB_LOCAL_R(928) = r0_2;
}
}
// Barrier between the local-memory updates above and the reloads below.
HS_BLOCK_BARRIER();
// Reload the 16 registers from the same offsets, in the same interleaved
// register order used for the writes (r1, r16, r2, r15, ...).
r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15);
// Final pass after the exchange: half stages 16, 8, 4, 2, 1 followed by the
// 32-step compare-exchange run. There is no flip stage in this pass.
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
// Write the 16 registers back out, r1..r16 to indices 0..15.
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
HS_OFFSET_BS_KERNEL_PROTO(4, 2)
{
HS_BLOCK_LOCAL_MEM_DECL(128, 16);
HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r4, r13);
HS_CMP_XCHG(r14, r15);
HS_CMP_XCHG(r8, r12);
HS_CMP_XCHG(r2, r3);
HS_CMP_XCHG(r5, r9);
HS_CMP_XCHG(r2, r5);
HS_CMP_XCHG(r8, r14);
HS_CMP_XCHG(r3, r9);
HS_CMP_XCHG(r12, r15);
HS_CMP_XCHG(r3, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r14);
HS_CMP_XCHG(r4, r9);
HS_CMP_XCHG(r8, r13);
HS_CMP_XCHG(r7, r9);
HS_CMP_XCHG(r11, r13);
HS_CMP_XCHG(r4, r6);
HS_CMP_XCHG(r8, r10);
HS_CMP_XCHG(r4, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r13);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
{
HS_SLAB_FLIP_PREAMBLE(1);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(3);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(7);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(15);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(31);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_BS_MERGE_H_PREAMBLE(4);
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_R(32) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(64) = r1_1;
HS_SLAB_LOCAL_R(96) = r1_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(544);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(512) = r0_1;
HS_SLAB_LOCAL_R(544) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(576);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(608);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(576) = r1_1;
HS_SLAB_LOCAL_R(608) = r1_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1024);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(1056);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(1024) = r0_1;
HS_SLAB_LOCAL_R(1056) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(1088);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(1120);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(1088) = r1_1;
HS_SLAB_LOCAL_R(1120) = r1_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1536);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(1568);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(1536) = r0_1;
HS_SLAB_LOCAL_R(1568) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(1600);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(1632);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(1600) = r1_1;
HS_SLAB_LOCAL_R(1632) = r1_2;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_R(64) = r0_3;
HS_SLAB_LOCAL_R(96) = r0_4;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(544);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(576);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(608);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(512) = r0_1;
HS_SLAB_LOCAL_L(544) = r0_2;
HS_SLAB_LOCAL_R(576) = r0_3;
HS_SLAB_LOCAL_R(608) = r0_4;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1024);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(1056);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(1088);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(1120);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(1024) = r0_1;
HS_SLAB_LOCAL_L(1056) = r0_2;
HS_SLAB_LOCAL_R(1088) = r0_3;
HS_SLAB_LOCAL_R(1120) = r0_4;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1536);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(1568);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(1600);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(1632);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(1536) = r0_1;
HS_SLAB_LOCAL_L(1568) = r0_2;
HS_SLAB_LOCAL_R(1600) = r0_3;
HS_SLAB_LOCAL_R(1632) = r0_4;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
HS_OFFSET_BS_KERNEL_PROTO(8, 3)
{
HS_BLOCK_LOCAL_MEM_DECL(256, 16);
HS_OFFSET_SLAB_GLOBAL_PREAMBLE();
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r4, r13);
HS_CMP_XCHG(r14, r15);
HS_CMP_XCHG(r8, r12);
HS_CMP_XCHG(r2, r3);
HS_CMP_XCHG(r5, r9);
HS_CMP_XCHG(r2, r5);
HS_CMP_XCHG(r8, r14);
HS_CMP_XCHG(r3, r9);
HS_CMP_XCHG(r12, r15);
HS_CMP_XCHG(r3, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r14);
HS_CMP_XCHG(r4, r9);
HS_CMP_XCHG(r8, r13);
HS_CMP_XCHG(r7, r9);
HS_CMP_XCHG(r11, r13);
HS_CMP_XCHG(r4, r6);
HS_CMP_XCHG(r8, r10);
HS_CMP_XCHG(r4, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r13);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
{
HS_SLAB_FLIP_PREAMBLE(1);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(3);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(7);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(15);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(31);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_BS_MERGE_H_PREAMBLE(8);
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_R(32) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(64) = r1_1;
HS_SLAB_LOCAL_R(96) = r1_2;
}
{
HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160);
HS_CMP_XCHG(r2_1, r2_2);
HS_SLAB_LOCAL_L(128) = r2_1;
HS_SLAB_LOCAL_R(160) = r2_2;
}
{
HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224);
HS_CMP_XCHG(r3_1, r3_2);
HS_SLAB_LOCAL_L(192) = r3_1;
HS_SLAB_LOCAL_R(224) = r3_2;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(2048);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(2080);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(2048) = r0_1;
HS_SLAB_LOCAL_R(2080) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(2112);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(2144);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(2112) = r1_1;
HS_SLAB_LOCAL_R(2144) = r1_2;
}
{
HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(2176);
HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(2208);
HS_CMP_XCHG(r2_1, r2_2);
HS_SLAB_LOCAL_L(2176) = r2_1;
HS_SLAB_LOCAL_R(2208) = r2_2;
}
{
HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(2240);
HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(2272);
HS_CMP_XCHG(r3_1, r3_2);
HS_SLAB_LOCAL_L(2240) = r3_1;
HS_SLAB_LOCAL_R(2272) = r3_2;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_R(64) = r0_3;
HS_SLAB_LOCAL_R(96) = r0_4;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160);
HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192);
HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224);
HS_CMP_XCHG(r1_2, r1_3);
HS_CMP_XCHG(r1_1, r1_4);
HS_CMP_XCHG(r1_3, r1_4);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(128) = r1_1;
HS_SLAB_LOCAL_L(160) = r1_2;
HS_SLAB_LOCAL_R(192) = r1_3;
HS_SLAB_LOCAL_R(224) = r1_4;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(2048);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(2080);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(2112);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(2144);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(2048) = r0_1;
HS_SLAB_LOCAL_L(2080) = r0_2;
HS_SLAB_LOCAL_R(2112) = r0_3;
HS_SLAB_LOCAL_R(2144) = r0_4;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(2176);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(2208);
HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(2240);
HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(2272);
HS_CMP_XCHG(r1_2, r1_3);
HS_CMP_XCHG(r1_1, r1_4);
HS_CMP_XCHG(r1_3, r1_4);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(2176) = r1_1;
HS_SLAB_LOCAL_L(2208) = r1_2;
HS_SLAB_LOCAL_R(2240) = r1_3;
HS_SLAB_LOCAL_R(2272) = r1_4;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128);
HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160);
HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192);
HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224);
HS_CMP_XCHG(r0_4, r0_5);
HS_CMP_XCHG(r0_3, r0_6);
HS_CMP_XCHG(r0_2, r0_7);
HS_CMP_XCHG(r0_1, r0_8);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_L(64) = r0_3;
HS_SLAB_LOCAL_L(96) = r0_4;
HS_SLAB_LOCAL_R(128) = r0_5;
HS_SLAB_LOCAL_R(160) = r0_6;
HS_SLAB_LOCAL_R(192) = r0_7;
HS_SLAB_LOCAL_R(224) = r0_8;
}
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(2048);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(2080);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(2112);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(2144);
HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(2176);
HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(2208);
HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(2240);
HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(2272);
HS_CMP_XCHG(r0_4, r0_5);
HS_CMP_XCHG(r0_3, r0_6);
HS_CMP_XCHG(r0_2, r0_7);
HS_CMP_XCHG(r0_1, r0_8);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(2048) = r0_1;
HS_SLAB_LOCAL_L(2080) = r0_2;
HS_SLAB_LOCAL_L(2112) = r0_3;
HS_SLAB_LOCAL_L(2144) = r0_4;
HS_SLAB_LOCAL_R(2176) = r0_5;
HS_SLAB_LOCAL_R(2208) = r0_6;
HS_SLAB_LOCAL_R(2240) = r0_7;
HS_SLAB_LOCAL_R(2272) = r0_8;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
HS_BS_KERNEL_PROTO(16, 4)
{
HS_BLOCK_LOCAL_MEM_DECL(512, 16);
HS_SLAB_GLOBAL_PREAMBLE();
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r4, r13);
HS_CMP_XCHG(r14, r15);
HS_CMP_XCHG(r8, r12);
HS_CMP_XCHG(r2, r3);
HS_CMP_XCHG(r5, r9);
HS_CMP_XCHG(r2, r5);
HS_CMP_XCHG(r8, r14);
HS_CMP_XCHG(r3, r9);
HS_CMP_XCHG(r12, r15);
HS_CMP_XCHG(r3, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r14);
HS_CMP_XCHG(r4, r9);
HS_CMP_XCHG(r8, r13);
HS_CMP_XCHG(r7, r9);
HS_CMP_XCHG(r11, r13);
HS_CMP_XCHG(r4, r6);
HS_CMP_XCHG(r8, r10);
HS_CMP_XCHG(r4, r5);
HS_CMP_XCHG(r6, r7);
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r10, r11);
HS_CMP_XCHG(r12, r13);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
{
HS_SLAB_FLIP_PREAMBLE(1);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(3);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(7);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(15);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
{
HS_SLAB_FLIP_PREAMBLE(31);
HS_CMP_FLIP(0, r1, r16);
HS_CMP_FLIP(1, r2, r15);
HS_CMP_FLIP(2, r3, r14);
HS_CMP_FLIP(3, r4, r13);
HS_CMP_FLIP(4, r5, r12);
HS_CMP_FLIP(5, r6, r11);
HS_CMP_FLIP(6, r7, r10);
HS_CMP_FLIP(7, r8, r9);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_BS_MERGE_H_PREAMBLE(16);
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_R(32) = r0_2;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(64) = r1_1;
HS_SLAB_LOCAL_R(96) = r1_2;
}
{
HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128);
HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160);
HS_CMP_XCHG(r2_1, r2_2);
HS_SLAB_LOCAL_L(128) = r2_1;
HS_SLAB_LOCAL_R(160) = r2_2;
}
{
HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192);
HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224);
HS_CMP_XCHG(r3_1, r3_2);
HS_SLAB_LOCAL_L(192) = r3_1;
HS_SLAB_LOCAL_R(224) = r3_2;
}
{
HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(256);
HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(288);
HS_CMP_XCHG(r4_1, r4_2);
HS_SLAB_LOCAL_L(256) = r4_1;
HS_SLAB_LOCAL_R(288) = r4_2;
}
{
HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(320);
HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(352);
HS_CMP_XCHG(r5_1, r5_2);
HS_SLAB_LOCAL_L(320) = r5_1;
HS_SLAB_LOCAL_R(352) = r5_2;
}
{
HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(384);
HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(416);
HS_CMP_XCHG(r6_1, r6_2);
HS_SLAB_LOCAL_L(384) = r6_1;
HS_SLAB_LOCAL_R(416) = r6_2;
}
{
HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(448);
HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(480);
HS_CMP_XCHG(r7_1, r7_2);
HS_SLAB_LOCAL_L(448) = r7_1;
HS_SLAB_LOCAL_R(480) = r7_2;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96);
HS_CMP_XCHG(r0_2, r0_3);
HS_CMP_XCHG(r0_1, r0_4);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_R(64) = r0_3;
HS_SLAB_LOCAL_R(96) = r0_4;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160);
HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192);
HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224);
HS_CMP_XCHG(r1_2, r1_3);
HS_CMP_XCHG(r1_1, r1_4);
HS_CMP_XCHG(r1_3, r1_4);
HS_CMP_XCHG(r1_1, r1_2);
HS_SLAB_LOCAL_L(128) = r1_1;
HS_SLAB_LOCAL_L(160) = r1_2;
HS_SLAB_LOCAL_R(192) = r1_3;
HS_SLAB_LOCAL_R(224) = r1_4;
}
{
HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(256);
HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(288);
HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(320);
HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(352);
HS_CMP_XCHG(r2_2, r2_3);
HS_CMP_XCHG(r2_1, r2_4);
HS_CMP_XCHG(r2_3, r2_4);
HS_CMP_XCHG(r2_1, r2_2);
HS_SLAB_LOCAL_L(256) = r2_1;
HS_SLAB_LOCAL_L(288) = r2_2;
HS_SLAB_LOCAL_R(320) = r2_3;
HS_SLAB_LOCAL_R(352) = r2_4;
}
{
HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(384);
HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(416);
HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(448);
HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(480);
HS_CMP_XCHG(r3_2, r3_3);
HS_CMP_XCHG(r3_1, r3_4);
HS_CMP_XCHG(r3_3, r3_4);
HS_CMP_XCHG(r3_1, r3_2);
HS_SLAB_LOCAL_L(384) = r3_1;
HS_SLAB_LOCAL_L(416) = r3_2;
HS_SLAB_LOCAL_R(448) = r3_3;
HS_SLAB_LOCAL_R(480) = r3_4;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128);
HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160);
HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192);
HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224);
HS_CMP_XCHG(r0_4, r0_5);
HS_CMP_XCHG(r0_3, r0_6);
HS_CMP_XCHG(r0_2, r0_7);
HS_CMP_XCHG(r0_1, r0_8);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_L(64) = r0_3;
HS_SLAB_LOCAL_L(96) = r0_4;
HS_SLAB_LOCAL_R(128) = r0_5;
HS_SLAB_LOCAL_R(160) = r0_6;
HS_SLAB_LOCAL_R(192) = r0_7;
HS_SLAB_LOCAL_R(224) = r0_8;
}
{
HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(256);
HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(288);
HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(320);
HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(352);
HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(384);
HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(416);
HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(448);
HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(480);
HS_CMP_XCHG(r1_4, r1_5);
HS_CMP_XCHG(r1_3, r1_6);
HS_CMP_XCHG(r1_2, r1_7);
HS_CMP_XCHG(r1_1, r1_8);
HS_CMP_XCHG(r1_5, r1_7);
HS_CMP_XCHG(r1_6, r1_8);
HS_CMP_XCHG(r1_5, r1_6);
HS_CMP_XCHG(r1_7, r1_8);
HS_CMP_XCHG(r1_1, r1_3);
HS_CMP_XCHG(r1_2, r1_4);
HS_CMP_XCHG(r1_1, r1_2);
HS_CMP_XCHG(r1_3, r1_4);
HS_SLAB_LOCAL_L(256) = r1_1;
HS_SLAB_LOCAL_L(288) = r1_2;
HS_SLAB_LOCAL_L(320) = r1_3;
HS_SLAB_LOCAL_L(352) = r1_4;
HS_SLAB_LOCAL_R(384) = r1_5;
HS_SLAB_LOCAL_R(416) = r1_6;
HS_SLAB_LOCAL_R(448) = r1_7;
HS_SLAB_LOCAL_R(480) = r1_8;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8;
HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9;
HS_BLOCK_BARRIER();
{
{
HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0);
HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32);
HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64);
HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96);
HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(128);
HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(160);
HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(192);
HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(224);
HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(256);
HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(288);
HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(320);
HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(352);
HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(384);
HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(416);
HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(448);
HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(480);
HS_CMP_XCHG(r0_8, r0_9);
HS_CMP_XCHG(r0_7, r0_10);
HS_CMP_XCHG(r0_6, r0_11);
HS_CMP_XCHG(r0_5, r0_12);
HS_CMP_XCHG(r0_4, r0_13);
HS_CMP_XCHG(r0_3, r0_14);
HS_CMP_XCHG(r0_2, r0_15);
HS_CMP_XCHG(r0_1, r0_16);
HS_CMP_XCHG(r0_9, r0_13);
HS_CMP_XCHG(r0_11, r0_15);
HS_CMP_XCHG(r0_9, r0_11);
HS_CMP_XCHG(r0_13, r0_15);
HS_CMP_XCHG(r0_10, r0_14);
HS_CMP_XCHG(r0_12, r0_16);
HS_CMP_XCHG(r0_10, r0_12);
HS_CMP_XCHG(r0_14, r0_16);
HS_CMP_XCHG(r0_9, r0_10);
HS_CMP_XCHG(r0_11, r0_12);
HS_CMP_XCHG(r0_13, r0_14);
HS_CMP_XCHG(r0_15, r0_16);
HS_CMP_XCHG(r0_1, r0_5);
HS_CMP_XCHG(r0_3, r0_7);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_2, r0_6);
HS_CMP_XCHG(r0_4, r0_8);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_L(64) = r0_3;
HS_SLAB_LOCAL_L(96) = r0_4;
HS_SLAB_LOCAL_L(128) = r0_5;
HS_SLAB_LOCAL_L(160) = r0_6;
HS_SLAB_LOCAL_L(192) = r0_7;
HS_SLAB_LOCAL_L(224) = r0_8;
HS_SLAB_LOCAL_R(256) = r0_9;
HS_SLAB_LOCAL_R(288) = r0_10;
HS_SLAB_LOCAL_R(320) = r0_11;
HS_SLAB_LOCAL_R(352) = r0_12;
HS_SLAB_LOCAL_R(384) = r0_13;
HS_SLAB_LOCAL_R(416) = r0_14;
HS_SLAB_LOCAL_R(448) = r0_15;
HS_SLAB_LOCAL_R(480) = r0_16;
}
}
HS_BLOCK_BARRIER();
r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
//
// Kernel declared by HS_BC_KERNEL_PROTO(1, 0).
//
// Loads 16 keys (r1..r16) from `vout`, runs a fixed network of
// HS_CMP_HALF and HS_CMP_XCHG steps over them, and stores the 16 keys
// back through HS_SLAB_GLOBAL_STORE. Unlike the other HS_BC kernels in
// this file, this one declares no block-local memory and issues no
// HS_BLOCK_BARRIER().
//
// NOTE(review): the macro names suggest this is the single-slab
// "bitonic clean" step of the sort -- confirm against the definitions
// in ../hs_cuda_macros.h.
//
HS_BC_KERNEL_PROTO(1, 0)
{
// Per-thread setup for the HS_SLAB_GLOBAL_LOAD/STORE macros below.
HS_SLAB_GLOBAL_PREAMBLE();
// Load rows 0..15 of `vout` into r1..r16.
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15);
{
// Five scoped stages with HS_SLAB_HALF_PREAMBLE arguments
// 16, 8, 4, 2, 1. Each stage applies HS_CMP_HALF to every register.
// NOTE(review): presumably cross-lane compare steps at halving lane
// distances -- confirm against the macro definitions.
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Register-to-register compare-exchanges within this thread.
// Odd-numbered registers: pairs at index distance 8, 4, then 2.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
// Even-numbered registers: the same distance 8, 4, 2 pattern.
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
// Final pass over adjacent pairs (distance 1).
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
// Store r1..r16 to rows 0..15.
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
//
// Kernel declared by HS_BC_KERNEL_PROTO(2, 1).
//
// Phase 1: eight scoped steps each load two keys with
//          HS_BC_GLOBAL_LOAD_L(k) and HS_BC_GLOBAL_LOAD_L(k + 16) for
//          k = 0, 2, ..., 14, compare-exchange them, and write them to
//          block-local memory at offsets 128*j and 128*j + 32
//          (j = 0..7).
// Phase 2: after HS_BLOCK_BARRIER(), each thread reads 16 keys back
//          through HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * i), i = 0..15.
// Phase 3: the same HS_CMP_HALF / HS_CMP_XCHG network used by
//          HS_BC_KERNEL_PROTO(1, 0) runs on r1..r16, which are then
//          stored through HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): the arguments (2, 1) presumably mean 2 slabs per block
// (log2 = 1), and the local memory is presumably CUDA shared memory --
// confirm against ../hs_cuda_macros.h.
//
HS_BC_KERNEL_PROTO(2, 1)
{
// Block-local scratch declaration; arguments are (64, 16).
HS_BLOCK_LOCAL_MEM_DECL(64, 16);
HS_SLAB_GLOBAL_PREAMBLE();
HS_BC_MERGE_H_PREAMBLE(2);
{
// Phase 1: each scope handles one pair of rows that are 16 apart.
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(18);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(128) = r0_1;
HS_SLAB_LOCAL_L(160) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(256) = r0_1;
HS_SLAB_LOCAL_L(288) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(22);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(384) = r0_1;
HS_SLAB_LOCAL_L(416) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(512) = r0_1;
HS_SLAB_LOCAL_L(544) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(10);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(26);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(640) = r0_1;
HS_SLAB_LOCAL_L(672) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(768) = r0_1;
HS_SLAB_LOCAL_L(800) = r0_2;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(14);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(30);
HS_CMP_XCHG(r0_1, r0_2);
HS_SLAB_LOCAL_L(896) = r0_1;
HS_SLAB_LOCAL_L(928) = r0_2;
}
}
// The barrier separates the local-memory writes above from the reads
// below, which use a different access macro (HS_BX_LOCAL_V).
HS_BLOCK_BARRIER();
// Phase 2: gather 16 keys per thread from block-local memory.
HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0);
HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1);
HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2);
HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3);
HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4);
HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5);
HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6);
HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7);
HS_KEY_TYPE r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8);
HS_KEY_TYPE r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9);
HS_KEY_TYPE r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10);
HS_KEY_TYPE r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11);
HS_KEY_TYPE r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12);
HS_KEY_TYPE r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13);
HS_KEY_TYPE r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14);
HS_KEY_TYPE r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15);
{
// Phase 3: five HS_CMP_HALF stages with preamble arguments
// 16, 8, 4, 2, 1, each applied to all 16 registers.
// NOTE(review): presumably cross-lane steps -- confirm in the macros.
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Register-to-register compare-exchanges within this thread.
// Odd-numbered registers: pairs at index distance 8, 4, then 2.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
// Even-numbered registers: the same distance 8, 4, 2 pattern.
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
// Final pass over adjacent pairs (distance 1).
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
// Store r1..r16 to rows 0..15.
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
//
// Kernel declared by HS_BC_KERNEL_PROTO(4, 2).
//
// Phase 1: four scoped steps each load four keys with
//          HS_BC_GLOBAL_LOAD_L(k + {0, 16, 32, 48}) for k = 0, 4, 8, 12,
//          run a 4-input compare-exchange network on them, and write
//          them to block-local memory at offsets
//          512*j + {0, 32, 64, 96} (j = 0..3).
// Phase 2: after HS_BLOCK_BARRIER(), each thread reads 16 keys back
//          through HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * i), i = 0..15.
// Phase 3: the same HS_CMP_HALF / HS_CMP_XCHG network used by
//          HS_BC_KERNEL_PROTO(1, 0) runs on r1..r16, which are then
//          stored through HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): the arguments (4, 2) presumably mean 4 slabs per block
// (log2 = 2), and the local memory is presumably CUDA shared memory --
// confirm against ../hs_cuda_macros.h.
//
HS_BC_KERNEL_PROTO(4, 2)
{
// Block-local scratch declaration; arguments are (128, 16).
HS_BLOCK_LOCAL_MEM_DECL(128, 16);
HS_SLAB_GLOBAL_PREAMBLE();
HS_BC_MERGE_H_PREAMBLE(4);
{
// Phase 1: each scope handles four rows that are 16 apart.
// The compare-exchange pairs are at distance 2, then distance 1.
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_L(64) = r0_3;
HS_SLAB_LOCAL_L(96) = r0_4;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(36);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(52);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(512) = r0_1;
HS_SLAB_LOCAL_L(544) = r0_2;
HS_SLAB_LOCAL_L(576) = r0_3;
HS_SLAB_LOCAL_L(608) = r0_4;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(1024) = r0_1;
HS_SLAB_LOCAL_L(1056) = r0_2;
HS_SLAB_LOCAL_L(1088) = r0_3;
HS_SLAB_LOCAL_L(1120) = r0_4;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(44);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(60);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_SLAB_LOCAL_L(1536) = r0_1;
HS_SLAB_LOCAL_L(1568) = r0_2;
HS_SLAB_LOCAL_L(1600) = r0_3;
HS_SLAB_LOCAL_L(1632) = r0_4;
}
}
// The barrier separates the local-memory writes above from the reads
// below, which use a different access macro (HS_BX_LOCAL_V).
HS_BLOCK_BARRIER();
// Phase 2: gather 16 keys per thread from block-local memory.
HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0);
HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1);
HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2);
HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3);
HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4);
HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5);
HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6);
HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7);
HS_KEY_TYPE r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8);
HS_KEY_TYPE r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9);
HS_KEY_TYPE r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10);
HS_KEY_TYPE r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11);
HS_KEY_TYPE r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12);
HS_KEY_TYPE r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13);
HS_KEY_TYPE r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14);
HS_KEY_TYPE r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15);
{
// Phase 3: five HS_CMP_HALF stages with preamble arguments
// 16, 8, 4, 2, 1, each applied to all 16 registers.
// NOTE(review): presumably cross-lane steps -- confirm in the macros.
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Register-to-register compare-exchanges within this thread.
// Odd-numbered registers: pairs at index distance 8, 4, then 2.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
// Even-numbered registers: the same distance 8, 4, 2 pattern.
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
// Final pass over adjacent pairs (distance 1).
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
// Store r1..r16 to rows 0..15.
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
HS_BC_KERNEL_PROTO(8, 3)
{
HS_BLOCK_LOCAL_MEM_DECL(256, 16);
HS_SLAB_GLOBAL_PREAMBLE();
HS_BC_MERGE_H_PREAMBLE(8);
{
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48);
HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64);
HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80);
HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96);
HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112);
HS_CMP_XCHG(r0_1, r0_5);
HS_CMP_XCHG(r0_3, r0_7);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_2, r0_6);
HS_CMP_XCHG(r0_4, r0_8);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_L(64) = r0_3;
HS_SLAB_LOCAL_L(96) = r0_4;
HS_SLAB_LOCAL_L(128) = r0_5;
HS_SLAB_LOCAL_L(160) = r0_6;
HS_SLAB_LOCAL_L(192) = r0_7;
HS_SLAB_LOCAL_L(224) = r0_8;
}
{
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56);
HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(72);
HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(88);
HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(104);
HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(120);
HS_CMP_XCHG(r0_1, r0_5);
HS_CMP_XCHG(r0_3, r0_7);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_2, r0_6);
HS_CMP_XCHG(r0_4, r0_8);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_SLAB_LOCAL_L(2048) = r0_1;
HS_SLAB_LOCAL_L(2080) = r0_2;
HS_SLAB_LOCAL_L(2112) = r0_3;
HS_SLAB_LOCAL_L(2144) = r0_4;
HS_SLAB_LOCAL_L(2176) = r0_5;
HS_SLAB_LOCAL_L(2208) = r0_6;
HS_SLAB_LOCAL_L(2240) = r0_7;
HS_SLAB_LOCAL_L(2272) = r0_8;
}
}
HS_BLOCK_BARRIER();
HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0);
HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1);
HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2);
HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3);
HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4);
HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5);
HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6);
HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7);
HS_KEY_TYPE r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8);
HS_KEY_TYPE r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9);
HS_KEY_TYPE r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10);
HS_KEY_TYPE r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11);
HS_KEY_TYPE r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12);
HS_KEY_TYPE r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13);
HS_KEY_TYPE r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14);
HS_KEY_TYPE r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15);
{
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
//
// Kernel generated by HS_BC_KERNEL_PROTO(16, 4).
//
// Visible structure of this kernel:
//   1. Load 16 keys with HS_BC_GLOBAL_LOAD_L, run a fixed compare-exchange
//      network over them, and write them to block-local memory through
//      HS_SLAB_LOCAL_L.
//   2. HS_BLOCK_BARRIER().
//   3. Read 16 keys back through HS_BX_LOCAL_V, run five HS_CMP_HALF stages
//      (HS_SLAB_HALF_PREAMBLE arguments 16, 8, 4, 2, 1), then the same
//      compare-exchange network over r1..r16.
//   4. Write r1..r16 out with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): the meaning of the proto arguments (16, 4) and of the
// HS_BLOCK_LOCAL_MEM_DECL arguments (512, 16) is defined in
// hs_cuda_macros.h / hs_cuda_config.h and is not visible here -- confirm
// before changing any of the literal offsets below.
// NOTE(review): the statement order inside each network is significant;
// this looks like generated code and should not be hand-reordered.
//
HS_BC_KERNEL_PROTO(16, 4)
{
HS_BLOCK_LOCAL_MEM_DECL(512, 16);
HS_SLAB_GLOBAL_PREAMBLE();
HS_BC_MERGE_H_PREAMBLE(16);
{
{
// Phase 1: sixteen loads at offsets 0, 16, ..., 240 (step 16).
HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16);
HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32);
HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48);
HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64);
HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80);
HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96);
HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112);
HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(128);
HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(144);
HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(160);
HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(176);
HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(192);
HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(208);
HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(224);
HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(240);
// Compare-exchange network, odd-numbered registers: pair distances 8, 4, 2.
HS_CMP_XCHG(r0_1, r0_9);
HS_CMP_XCHG(r0_5, r0_13);
HS_CMP_XCHG(r0_1, r0_5);
HS_CMP_XCHG(r0_9, r0_13);
HS_CMP_XCHG(r0_3, r0_11);
HS_CMP_XCHG(r0_7, r0_15);
HS_CMP_XCHG(r0_3, r0_7);
HS_CMP_XCHG(r0_11, r0_15);
HS_CMP_XCHG(r0_1, r0_3);
HS_CMP_XCHG(r0_5, r0_7);
HS_CMP_XCHG(r0_9, r0_11);
HS_CMP_XCHG(r0_13, r0_15);
// Same pattern over the even-numbered registers.
HS_CMP_XCHG(r0_2, r0_10);
HS_CMP_XCHG(r0_6, r0_14);
HS_CMP_XCHG(r0_2, r0_6);
HS_CMP_XCHG(r0_10, r0_14);
HS_CMP_XCHG(r0_4, r0_12);
HS_CMP_XCHG(r0_8, r0_16);
HS_CMP_XCHG(r0_4, r0_8);
HS_CMP_XCHG(r0_12, r0_16);
HS_CMP_XCHG(r0_2, r0_4);
HS_CMP_XCHG(r0_6, r0_8);
HS_CMP_XCHG(r0_10, r0_12);
HS_CMP_XCHG(r0_14, r0_16);
// Final step: adjacent (odd, even) pairs.
HS_CMP_XCHG(r0_1, r0_2);
HS_CMP_XCHG(r0_3, r0_4);
HS_CMP_XCHG(r0_5, r0_6);
HS_CMP_XCHG(r0_7, r0_8);
HS_CMP_XCHG(r0_9, r0_10);
HS_CMP_XCHG(r0_11, r0_12);
HS_CMP_XCHG(r0_13, r0_14);
HS_CMP_XCHG(r0_15, r0_16);
// Write the sixteen keys to block-local memory at offsets 0, 32, ..., 480
// (step 32).
HS_SLAB_LOCAL_L(0) = r0_1;
HS_SLAB_LOCAL_L(32) = r0_2;
HS_SLAB_LOCAL_L(64) = r0_3;
HS_SLAB_LOCAL_L(96) = r0_4;
HS_SLAB_LOCAL_L(128) = r0_5;
HS_SLAB_LOCAL_L(160) = r0_6;
HS_SLAB_LOCAL_L(192) = r0_7;
HS_SLAB_LOCAL_L(224) = r0_8;
HS_SLAB_LOCAL_L(256) = r0_9;
HS_SLAB_LOCAL_L(288) = r0_10;
HS_SLAB_LOCAL_L(320) = r0_11;
HS_SLAB_LOCAL_L(352) = r0_12;
HS_SLAB_LOCAL_L(384) = r0_13;
HS_SLAB_LOCAL_L(416) = r0_14;
HS_SLAB_LOCAL_L(448) = r0_15;
HS_SLAB_LOCAL_L(480) = r0_16;
}
}
// Barrier between the local-memory writes above and the reads below.
HS_BLOCK_BARRIER();
// Phase 2: read sixteen keys back, one every 16 * HS_SLAB_THREADS elements.
HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0);
HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1);
HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2);
HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3);
HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4);
HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5);
HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6);
HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7);
HS_KEY_TYPE r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8);
HS_KEY_TYPE r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9);
HS_KEY_TYPE r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10);
HS_KEY_TYPE r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11);
HS_KEY_TYPE r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12);
HS_KEY_TYPE r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13);
HS_KEY_TYPE r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14);
HS_KEY_TYPE r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15);
{
// Five HS_CMP_HALF stages; the preamble argument halves each time
// (16, 8, 4, 2, 1). Each stage is braced separately, presumably because
// HS_SLAB_HALF_PREAMBLE declares stage-local variables -- confirm in
// hs_cuda_macros.h.
{
HS_SLAB_HALF_PREAMBLE(16);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(8);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(4);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(2);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
{
HS_SLAB_HALF_PREAMBLE(1);
HS_CMP_HALF(0, r1);
HS_CMP_HALF(1, r2);
HS_CMP_HALF(2, r3);
HS_CMP_HALF(3, r4);
HS_CMP_HALF(4, r5);
HS_CMP_HALF(5, r6);
HS_CMP_HALF(6, r7);
HS_CMP_HALF(7, r8);
HS_CMP_HALF(8, r9);
HS_CMP_HALF(9, r10);
HS_CMP_HALF(10, r11);
HS_CMP_HALF(11, r12);
HS_CMP_HALF(12, r13);
HS_CMP_HALF(13, r14);
HS_CMP_HALF(14, r15);
HS_CMP_HALF(15, r16);
}
// Register-only network over r1..r16, same pair pattern as in phase 1:
// odd registers (distances 8, 4, 2), even registers, then adjacent pairs.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
}
// Write r1..r16 out as slab rows 0..15.
HS_SLAB_GLOBAL_STORE(0, r1);
HS_SLAB_GLOBAL_STORE(1, r2);
HS_SLAB_GLOBAL_STORE(2, r3);
HS_SLAB_GLOBAL_STORE(3, r4);
HS_SLAB_GLOBAL_STORE(4, r5);
HS_SLAB_GLOBAL_STORE(5, r6);
HS_SLAB_GLOBAL_STORE(6, r7);
HS_SLAB_GLOBAL_STORE(7, r8);
HS_SLAB_GLOBAL_STORE(8, r9);
HS_SLAB_GLOBAL_STORE(9, r10);
HS_SLAB_GLOBAL_STORE(10, r11);
HS_SLAB_GLOBAL_STORE(11, r12);
HS_SLAB_GLOBAL_STORE(12, r13);
HS_SLAB_GLOBAL_STORE(13, r14);
HS_SLAB_GLOBAL_STORE(14, r15);
HS_SLAB_GLOBAL_STORE(15, r16);
}
//
// Kernel generated by HS_OFFSET_FM_KERNEL_PROTO(0, 0).
//
// Loads 8 keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and 1 key with
// HS_FM_GLOBAL_LOAD_R (r9), compare-exchanges across the two groups, runs a
// fixed network over r1..r8, and stores every key back to the index it was
// loaded from.
//
// NOTE(review): the meaning of the proto arguments (0, 0) and of the
// preamble argument 8 comes from hs_cuda_macros.h and is not visible here
// -- confirm. The statement order of the network is significant.
//
HS_OFFSET_FM_KERNEL_PROTO(0, 0)
{
HS_OFFSET_FM_PREAMBLE(8);
HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
// Cross step: last L key against the single R key.
HS_CMP_XCHG(r8, r9);
// Network over r1..r8: odd registers (distances 4, 2), even registers
// (distances 4, 2), then adjacent pairs.
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
// r9 needs no further steps: it is the only R key.
HS_XM_GLOBAL_STORE_L(0, r1);
HS_XM_GLOBAL_STORE_L(1, r2);
HS_XM_GLOBAL_STORE_L(2, r3);
HS_XM_GLOBAL_STORE_L(3, r4);
HS_XM_GLOBAL_STORE_L(4, r5);
HS_XM_GLOBAL_STORE_L(5, r6);
HS_XM_GLOBAL_STORE_L(6, r7);
HS_XM_GLOBAL_STORE_L(7, r8);
HS_FM_GLOBAL_STORE_R(0, r9);
}
//
// Kernel generated by HS_OFFSET_FM_KERNEL_PROTO(0, 1).
//
// Loads 8 keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and 2 keys with
// HS_FM_GLOBAL_LOAD_R (r9, r10), compare-exchanges across the two groups,
// runs a fixed network inside each group, and stores every key back to the
// index it was loaded from.
//
// NOTE(review): the meaning of the proto arguments (0, 1) and of the
// preamble argument 8 comes from hs_cuda_macros.h and is not visible here
// -- confirm. The statement order of the network is significant.
//
HS_OFFSET_FM_KERNEL_PROTO(0, 1)
{
HS_OFFSET_FM_PREAMBLE(8);
HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
// Cross steps: L keys taken from the end (r8, r7) against R keys taken from
// the start (r9, r10).
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r7, r10);
// Network over r1..r8: odd registers (distances 4, 2), even registers
// (distances 4, 2), then adjacent pairs.
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
// Network over the two R keys.
HS_CMP_XCHG(r9, r10);
HS_XM_GLOBAL_STORE_L(0, r1);
HS_XM_GLOBAL_STORE_L(1, r2);
HS_XM_GLOBAL_STORE_L(2, r3);
HS_XM_GLOBAL_STORE_L(3, r4);
HS_XM_GLOBAL_STORE_L(4, r5);
HS_XM_GLOBAL_STORE_L(5, r6);
HS_XM_GLOBAL_STORE_L(6, r7);
HS_XM_GLOBAL_STORE_L(7, r8);
HS_FM_GLOBAL_STORE_R(0, r9);
HS_FM_GLOBAL_STORE_R(1, r10);
}
//
// Kernel generated by HS_OFFSET_FM_KERNEL_PROTO(0, 2).
//
// Loads 8 keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and 4 keys with
// HS_FM_GLOBAL_LOAD_R (r9..r12), compare-exchanges across the two groups,
// runs a fixed network inside each group, and stores every key back to the
// index it was loaded from.
//
// NOTE(review): the meaning of the proto arguments (0, 2) and of the
// preamble argument 8 comes from hs_cuda_macros.h and is not visible here
// -- confirm. The statement order of the network is significant.
//
HS_OFFSET_FM_KERNEL_PROTO(0, 2)
{
HS_OFFSET_FM_PREAMBLE(8);
HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
// Cross steps: L keys taken from the end (r8..r5) against R keys taken from
// the start (r9..r12).
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r5, r12);
// Network over r1..r8: odd registers (distances 4, 2), even registers
// (distances 4, 2), then adjacent pairs.
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
// Network over r9..r12: distance 2, then adjacent pairs.
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_XM_GLOBAL_STORE_L(0, r1);
HS_XM_GLOBAL_STORE_L(1, r2);
HS_XM_GLOBAL_STORE_L(2, r3);
HS_XM_GLOBAL_STORE_L(3, r4);
HS_XM_GLOBAL_STORE_L(4, r5);
HS_XM_GLOBAL_STORE_L(5, r6);
HS_XM_GLOBAL_STORE_L(6, r7);
HS_XM_GLOBAL_STORE_L(7, r8);
HS_FM_GLOBAL_STORE_R(0, r9);
HS_FM_GLOBAL_STORE_R(1, r10);
HS_FM_GLOBAL_STORE_R(2, r11);
HS_FM_GLOBAL_STORE_R(3, r12);
}
//
// Kernel generated by HS_FM_KERNEL_PROTO(0, 3).
//
// Loads 8 keys with HS_XM_GLOBAL_LOAD_L (r1..r8) and 8 keys with
// HS_FM_GLOBAL_LOAD_R (r9..r16), compare-exchanges every L key against an R
// key, runs the same fixed network inside each group of 8, and stores every
// key back to the index it was loaded from.
//
// NOTE(review): unlike the (0, 0)..(0, 2) variants, this one uses the
// non-OFFSET proto and preamble macros. That may be intentional for the
// variant with equal-sized L and R groups -- confirm against
// hs_cuda_macros.h. The meaning of the proto arguments (0, 3) is not visible
// here either.
//
HS_FM_KERNEL_PROTO(0, 3)
{
HS_FM_PREAMBLE(8);
HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0);
HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1);
HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2);
HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3);
HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4);
HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5);
HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6);
HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7);
// Cross steps: L keys in reverse order (r8..r1) against R keys in forward
// order (r9..r16).
HS_CMP_XCHG(r8, r9);
HS_CMP_XCHG(r7, r10);
HS_CMP_XCHG(r6, r11);
HS_CMP_XCHG(r5, r12);
HS_CMP_XCHG(r4, r13);
HS_CMP_XCHG(r3, r14);
HS_CMP_XCHG(r2, r15);
HS_CMP_XCHG(r1, r16);
// Network over r1..r8: odd registers (distances 4, 2), even registers
// (distances 4, 2), then adjacent pairs.
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
// Same network over r9..r16.
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_XM_GLOBAL_STORE_L(0, r1);
HS_XM_GLOBAL_STORE_L(1, r2);
HS_XM_GLOBAL_STORE_L(2, r3);
HS_XM_GLOBAL_STORE_L(3, r4);
HS_XM_GLOBAL_STORE_L(4, r5);
HS_XM_GLOBAL_STORE_L(5, r6);
HS_XM_GLOBAL_STORE_L(6, r7);
HS_XM_GLOBAL_STORE_L(7, r8);
HS_FM_GLOBAL_STORE_R(0, r9);
HS_FM_GLOBAL_STORE_R(1, r10);
HS_FM_GLOBAL_STORE_R(2, r11);
HS_FM_GLOBAL_STORE_R(3, r12);
HS_FM_GLOBAL_STORE_R(4, r13);
HS_FM_GLOBAL_STORE_R(5, r14);
HS_FM_GLOBAL_STORE_R(6, r15);
HS_FM_GLOBAL_STORE_R(7, r16);
}
//
// Kernel generated by HS_HM_KERNEL_PROTO(0).
//
// Loads 16 keys with HS_XM_GLOBAL_LOAD_L (indices 0..15), runs a fixed
// compare-exchange network over them, and stores each key back to the same
// index with HS_XM_GLOBAL_STORE_L.
//
// NOTE(review): the meaning of the proto argument 0 and the preamble
// argument 8 comes from hs_cuda_macros.h and is not visible here -- confirm.
// The statement order of the network is significant.
//
HS_HM_KERNEL_PROTO(0)
{
HS_HM_PREAMBLE(8);
HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0);
HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1);
HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2);
HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3);
HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4);
HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5);
HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6);
HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7);
HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8);
HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9);
HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10);
HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11);
HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12);
HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13);
HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14);
HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15);
// Odd-numbered registers: pair distances 8, 4, 2.
HS_CMP_XCHG(r1, r9);
HS_CMP_XCHG(r5, r13);
HS_CMP_XCHG(r1, r5);
HS_CMP_XCHG(r9, r13);
HS_CMP_XCHG(r3, r11);
HS_CMP_XCHG(r7, r15);
HS_CMP_XCHG(r3, r7);
HS_CMP_XCHG(r11, r15);
HS_CMP_XCHG(r1, r3);
HS_CMP_XCHG(r5, r7);
HS_CMP_XCHG(r9, r11);
HS_CMP_XCHG(r13, r15);
// Even-numbered registers: same pattern.
HS_CMP_XCHG(r2, r10);
HS_CMP_XCHG(r6, r14);
HS_CMP_XCHG(r2, r6);
HS_CMP_XCHG(r10, r14);
HS_CMP_XCHG(r4, r12);
HS_CMP_XCHG(r8, r16);
HS_CMP_XCHG(r4, r8);
HS_CMP_XCHG(r12, r16);
HS_CMP_XCHG(r2, r4);
HS_CMP_XCHG(r6, r8);
HS_CMP_XCHG(r10, r12);
HS_CMP_XCHG(r14, r16);
// Adjacent (odd, even) pairs.
HS_CMP_XCHG(r1, r2);
HS_CMP_XCHG(r3, r4);
HS_CMP_XCHG(r5, r6);
HS_CMP_XCHG(r7, r8);
HS_CMP_XCHG(r9, r10);
HS_CMP_XCHG(r11, r12);
HS_CMP_XCHG(r13, r14);
HS_CMP_XCHG(r15, r16);
HS_XM_GLOBAL_STORE_L(0, r1);
HS_XM_GLOBAL_STORE_L(1, r2);
HS_XM_GLOBAL_STORE_L(2, r3);
HS_XM_GLOBAL_STORE_L(3, r4);
HS_XM_GLOBAL_STORE_L(4, r5);
HS_XM_GLOBAL_STORE_L(5, r6);
HS_XM_GLOBAL_STORE_L(6, r7);
HS_XM_GLOBAL_STORE_L(7, r8);
HS_XM_GLOBAL_STORE_L(8, r9);
HS_XM_GLOBAL_STORE_L(9, r10);
HS_XM_GLOBAL_STORE_L(10, r11);
HS_XM_GLOBAL_STORE_L(11, r12);
HS_XM_GLOBAL_STORE_L(12, r13);
HS_XM_GLOBAL_STORE_L(13, r14);
HS_XM_GLOBAL_STORE_L(14, r15);
HS_XM_GLOBAL_STORE_L(15, r16);
}
//
// Kernel generated by HS_TRANSPOSE_KERNEL_PROTO().
//
// Loads slab rows 0..15 from vout into r1..r16 and then invokes
// HS_TRANSPOSE_SLAB().
//
// NOTE(review): HS_TRANSPOSE_SLAB() takes no arguments and this block
// contains no store, so the macro presumably refers to r1..r16 by name and
// writes the result itself -- confirm in hs_cuda_macros.h. Do not rename
// these registers.
//
HS_TRANSPOSE_KERNEL_PROTO()
{
HS_SLAB_GLOBAL_PREAMBLE();
HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0);
HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1);
HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2);
HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3);
HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4);
HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5);
HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6);
HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7);
HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8);
HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9);
HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10);
HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11);
HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12);
HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13);
HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14);
HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15);
HS_TRANSPOSE_SLAB();
}
//
//
//
#include "../../hs_cuda.inl"
//
//
//