| // |
| // Copyright 2016 Google Inc. |
| // |
| // Use of this source code is governed by a BSD-style |
| // license that can be found in the LICENSE file. |
| // |
| |
| #ifdef __cplusplus |
| extern "C" |
| { |
| #endif |
| |
| #include "hs_cuda.h" |
| |
| #ifdef __cplusplus |
| } |
| #endif |
| |
| #include "hs_cuda_config.h" |
| |
| #include "../hs_cuda_macros.h" |
| |
| // |
| // |
| // |
| |
// Kernel declared through HS_OFFSET_BS_KERNEL_PROTO(1, 0).
//
// Reads sixteen keys (rows 0..15) with HS_SLAB_GLOBAL_LOAD into the locals
// r1..r16, applies a fixed, fully unrolled sequence of HS_CMP_XCHG,
// HS_CMP_FLIP and HS_CMP_HALF steps to them, and writes r1..r16 back to
// rows 0..15 with HS_SLAB_GLOBAL_STORE.
//
// NOTE(review): every HS_* macro used here is defined outside this file
// (presumably in hs_cuda_config.h / hs_cuda_macros.h). The meaning of the
// two PROTO arguments, the kernel parameters (including `vin`), the
// expected launch configuration and the exact semantics of each macro
// should be confirmed there. The statement order is significant; do not
// reorder.
| HS_OFFSET_BS_KERNEL_PROTO(1, 0) |
| { |
// Set-up required by the global load/store macros below.
| HS_OFFSET_SLAB_GLOBAL_PREAMBLE(); |
// Load rows 0..15 from `vin` into r1..r16 (row k goes to r(k+1)).
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); |
// 60 HS_CMP_XCHG steps over r1..r16.
// NOTE(review): this looks like a fixed 16-input sorting network executed
// on the locals of one thread; confirm against the generator.
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r4, r13); |
| HS_CMP_XCHG(r14, r15); |
| HS_CMP_XCHG(r8, r12); |
| HS_CMP_XCHG(r2, r3); |
| HS_CMP_XCHG(r5, r9); |
| HS_CMP_XCHG(r2, r5); |
| HS_CMP_XCHG(r8, r14); |
| HS_CMP_XCHG(r3, r9); |
| HS_CMP_XCHG(r12, r15); |
| HS_CMP_XCHG(r3, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r14); |
| HS_CMP_XCHG(r4, r9); |
| HS_CMP_XCHG(r8, r13); |
| HS_CMP_XCHG(r7, r9); |
| HS_CMP_XCHG(r11, r13); |
| HS_CMP_XCHG(r4, r6); |
| HS_CMP_XCHG(r8, r10); |
| HS_CMP_XCHG(r4, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r8, r9); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r13); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
// Flip step, preamble argument 1. Each HS_CMP_FLIP pairs a register with its
// mirror (r1/r16, r2/r15, ..., r8/r9).
// NOTE(review): presumably a cross-lane exchange; confirm in the macro header.
| { |
| HS_SLAB_FLIP_PREAMBLE(1); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// 32 HS_CMP_XCHG steps: strides 8, 4 and 2 within the odd-numbered
// registers, the same within the even-numbered registers, then the adjacent
// pairs (r1,r2) ... (r15,r16). This same sequence recurs after every
// flip/half group below.
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 3.
| { |
| HS_SLAB_FLIP_PREAMBLE(3); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half step, preamble argument 1, applied to each of r1..r16 in turn.
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 7.
| { |
| HS_SLAB_FLIP_PREAMBLE(7); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half steps with preamble arguments 2, then 1.
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 15.
| { |
| HS_SLAB_FLIP_PREAMBLE(15); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half steps with preamble arguments 4, 2, then 1.
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 31.
| { |
| HS_SLAB_FLIP_PREAMBLE(31); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half steps with preamble arguments 8, 4, 2, then 1.
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// Final 32-step HS_CMP_XCHG sequence (same pairs and order as above).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Write r1..r16 back to rows 0..15.
| HS_SLAB_GLOBAL_STORE(0, r1); |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
// Kernel declared through HS_OFFSET_BS_KERNEL_PROTO(2, 1).
//
// Starts with the same load / HS_CMP_XCHG / HS_CMP_FLIP / HS_CMP_HALF
// sequence as the (1, 0) kernel, then adds one more stage: r1..r16 are
// staged through local memory (HS_BLOCK_LOCAL_MEM_DECL(64, 16)), eight
// compare-exchanges are done on pairs read back from that memory between two
// HS_BLOCK_BARRIER() calls, the registers are reloaded, and a final run of
// half steps (16, 8, 4, 2, 1) plus the 32-step HS_CMP_XCHG sequence precedes
// the global store.
//
// NOTE(review): every HS_* macro used here is defined outside this file
// (presumably in hs_cuda_config.h / hs_cuda_macros.h). The meaning of the
// PROTO and MEM_DECL arguments, the memory space behind HS_BX_LOCAL_V /
// HS_SLAB_LOCAL_L / HS_SLAB_LOCAL_R, and the scope of HS_BLOCK_BARRIER
// should be confirmed there. The statement order is significant; do not
// reorder.
| HS_OFFSET_BS_KERNEL_PROTO(2, 1) |
| { |
// Declares the local staging storage used by the merge stage further down.
| HS_BLOCK_LOCAL_MEM_DECL(64, 16); |
| |
// Set-up required by the global load/store macros below.
| HS_OFFSET_SLAB_GLOBAL_PREAMBLE(); |
// Load rows 0..15 from `vin` into r1..r16 (row k goes to r(k+1)).
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); |
// 60 HS_CMP_XCHG steps over r1..r16.
// NOTE(review): this looks like a fixed 16-input sorting network executed
// on the locals of one thread; confirm against the generator.
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r4, r13); |
| HS_CMP_XCHG(r14, r15); |
| HS_CMP_XCHG(r8, r12); |
| HS_CMP_XCHG(r2, r3); |
| HS_CMP_XCHG(r5, r9); |
| HS_CMP_XCHG(r2, r5); |
| HS_CMP_XCHG(r8, r14); |
| HS_CMP_XCHG(r3, r9); |
| HS_CMP_XCHG(r12, r15); |
| HS_CMP_XCHG(r3, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r14); |
| HS_CMP_XCHG(r4, r9); |
| HS_CMP_XCHG(r8, r13); |
| HS_CMP_XCHG(r7, r9); |
| HS_CMP_XCHG(r11, r13); |
| HS_CMP_XCHG(r4, r6); |
| HS_CMP_XCHG(r8, r10); |
| HS_CMP_XCHG(r4, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r8, r9); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r13); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
// Flip step, preamble argument 1. Each HS_CMP_FLIP pairs a register with its
// mirror (r1/r16, r2/r15, ..., r8/r9).
// NOTE(review): presumably a cross-lane exchange; confirm in the macro header.
| { |
| HS_SLAB_FLIP_PREAMBLE(1); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// 32 HS_CMP_XCHG steps: strides 8, 4 and 2 within the odd-numbered
// registers, the same within the even-numbered registers, then the adjacent
// pairs (r1,r2) ... (r15,r16). This same sequence recurs after every
// flip/half group below.
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 3.
| { |
| HS_SLAB_FLIP_PREAMBLE(3); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half step, preamble argument 1, applied to each of r1..r16 in turn.
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 7.
| { |
| HS_SLAB_FLIP_PREAMBLE(7); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half steps with preamble arguments 2, then 1.
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 15.
| { |
| HS_SLAB_FLIP_PREAMBLE(15); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half steps with preamble arguments 4, 2, then 1.
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Flip step, preamble argument 31.
| { |
| HS_SLAB_FLIP_PREAMBLE(31); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
// Half steps with preamble arguments 8, 4, 2, then 1.
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
// 32-step HS_CMP_XCHG sequence (same pairs and order as after the first flip).
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
// Local-memory merge stage. Stage the registers at offsets
// 2 * HS_SLAB_THREADS * (0..15), alternating from both ends of the register
// list: r1, r16, r2, r15, ..., r8, r9.
| HS_BS_MERGE_H_PREAMBLE(2); |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15) = r9; |
// NOTE(review): presumably makes the staged values visible to the other
// threads before they are read below; confirm the barrier's scope.
| HS_BLOCK_BARRIER(); |
// Eight compare-exchanges through local memory: HS_SLAB_LOCAL_L(k) against
// HS_SLAB_LOCAL_R(k + 32) for k = 0, 128, ..., 896. Each pair is read,
// passed through HS_CMP_XCHG and written back to the same two locations.
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_R(32) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(128); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(160); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(128) = r0_1; |
| HS_SLAB_LOCAL_R(160) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(256); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(288); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(256) = r0_1; |
| HS_SLAB_LOCAL_R(288) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(384); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(416); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(384) = r0_1; |
| HS_SLAB_LOCAL_R(416) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(544); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(512) = r0_1; |
| HS_SLAB_LOCAL_R(544) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(640); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(672); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(640) = r0_1; |
| HS_SLAB_LOCAL_R(672) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(768); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(800); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(768) = r0_1; |
| HS_SLAB_LOCAL_R(800) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(896); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(928); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(896) = r0_1; |
| HS_SLAB_LOCAL_R(928) = r0_2; |
| } |
| } |
// Second barrier before the registers are reloaded from local memory.
| HS_BLOCK_BARRIER(); |
// Reload the registers from the same offsets and in the same
// r1, r16, r2, r15, ... order used when staging.
| r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15); |
// Post-merge clean-up: half steps with preamble arguments 16, 8, 4, 2, 1,
// followed by the 32-step HS_CMP_XCHG sequence.
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
// Write r1..r16 back to rows 0..15.
| HS_SLAB_GLOBAL_STORE(0, r1); |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_OFFSET_BS_KERNEL_PROTO(4, 2) /* Kernel variant (4, 2): each thread loads 16 keys (r1..r16), runs compare-exchange stages on them (register-only, slab FLIP/HALF, and two passes through block-local memory), then stores the 16 keys back. All HS_* macros are defined in headers not visible here. NOTE(review): the arguments presumably mean 4 slabs per block with log2 = 2 - confirm against the macro header. */ |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(128, 16); /* presumably declares the block-local scratch accessed below via HS_BX_LOCAL_V and HS_SLAB_LOCAL_L/R - TODO confirm what 128 and 16 denote */ |
| |
| HS_OFFSET_SLAB_GLOBAL_PREAMBLE(); /* presumably sets up the addressing used by HS_SLAB_GLOBAL_LOAD/STORE - definition not visible here */ |
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); /* load the 16 keys r1..r16 from vin using indices 0..15 */ |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); |
| HS_CMP_XCHG(r1, r2); /* 60 compare-exchange steps over r1..r16 follow; presumably a 16-input sorting network on this thread's keys - TODO confirm HS_CMP_XCHG ordering semantics. Statement order is significant. */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r4, r13); |
| HS_CMP_XCHG(r14, r15); |
| HS_CMP_XCHG(r8, r12); |
| HS_CMP_XCHG(r2, r3); |
| HS_CMP_XCHG(r5, r9); |
| HS_CMP_XCHG(r2, r5); |
| HS_CMP_XCHG(r8, r14); |
| HS_CMP_XCHG(r3, r9); |
| HS_CMP_XCHG(r12, r15); |
| HS_CMP_XCHG(r3, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r14); |
| HS_CMP_XCHG(r4, r9); |
| HS_CMP_XCHG(r8, r13); |
| HS_CMP_XCHG(r7, r9); |
| HS_CMP_XCHG(r11, r13); |
| HS_CMP_XCHG(r4, r6); |
| HS_CMP_XCHG(r8, r10); |
| HS_CMP_XCHG(r4, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r8, r9); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r13); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| { /* FLIP stage, argument 1: pairs the keys end-to-end (r1/r16, r2/r15, ..., r8/r9). The preamble presumably sets up state the HS_CMP_FLIP calls rely on - TODO confirm. */ |
| HS_SLAB_FLIP_PREAMBLE(1); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| HS_CMP_XCHG(r1, r9); /* fixed 32-step compare-exchange sequence over r1..r16; the identical sequence recurs after every FLIP/HALF group below */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { /* FLIP stage, argument 3 */ |
| HS_SLAB_FLIP_PREAMBLE(3); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { /* HALF stage, argument 1: HS_CMP_HALF is applied to each of r1..r16 in turn. HALF stages after a FLIP use descending powers of two down to 1. */ |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* 32-step register sequence (same as above) */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { /* FLIP stage, argument 7; followed by HALF stages 2 and 1 */ |
| HS_SLAB_FLIP_PREAMBLE(7); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* 32-step register sequence (same as above) */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { /* FLIP stage, argument 15; followed by HALF stages 4, 2 and 1 */ |
| HS_SLAB_FLIP_PREAMBLE(15); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* 32-step register sequence (same as above) */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { /* FLIP stage, argument 31; followed by HALF stages 8, 4, 2 and 1 */ |
| HS_SLAB_FLIP_PREAMBLE(31); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* 32-step register sequence (same as above) */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_BS_MERGE_H_PREAMBLE(4); /* presumably sets up the indexing used by HS_BX_LOCAL_V and HS_SLAB_LOCAL_L/R for the block-level passes below - definition not visible here */ |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; /* write all 16 keys out through HS_BX_LOCAL_V, alternating from both ends: r1, r16, r2, r15, ..., r8, r9 */ |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); /* presumably a block-wide barrier so the writes above are complete before the HS_SLAB_LOCAL_* reads - TODO confirm */ |
| { /* first local-memory pass: eight independent 2-key compare-exchanges, each reading an L entry at offset x and an R entry at offset x + 32, then writing both back */ |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_R(32) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(64) = r1_1; |
| HS_SLAB_LOCAL_R(96) = r1_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(544); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(512) = r0_1; |
| HS_SLAB_LOCAL_R(544) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(576); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(608); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(576) = r1_1; |
| HS_SLAB_LOCAL_R(608) = r1_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1024); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(1056); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(1024) = r0_1; |
| HS_SLAB_LOCAL_R(1056) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(1088); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(1120); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(1088) = r1_1; |
| HS_SLAB_LOCAL_R(1120) = r1_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1536); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(1568); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(1536) = r0_1; |
| HS_SLAB_LOCAL_R(1568) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(1600); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(1632); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(1600) = r1_1; |
| HS_SLAB_LOCAL_R(1632) = r1_2; |
| } |
| } |
| HS_BLOCK_BARRIER(); /* second barrier: the write-backs above must finish before the keys are reloaded into r1..r16 */ |
| r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); /* reload the 16 keys using the same slot-to-register mapping as the stores above */ |
| r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15); |
| { /* after the first local-memory pass: HALF stages 16, 8, 4, 2, 1, then the 32-step register sequence */ |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0) = r1; /* write the keys out again (same order as before) for the second local-memory pass */ |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { /* second local-memory pass: four independent 4-key groups (two L reads, two R reads); each group does four compare-exchanges in the order (2,3), (1,4), (3,4), (1,2) and writes all four keys back */ |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_R(64) = r0_3; |
| HS_SLAB_LOCAL_R(96) = r0_4; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(512); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(544); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(576); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(608); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(512) = r0_1; |
| HS_SLAB_LOCAL_L(544) = r0_2; |
| HS_SLAB_LOCAL_R(576) = r0_3; |
| HS_SLAB_LOCAL_R(608) = r0_4; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1024); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(1056); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(1088); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(1120); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(1024) = r0_1; |
| HS_SLAB_LOCAL_L(1056) = r0_2; |
| HS_SLAB_LOCAL_R(1088) = r0_3; |
| HS_SLAB_LOCAL_R(1120) = r0_4; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(1536); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(1568); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(1600); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(1632); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(1536) = r0_1; |
| HS_SLAB_LOCAL_L(1568) = r0_2; |
| HS_SLAB_LOCAL_R(1600) = r0_3; |
| HS_SLAB_LOCAL_R(1632) = r0_4; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); /* reload the 16 keys after the second local-memory pass */ |
| r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15); |
| { /* final cleanup: HALF stages 16, 8, 4, 2, 1, then the 32-step register sequence */ |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); /* store r1..r16 back via HS_SLAB_GLOBAL_STORE using indices 0..15 (destination is determined by the macro, not visible here) */ |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_OFFSET_BS_KERNEL_PROTO(8, 3) |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(256, 16); |
| |
| HS_OFFSET_SLAB_GLOBAL_PREAMBLE(); |
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r4, r13); |
| HS_CMP_XCHG(r14, r15); |
| HS_CMP_XCHG(r8, r12); |
| HS_CMP_XCHG(r2, r3); |
| HS_CMP_XCHG(r5, r9); |
| HS_CMP_XCHG(r2, r5); |
| HS_CMP_XCHG(r8, r14); |
| HS_CMP_XCHG(r3, r9); |
| HS_CMP_XCHG(r12, r15); |
| HS_CMP_XCHG(r3, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r14); |
| HS_CMP_XCHG(r4, r9); |
| HS_CMP_XCHG(r8, r13); |
| HS_CMP_XCHG(r7, r9); |
| HS_CMP_XCHG(r11, r13); |
| HS_CMP_XCHG(r4, r6); |
| HS_CMP_XCHG(r8, r10); |
| HS_CMP_XCHG(r4, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r8, r9); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r13); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| { |
| HS_SLAB_FLIP_PREAMBLE(1); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(3); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(7); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(15); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(31); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_BS_MERGE_H_PREAMBLE(8); |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_R(32) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(64) = r1_1; |
| HS_SLAB_LOCAL_R(96) = r1_2; |
| } |
| { |
| HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128); |
| HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160); |
| HS_CMP_XCHG(r2_1, r2_2); |
| HS_SLAB_LOCAL_L(128) = r2_1; |
| HS_SLAB_LOCAL_R(160) = r2_2; |
| } |
| { |
| HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192); |
| HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224); |
| HS_CMP_XCHG(r3_1, r3_2); |
| HS_SLAB_LOCAL_L(192) = r3_1; |
| HS_SLAB_LOCAL_R(224) = r3_2; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(2048); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(2080); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(2048) = r0_1; |
| HS_SLAB_LOCAL_R(2080) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(2112); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(2144); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(2112) = r1_1; |
| HS_SLAB_LOCAL_R(2144) = r1_2; |
| } |
| { |
| HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(2176); |
| HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(2208); |
| HS_CMP_XCHG(r2_1, r2_2); |
| HS_SLAB_LOCAL_L(2176) = r2_1; |
| HS_SLAB_LOCAL_R(2208) = r2_2; |
| } |
| { |
| HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(2240); |
| HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(2272); |
| HS_CMP_XCHG(r3_1, r3_2); |
| HS_SLAB_LOCAL_L(2240) = r3_1; |
| HS_SLAB_LOCAL_R(2272) = r3_2; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_R(64) = r0_3; |
| HS_SLAB_LOCAL_R(96) = r0_4; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160); |
| HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192); |
| HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224); |
| HS_CMP_XCHG(r1_2, r1_3); |
| HS_CMP_XCHG(r1_1, r1_4); |
| HS_CMP_XCHG(r1_3, r1_4); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(128) = r1_1; |
| HS_SLAB_LOCAL_L(160) = r1_2; |
| HS_SLAB_LOCAL_R(192) = r1_3; |
| HS_SLAB_LOCAL_R(224) = r1_4; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(2048); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(2080); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(2112); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(2144); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(2048) = r0_1; |
| HS_SLAB_LOCAL_L(2080) = r0_2; |
| HS_SLAB_LOCAL_R(2112) = r0_3; |
| HS_SLAB_LOCAL_R(2144) = r0_4; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(2176); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(2208); |
| HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(2240); |
| HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(2272); |
| HS_CMP_XCHG(r1_2, r1_3); |
| HS_CMP_XCHG(r1_1, r1_4); |
| HS_CMP_XCHG(r1_3, r1_4); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(2176) = r1_1; |
| HS_SLAB_LOCAL_L(2208) = r1_2; |
| HS_SLAB_LOCAL_R(2240) = r1_3; |
| HS_SLAB_LOCAL_R(2272) = r1_4; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96); |
| HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128); |
| HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160); |
| HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192); |
| HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224); |
| HS_CMP_XCHG(r0_4, r0_5); |
| HS_CMP_XCHG(r0_3, r0_6); |
| HS_CMP_XCHG(r0_2, r0_7); |
| HS_CMP_XCHG(r0_1, r0_8); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_L(64) = r0_3; |
| HS_SLAB_LOCAL_L(96) = r0_4; |
| HS_SLAB_LOCAL_R(128) = r0_5; |
| HS_SLAB_LOCAL_R(160) = r0_6; |
| HS_SLAB_LOCAL_R(192) = r0_7; |
| HS_SLAB_LOCAL_R(224) = r0_8; |
| } |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(2048); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(2080); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(2112); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(2144); |
| HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(2176); |
| HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(2208); |
| HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(2240); |
| HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(2272); |
| HS_CMP_XCHG(r0_4, r0_5); |
| HS_CMP_XCHG(r0_3, r0_6); |
| HS_CMP_XCHG(r0_2, r0_7); |
| HS_CMP_XCHG(r0_1, r0_8); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(2048) = r0_1; |
| HS_SLAB_LOCAL_L(2080) = r0_2; |
| HS_SLAB_LOCAL_L(2112) = r0_3; |
| HS_SLAB_LOCAL_L(2144) = r0_4; |
| HS_SLAB_LOCAL_R(2176) = r0_5; |
| HS_SLAB_LOCAL_R(2208) = r0_6; |
| HS_SLAB_LOCAL_R(2240) = r0_7; |
| HS_SLAB_LOCAL_R(2272) = r0_8; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_BS_KERNEL_PROTO(16, 4) |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(512, 16); |
| |
| HS_SLAB_GLOBAL_PREAMBLE(); |
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vin, 0); |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vin, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vin, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vin, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vin, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vin, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vin, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vin, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vin, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vin, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vin, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vin, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vin, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vin, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vin, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vin, 15); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r4, r13); |
| HS_CMP_XCHG(r14, r15); |
| HS_CMP_XCHG(r8, r12); |
| HS_CMP_XCHG(r2, r3); |
| HS_CMP_XCHG(r5, r9); |
| HS_CMP_XCHG(r2, r5); |
| HS_CMP_XCHG(r8, r14); |
| HS_CMP_XCHG(r3, r9); |
| HS_CMP_XCHG(r12, r15); |
| HS_CMP_XCHG(r3, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r14); |
| HS_CMP_XCHG(r4, r9); |
| HS_CMP_XCHG(r8, r13); |
| HS_CMP_XCHG(r7, r9); |
| HS_CMP_XCHG(r11, r13); |
| HS_CMP_XCHG(r4, r6); |
| HS_CMP_XCHG(r8, r10); |
| HS_CMP_XCHG(r4, r5); |
| HS_CMP_XCHG(r6, r7); |
| HS_CMP_XCHG(r8, r9); |
| HS_CMP_XCHG(r10, r11); |
| HS_CMP_XCHG(r12, r13); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| { |
| HS_SLAB_FLIP_PREAMBLE(1); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(3); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(7); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(15); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| { |
| HS_SLAB_FLIP_PREAMBLE(31); |
| HS_CMP_FLIP(0, r1, r16); |
| HS_CMP_FLIP(1, r2, r15); |
| HS_CMP_FLIP(2, r3, r14); |
| HS_CMP_FLIP(3, r4, r13); |
| HS_CMP_FLIP(4, r5, r12); |
| HS_CMP_FLIP(5, r6, r11); |
| HS_CMP_FLIP(6, r7, r10); |
| HS_CMP_FLIP(7, r8, r9); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_BS_MERGE_H_PREAMBLE(16); |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_R(32); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_R(32) = r0_2; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(64); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_R(96); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(64) = r1_1; |
| HS_SLAB_LOCAL_R(96) = r1_2; |
| } |
| { |
| HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(128); |
| HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_R(160); |
| HS_CMP_XCHG(r2_1, r2_2); |
| HS_SLAB_LOCAL_L(128) = r2_1; |
| HS_SLAB_LOCAL_R(160) = r2_2; |
| } |
| { |
| HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(192); |
| HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_R(224); |
| HS_CMP_XCHG(r3_1, r3_2); |
| HS_SLAB_LOCAL_L(192) = r3_1; |
| HS_SLAB_LOCAL_R(224) = r3_2; |
| } |
| { |
| HS_KEY_TYPE r4_1 = HS_SLAB_LOCAL_L(256); |
| HS_KEY_TYPE r4_2 = HS_SLAB_LOCAL_R(288); |
| HS_CMP_XCHG(r4_1, r4_2); |
| HS_SLAB_LOCAL_L(256) = r4_1; |
| HS_SLAB_LOCAL_R(288) = r4_2; |
| } |
| { |
| HS_KEY_TYPE r5_1 = HS_SLAB_LOCAL_L(320); |
| HS_KEY_TYPE r5_2 = HS_SLAB_LOCAL_R(352); |
| HS_CMP_XCHG(r5_1, r5_2); |
| HS_SLAB_LOCAL_L(320) = r5_1; |
| HS_SLAB_LOCAL_R(352) = r5_2; |
| } |
| { |
| HS_KEY_TYPE r6_1 = HS_SLAB_LOCAL_L(384); |
| HS_KEY_TYPE r6_2 = HS_SLAB_LOCAL_R(416); |
| HS_CMP_XCHG(r6_1, r6_2); |
| HS_SLAB_LOCAL_L(384) = r6_1; |
| HS_SLAB_LOCAL_R(416) = r6_2; |
| } |
| { |
| HS_KEY_TYPE r7_1 = HS_SLAB_LOCAL_L(448); |
| HS_KEY_TYPE r7_2 = HS_SLAB_LOCAL_R(480); |
| HS_CMP_XCHG(r7_1, r7_2); |
| HS_SLAB_LOCAL_L(448) = r7_1; |
| HS_SLAB_LOCAL_R(480) = r7_2; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_R(64); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_R(96); |
| HS_CMP_XCHG(r0_2, r0_3); |
| HS_CMP_XCHG(r0_1, r0_4); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_R(64) = r0_3; |
| HS_SLAB_LOCAL_R(96) = r0_4; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(128); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(160); |
| HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_R(192); |
| HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_R(224); |
| HS_CMP_XCHG(r1_2, r1_3); |
| HS_CMP_XCHG(r1_1, r1_4); |
| HS_CMP_XCHG(r1_3, r1_4); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_SLAB_LOCAL_L(128) = r1_1; |
| HS_SLAB_LOCAL_L(160) = r1_2; |
| HS_SLAB_LOCAL_R(192) = r1_3; |
| HS_SLAB_LOCAL_R(224) = r1_4; |
| } |
| { |
| HS_KEY_TYPE r2_1 = HS_SLAB_LOCAL_L(256); |
| HS_KEY_TYPE r2_2 = HS_SLAB_LOCAL_L(288); |
| HS_KEY_TYPE r2_3 = HS_SLAB_LOCAL_R(320); |
| HS_KEY_TYPE r2_4 = HS_SLAB_LOCAL_R(352); |
| HS_CMP_XCHG(r2_2, r2_3); |
| HS_CMP_XCHG(r2_1, r2_4); |
| HS_CMP_XCHG(r2_3, r2_4); |
| HS_CMP_XCHG(r2_1, r2_2); |
| HS_SLAB_LOCAL_L(256) = r2_1; |
| HS_SLAB_LOCAL_L(288) = r2_2; |
| HS_SLAB_LOCAL_R(320) = r2_3; |
| HS_SLAB_LOCAL_R(352) = r2_4; |
| } |
| { |
| HS_KEY_TYPE r3_1 = HS_SLAB_LOCAL_L(384); |
| HS_KEY_TYPE r3_2 = HS_SLAB_LOCAL_L(416); |
| HS_KEY_TYPE r3_3 = HS_SLAB_LOCAL_R(448); |
| HS_KEY_TYPE r3_4 = HS_SLAB_LOCAL_R(480); |
| HS_CMP_XCHG(r3_2, r3_3); |
| HS_CMP_XCHG(r3_1, r3_4); |
| HS_CMP_XCHG(r3_3, r3_4); |
| HS_CMP_XCHG(r3_1, r3_2); |
| HS_SLAB_LOCAL_L(384) = r3_1; |
| HS_SLAB_LOCAL_L(416) = r3_2; |
| HS_SLAB_LOCAL_R(448) = r3_3; |
| HS_SLAB_LOCAL_R(480) = r3_4; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96); |
| HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_R(128); |
| HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_R(160); |
| HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_R(192); |
| HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_R(224); |
| HS_CMP_XCHG(r0_4, r0_5); |
| HS_CMP_XCHG(r0_3, r0_6); |
| HS_CMP_XCHG(r0_2, r0_7); |
| HS_CMP_XCHG(r0_1, r0_8); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_L(64) = r0_3; |
| HS_SLAB_LOCAL_L(96) = r0_4; |
| HS_SLAB_LOCAL_R(128) = r0_5; |
| HS_SLAB_LOCAL_R(160) = r0_6; |
| HS_SLAB_LOCAL_R(192) = r0_7; |
| HS_SLAB_LOCAL_R(224) = r0_8; |
| } |
| { |
| HS_KEY_TYPE r1_1 = HS_SLAB_LOCAL_L(256); |
| HS_KEY_TYPE r1_2 = HS_SLAB_LOCAL_L(288); |
| HS_KEY_TYPE r1_3 = HS_SLAB_LOCAL_L(320); |
| HS_KEY_TYPE r1_4 = HS_SLAB_LOCAL_L(352); |
| HS_KEY_TYPE r1_5 = HS_SLAB_LOCAL_R(384); |
| HS_KEY_TYPE r1_6 = HS_SLAB_LOCAL_R(416); |
| HS_KEY_TYPE r1_7 = HS_SLAB_LOCAL_R(448); |
| HS_KEY_TYPE r1_8 = HS_SLAB_LOCAL_R(480); |
| HS_CMP_XCHG(r1_4, r1_5); |
| HS_CMP_XCHG(r1_3, r1_6); |
| HS_CMP_XCHG(r1_2, r1_7); |
| HS_CMP_XCHG(r1_1, r1_8); |
| HS_CMP_XCHG(r1_5, r1_7); |
| HS_CMP_XCHG(r1_6, r1_8); |
| HS_CMP_XCHG(r1_5, r1_6); |
| HS_CMP_XCHG(r1_7, r1_8); |
| HS_CMP_XCHG(r1_1, r1_3); |
| HS_CMP_XCHG(r1_2, r1_4); |
| HS_CMP_XCHG(r1_1, r1_2); |
| HS_CMP_XCHG(r1_3, r1_4); |
| HS_SLAB_LOCAL_L(256) = r1_1; |
| HS_SLAB_LOCAL_L(288) = r1_2; |
| HS_SLAB_LOCAL_L(320) = r1_3; |
| HS_SLAB_LOCAL_L(352) = r1_4; |
| HS_SLAB_LOCAL_R(384) = r1_5; |
| HS_SLAB_LOCAL_R(416) = r1_6; |
| HS_SLAB_LOCAL_R(448) = r1_7; |
| HS_SLAB_LOCAL_R(480) = r1_8; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0) = r1; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1) = r16; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2) = r2; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3) = r15; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4) = r3; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5) = r14; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6) = r4; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7) = r13; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8) = r5; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9) = r12; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10) = r6; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11) = r11; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12) = r7; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13) = r10; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14) = r8; |
| HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15) = r9; |
| HS_BLOCK_BARRIER(); |
| { |
| { |
| HS_KEY_TYPE r0_1 = HS_SLAB_LOCAL_L(0); |
| HS_KEY_TYPE r0_2 = HS_SLAB_LOCAL_L(32); |
| HS_KEY_TYPE r0_3 = HS_SLAB_LOCAL_L(64); |
| HS_KEY_TYPE r0_4 = HS_SLAB_LOCAL_L(96); |
| HS_KEY_TYPE r0_5 = HS_SLAB_LOCAL_L(128); |
| HS_KEY_TYPE r0_6 = HS_SLAB_LOCAL_L(160); |
| HS_KEY_TYPE r0_7 = HS_SLAB_LOCAL_L(192); |
| HS_KEY_TYPE r0_8 = HS_SLAB_LOCAL_L(224); |
| HS_KEY_TYPE r0_9 = HS_SLAB_LOCAL_R(256); |
| HS_KEY_TYPE r0_10 = HS_SLAB_LOCAL_R(288); |
| HS_KEY_TYPE r0_11 = HS_SLAB_LOCAL_R(320); |
| HS_KEY_TYPE r0_12 = HS_SLAB_LOCAL_R(352); |
| HS_KEY_TYPE r0_13 = HS_SLAB_LOCAL_R(384); |
| HS_KEY_TYPE r0_14 = HS_SLAB_LOCAL_R(416); |
| HS_KEY_TYPE r0_15 = HS_SLAB_LOCAL_R(448); |
| HS_KEY_TYPE r0_16 = HS_SLAB_LOCAL_R(480); |
| HS_CMP_XCHG(r0_8, r0_9); |
| HS_CMP_XCHG(r0_7, r0_10); |
| HS_CMP_XCHG(r0_6, r0_11); |
| HS_CMP_XCHG(r0_5, r0_12); |
| HS_CMP_XCHG(r0_4, r0_13); |
| HS_CMP_XCHG(r0_3, r0_14); |
| HS_CMP_XCHG(r0_2, r0_15); |
| HS_CMP_XCHG(r0_1, r0_16); |
| HS_CMP_XCHG(r0_9, r0_13); |
| HS_CMP_XCHG(r0_11, r0_15); |
| HS_CMP_XCHG(r0_9, r0_11); |
| HS_CMP_XCHG(r0_13, r0_15); |
| HS_CMP_XCHG(r0_10, r0_14); |
| HS_CMP_XCHG(r0_12, r0_16); |
| HS_CMP_XCHG(r0_10, r0_12); |
| HS_CMP_XCHG(r0_14, r0_16); |
| HS_CMP_XCHG(r0_9, r0_10); |
| HS_CMP_XCHG(r0_11, r0_12); |
| HS_CMP_XCHG(r0_13, r0_14); |
| HS_CMP_XCHG(r0_15, r0_16); |
| HS_CMP_XCHG(r0_1, r0_5); |
| HS_CMP_XCHG(r0_3, r0_7); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_2, r0_6); |
| HS_CMP_XCHG(r0_4, r0_8); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_L(64) = r0_3; |
| HS_SLAB_LOCAL_L(96) = r0_4; |
| HS_SLAB_LOCAL_L(128) = r0_5; |
| HS_SLAB_LOCAL_L(160) = r0_6; |
| HS_SLAB_LOCAL_L(192) = r0_7; |
| HS_SLAB_LOCAL_L(224) = r0_8; |
| HS_SLAB_LOCAL_R(256) = r0_9; |
| HS_SLAB_LOCAL_R(288) = r0_10; |
| HS_SLAB_LOCAL_R(320) = r0_11; |
| HS_SLAB_LOCAL_R(352) = r0_12; |
| HS_SLAB_LOCAL_R(384) = r0_13; |
| HS_SLAB_LOCAL_R(416) = r0_14; |
| HS_SLAB_LOCAL_R(448) = r0_15; |
| HS_SLAB_LOCAL_R(480) = r0_16; |
| } |
| } |
| HS_BLOCK_BARRIER(); |
| r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); |
| r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); |
| r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); |
| r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); |
| r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); |
| r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); |
| r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); |
| r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); |
| r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); |
| r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); |
| r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); |
| r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); |
| r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); |
| r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); |
| r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); |
| r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_BC_KERNEL_PROTO(1, 0) /* (1, 0) variant: loads sixteen keys r1..r16 from vout, runs five HS_CMP_HALF passes (preamble arguments 16, 8, 4, 2, 1) and a 32-step HS_CMP_XCHG network over them, then stores all sixteen. It declares no block-local memory and issues no barrier. NOTE(review): every HS_* macro expands elsewhere (../hs_cuda_macros.h is included); confirm their semantics there. */ |
| { |
| HS_SLAB_GLOBAL_PREAMBLE(); /* presumably sets up the indexing state that HS_SLAB_GLOBAL_LOAD / HS_SLAB_GLOBAL_STORE rely on -- TODO confirm against the macro definition */ |
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); /* register rN is loaded with index argument N-1 */ |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); /* pass 1 of 5: HS_CMP_HALF is applied to all sixteen registers; the argument halves each pass (16, 8, 4, 2, 1) -- presumably the cross-lane partner distance, TODO confirm */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); /* pass 2 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); /* pass 3 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); /* pass 4 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); /* pass 5 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* register-to-register network, odd-numbered registers first: pairs 8 apart and 4 apart (r1/r5/r9/r13, then r3/r7/r11/r15), followed by pairs 2 apart */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); /* the same 8 / 4 / 2 pattern over the even-numbered registers */ |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); /* last step: adjacent pairs (r1,r2) .. (r15,r16) */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); /* register rN is stored with index argument N-1, mirroring the loads above */ |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_BC_KERNEL_PROTO(2, 1) /* (2, 1) variant: first stages eight 2-key groups through block-local storage (load pair, compare-exchange, write), then after HS_BLOCK_BARRIER reads sixteen keys r1..r16 back and runs the same five HS_CMP_HALF passes and 32-step HS_CMP_XCHG network as the (1, 0) variant before storing. NOTE(review): every HS_* macro expands elsewhere (../hs_cuda_macros.h is included); confirm their semantics there. */ |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(64, 16); /* NOTE(review): presumably reserves the block-local scratch addressed by HS_SLAB_LOCAL_L / HS_BX_LOCAL_V; meaning of the (64, 16) arguments is not visible here */ |
| |
| HS_SLAB_GLOBAL_PREAMBLE(); |
| HS_BC_MERGE_H_PREAMBLE(2); /* presumably sets up state for HS_BC_GLOBAL_LOAD_L -- TODO confirm */ |
| { |
| { /* staging group k = 0: loads indices k and k + 16, compare-exchanges the pair, writes to local offsets 64 * k and 64 * k + 32 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| } |
| { /* k = 2 -> local offsets 128, 160 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(2); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(18); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(128) = r0_1; |
| HS_SLAB_LOCAL_L(160) = r0_2; |
| } |
| { /* k = 4 -> local offsets 256, 288 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(256) = r0_1; |
| HS_SLAB_LOCAL_L(288) = r0_2; |
| } |
| { /* k = 6 -> local offsets 384, 416 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(6); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(22); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(384) = r0_1; |
| HS_SLAB_LOCAL_L(416) = r0_2; |
| } |
| { /* k = 8 -> local offsets 512, 544 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(512) = r0_1; |
| HS_SLAB_LOCAL_L(544) = r0_2; |
| } |
| { /* k = 10 -> local offsets 640, 672 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(10); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(26); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(640) = r0_1; |
| HS_SLAB_LOCAL_L(672) = r0_2; |
| } |
| { /* k = 12 -> local offsets 768, 800 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(768) = r0_1; |
| HS_SLAB_LOCAL_L(800) = r0_2; |
| } |
| { /* k = 14 -> local offsets 896, 928 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(14); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(30); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_SLAB_LOCAL_L(896) = r0_1; |
| HS_SLAB_LOCAL_L(928) = r0_2; |
| } |
| } |
| HS_BLOCK_BARRIER(); /* sits between the HS_SLAB_LOCAL_L writes above and the HS_BX_LOCAL_V reads below; presumably a block-wide barrier -- TODO confirm */ |
| HS_KEY_TYPE r1 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 0); /* register rN is read with offset argument 2 * HS_SLAB_THREADS * (N-1) */ |
| HS_KEY_TYPE r2 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 1); |
| HS_KEY_TYPE r3 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 2); |
| HS_KEY_TYPE r4 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 3); |
| HS_KEY_TYPE r5 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 4); |
| HS_KEY_TYPE r6 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 5); |
| HS_KEY_TYPE r7 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 6); |
| HS_KEY_TYPE r8 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 7); |
| HS_KEY_TYPE r9 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 8); |
| HS_KEY_TYPE r10 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 9); |
| HS_KEY_TYPE r11 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 10); |
| HS_KEY_TYPE r12 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 11); |
| HS_KEY_TYPE r13 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 12); |
| HS_KEY_TYPE r14 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 13); |
| HS_KEY_TYPE r15 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 14); |
| HS_KEY_TYPE r16 = HS_BX_LOCAL_V(2 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); /* pass 1 of 5: HS_CMP_HALF is applied to all sixteen registers; the argument halves each pass (16, 8, 4, 2, 1) -- presumably the cross-lane partner distance, TODO confirm */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); /* pass 2 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); /* pass 3 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); /* pass 4 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); /* pass 5 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* register-to-register network, odd-numbered registers first: pairs 8 apart and 4 apart (r1/r5/r9/r13, then r3/r7/r11/r15), followed by pairs 2 apart */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); /* the same 8 / 4 / 2 pattern over the even-numbered registers */ |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); /* last step: adjacent pairs (r1,r2) .. (r15,r16) */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); /* register rN is stored with index argument N-1 */ |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_BC_KERNEL_PROTO(4, 2) /* (4, 2) variant: first stages four 4-key groups through block-local storage (load four keys, 4-input compare-exchange network, write), then after HS_BLOCK_BARRIER reads sixteen keys r1..r16 back and runs five HS_CMP_HALF passes and a 32-step HS_CMP_XCHG network before storing. NOTE(review): every HS_* macro expands elsewhere (../hs_cuda_macros.h is included); confirm their semantics there. */ |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(128, 16); /* NOTE(review): presumably reserves the block-local scratch addressed by HS_SLAB_LOCAL_L / HS_BX_LOCAL_V; meaning of the (128, 16) arguments is not visible here */ |
| |
| HS_SLAB_GLOBAL_PREAMBLE(); |
| HS_BC_MERGE_H_PREAMBLE(4); /* presumably sets up state for HS_BC_GLOBAL_LOAD_L -- TODO confirm */ |
| { |
| { /* staging group k = 0: loads indices k, k + 16, k + 32, k + 48 and writes to local offsets 128 * k + {0, 32, 64, 96} */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48); |
| HS_CMP_XCHG(r0_1, r0_3); /* 4-input network: pairs 2 apart, then adjacent pairs */ |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_L(64) = r0_3; |
| HS_SLAB_LOCAL_L(96) = r0_4; |
| } |
| { /* k = 4 -> local offsets 512 .. 608 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(20); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(36); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(52); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(512) = r0_1; |
| HS_SLAB_LOCAL_L(544) = r0_2; |
| HS_SLAB_LOCAL_L(576) = r0_3; |
| HS_SLAB_LOCAL_L(608) = r0_4; |
| } |
| { /* k = 8 -> local offsets 1024 .. 1120 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(1024) = r0_1; |
| HS_SLAB_LOCAL_L(1056) = r0_2; |
| HS_SLAB_LOCAL_L(1088) = r0_3; |
| HS_SLAB_LOCAL_L(1120) = r0_4; |
| } |
| { /* k = 12 -> local offsets 1536 .. 1632 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(12); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(28); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(44); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(60); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_SLAB_LOCAL_L(1536) = r0_1; |
| HS_SLAB_LOCAL_L(1568) = r0_2; |
| HS_SLAB_LOCAL_L(1600) = r0_3; |
| HS_SLAB_LOCAL_L(1632) = r0_4; |
| } |
| } |
| HS_BLOCK_BARRIER(); /* sits between the HS_SLAB_LOCAL_L writes above and the HS_BX_LOCAL_V reads below; presumably a block-wide barrier -- TODO confirm */ |
| HS_KEY_TYPE r1 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 0); /* register rN is read with offset argument 4 * HS_SLAB_THREADS * (N-1) */ |
| HS_KEY_TYPE r2 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 1); |
| HS_KEY_TYPE r3 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 2); |
| HS_KEY_TYPE r4 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 3); |
| HS_KEY_TYPE r5 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 4); |
| HS_KEY_TYPE r6 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 5); |
| HS_KEY_TYPE r7 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 6); |
| HS_KEY_TYPE r8 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 7); |
| HS_KEY_TYPE r9 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 8); |
| HS_KEY_TYPE r10 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 9); |
| HS_KEY_TYPE r11 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 10); |
| HS_KEY_TYPE r12 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 11); |
| HS_KEY_TYPE r13 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 12); |
| HS_KEY_TYPE r14 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 13); |
| HS_KEY_TYPE r15 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 14); |
| HS_KEY_TYPE r16 = HS_BX_LOCAL_V(4 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); /* pass 1 of 5: HS_CMP_HALF is applied to all sixteen registers; the argument halves each pass (16, 8, 4, 2, 1) -- presumably the cross-lane partner distance, TODO confirm */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); /* pass 2 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); /* pass 3 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); /* pass 4 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); /* pass 5 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* register-to-register network, odd-numbered registers first: pairs 8 apart and 4 apart (r1/r5/r9/r13, then r3/r7/r11/r15), followed by pairs 2 apart */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); /* the same 8 / 4 / 2 pattern over the even-numbered registers */ |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); /* last step: adjacent pairs (r1,r2) .. (r15,r16) */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); /* register rN is stored with index argument N-1 */ |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_BC_KERNEL_PROTO(8, 3) /* (8, 3) variant: first stages two 8-key groups through block-local storage (load eight keys, 8-input compare-exchange network, write), then after HS_BLOCK_BARRIER reads sixteen keys r1..r16 back and runs five HS_CMP_HALF passes and a 32-step HS_CMP_XCHG network before storing. NOTE(review): every HS_* macro expands elsewhere (../hs_cuda_macros.h is included); confirm their semantics there. */ |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(256, 16); /* NOTE(review): presumably reserves the block-local scratch addressed by HS_SLAB_LOCAL_L / HS_BX_LOCAL_V; meaning of the (256, 16) arguments is not visible here */ |
| |
| HS_SLAB_GLOBAL_PREAMBLE(); |
| HS_BC_MERGE_H_PREAMBLE(8); /* presumably sets up state for HS_BC_GLOBAL_LOAD_L -- TODO confirm */ |
| { |
| { /* staging group k = 0: loads indices k + 16 * j for j = 0..7 and writes to local offsets 256 * k + 32 * j */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48); |
| HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64); |
| HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80); |
| HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96); |
| HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112); |
| HS_CMP_XCHG(r0_1, r0_5); /* 8-input network: odd-numbered keys (pairs 4 apart, then 2 apart), the same for even-numbered keys, then adjacent pairs */ |
| HS_CMP_XCHG(r0_3, r0_7); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_2, r0_6); |
| HS_CMP_XCHG(r0_4, r0_8); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_L(64) = r0_3; |
| HS_SLAB_LOCAL_L(96) = r0_4; |
| HS_SLAB_LOCAL_L(128) = r0_5; |
| HS_SLAB_LOCAL_L(160) = r0_6; |
| HS_SLAB_LOCAL_L(192) = r0_7; |
| HS_SLAB_LOCAL_L(224) = r0_8; |
| } |
| { /* k = 8 -> local offsets 2048 .. 2272 */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(8); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(24); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(40); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(56); |
| HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(72); |
| HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(88); |
| HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(104); |
| HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(120); |
| HS_CMP_XCHG(r0_1, r0_5); |
| HS_CMP_XCHG(r0_3, r0_7); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_2, r0_6); |
| HS_CMP_XCHG(r0_4, r0_8); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_SLAB_LOCAL_L(2048) = r0_1; |
| HS_SLAB_LOCAL_L(2080) = r0_2; |
| HS_SLAB_LOCAL_L(2112) = r0_3; |
| HS_SLAB_LOCAL_L(2144) = r0_4; |
| HS_SLAB_LOCAL_L(2176) = r0_5; |
| HS_SLAB_LOCAL_L(2208) = r0_6; |
| HS_SLAB_LOCAL_L(2240) = r0_7; |
| HS_SLAB_LOCAL_L(2272) = r0_8; |
| } |
| } |
| HS_BLOCK_BARRIER(); /* sits between the HS_SLAB_LOCAL_L writes above and the HS_BX_LOCAL_V reads below; presumably a block-wide barrier -- TODO confirm */ |
| HS_KEY_TYPE r1 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 0); /* register rN is read with offset argument 8 * HS_SLAB_THREADS * (N-1) */ |
| HS_KEY_TYPE r2 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 1); |
| HS_KEY_TYPE r3 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 2); |
| HS_KEY_TYPE r4 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 3); |
| HS_KEY_TYPE r5 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 4); |
| HS_KEY_TYPE r6 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 5); |
| HS_KEY_TYPE r7 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 6); |
| HS_KEY_TYPE r8 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 7); |
| HS_KEY_TYPE r9 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 8); |
| HS_KEY_TYPE r10 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 9); |
| HS_KEY_TYPE r11 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 10); |
| HS_KEY_TYPE r12 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 11); |
| HS_KEY_TYPE r13 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 12); |
| HS_KEY_TYPE r14 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 13); |
| HS_KEY_TYPE r15 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 14); |
| HS_KEY_TYPE r16 = HS_BX_LOCAL_V(8 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); /* pass 1 of 5: HS_CMP_HALF is applied to all sixteen registers; the argument halves each pass (16, 8, 4, 2, 1) -- presumably the cross-lane partner distance, TODO confirm */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); /* pass 2 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); /* pass 3 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); /* pass 4 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); /* pass 5 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* register-to-register network, odd-numbered registers first: pairs 8 apart and 4 apart (r1/r5/r9/r13, then r3/r7/r11/r15), followed by pairs 2 apart */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); /* the same 8 / 4 / 2 pattern over the even-numbered registers */ |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); /* last step: adjacent pairs (r1,r2) .. (r15,r16) */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); /* register rN is stored with index argument N-1 */ |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_BC_KERNEL_PROTO(16, 4) /* (16, 4) variant: first stages a single 16-key group through block-local storage (load sixteen keys, 16-input compare-exchange network, write), then after HS_BLOCK_BARRIER reads sixteen keys r1..r16 back and runs five HS_CMP_HALF passes and a second 32-step HS_CMP_XCHG network before storing. NOTE(review): every HS_* macro expands elsewhere (../hs_cuda_macros.h is included); confirm their semantics there. */ |
| { |
| HS_BLOCK_LOCAL_MEM_DECL(512, 16); /* NOTE(review): presumably reserves the block-local scratch addressed by HS_SLAB_LOCAL_L / HS_BX_LOCAL_V; meaning of the (512, 16) arguments is not visible here */ |
| |
| HS_SLAB_GLOBAL_PREAMBLE(); |
| HS_BC_MERGE_H_PREAMBLE(16); /* presumably sets up state for HS_BC_GLOBAL_LOAD_L -- TODO confirm */ |
| { |
| { /* the only staging group: loads indices 16 * j for j = 0..15 and writes to local offsets 32 * j */ |
| HS_KEY_TYPE r0_1 = HS_BC_GLOBAL_LOAD_L(0); |
| HS_KEY_TYPE r0_2 = HS_BC_GLOBAL_LOAD_L(16); |
| HS_KEY_TYPE r0_3 = HS_BC_GLOBAL_LOAD_L(32); |
| HS_KEY_TYPE r0_4 = HS_BC_GLOBAL_LOAD_L(48); |
| HS_KEY_TYPE r0_5 = HS_BC_GLOBAL_LOAD_L(64); |
| HS_KEY_TYPE r0_6 = HS_BC_GLOBAL_LOAD_L(80); |
| HS_KEY_TYPE r0_7 = HS_BC_GLOBAL_LOAD_L(96); |
| HS_KEY_TYPE r0_8 = HS_BC_GLOBAL_LOAD_L(112); |
| HS_KEY_TYPE r0_9 = HS_BC_GLOBAL_LOAD_L(128); |
| HS_KEY_TYPE r0_10 = HS_BC_GLOBAL_LOAD_L(144); |
| HS_KEY_TYPE r0_11 = HS_BC_GLOBAL_LOAD_L(160); |
| HS_KEY_TYPE r0_12 = HS_BC_GLOBAL_LOAD_L(176); |
| HS_KEY_TYPE r0_13 = HS_BC_GLOBAL_LOAD_L(192); |
| HS_KEY_TYPE r0_14 = HS_BC_GLOBAL_LOAD_L(208); |
| HS_KEY_TYPE r0_15 = HS_BC_GLOBAL_LOAD_L(224); |
| HS_KEY_TYPE r0_16 = HS_BC_GLOBAL_LOAD_L(240); |
| HS_CMP_XCHG(r0_1, r0_9); /* 16-input network in the same pair order as the r1..r16 network further down: odd-numbered keys (8 / 4 / 2 apart), even-numbered keys, then adjacent pairs */ |
| HS_CMP_XCHG(r0_5, r0_13); |
| HS_CMP_XCHG(r0_1, r0_5); |
| HS_CMP_XCHG(r0_9, r0_13); |
| HS_CMP_XCHG(r0_3, r0_11); |
| HS_CMP_XCHG(r0_7, r0_15); |
| HS_CMP_XCHG(r0_3, r0_7); |
| HS_CMP_XCHG(r0_11, r0_15); |
| HS_CMP_XCHG(r0_1, r0_3); |
| HS_CMP_XCHG(r0_5, r0_7); |
| HS_CMP_XCHG(r0_9, r0_11); |
| HS_CMP_XCHG(r0_13, r0_15); |
| HS_CMP_XCHG(r0_2, r0_10); |
| HS_CMP_XCHG(r0_6, r0_14); |
| HS_CMP_XCHG(r0_2, r0_6); |
| HS_CMP_XCHG(r0_10, r0_14); |
| HS_CMP_XCHG(r0_4, r0_12); |
| HS_CMP_XCHG(r0_8, r0_16); |
| HS_CMP_XCHG(r0_4, r0_8); |
| HS_CMP_XCHG(r0_12, r0_16); |
| HS_CMP_XCHG(r0_2, r0_4); |
| HS_CMP_XCHG(r0_6, r0_8); |
| HS_CMP_XCHG(r0_10, r0_12); |
| HS_CMP_XCHG(r0_14, r0_16); |
| HS_CMP_XCHG(r0_1, r0_2); |
| HS_CMP_XCHG(r0_3, r0_4); |
| HS_CMP_XCHG(r0_5, r0_6); |
| HS_CMP_XCHG(r0_7, r0_8); |
| HS_CMP_XCHG(r0_9, r0_10); |
| HS_CMP_XCHG(r0_11, r0_12); |
| HS_CMP_XCHG(r0_13, r0_14); |
| HS_CMP_XCHG(r0_15, r0_16); |
| HS_SLAB_LOCAL_L(0) = r0_1; |
| HS_SLAB_LOCAL_L(32) = r0_2; |
| HS_SLAB_LOCAL_L(64) = r0_3; |
| HS_SLAB_LOCAL_L(96) = r0_4; |
| HS_SLAB_LOCAL_L(128) = r0_5; |
| HS_SLAB_LOCAL_L(160) = r0_6; |
| HS_SLAB_LOCAL_L(192) = r0_7; |
| HS_SLAB_LOCAL_L(224) = r0_8; |
| HS_SLAB_LOCAL_L(256) = r0_9; |
| HS_SLAB_LOCAL_L(288) = r0_10; |
| HS_SLAB_LOCAL_L(320) = r0_11; |
| HS_SLAB_LOCAL_L(352) = r0_12; |
| HS_SLAB_LOCAL_L(384) = r0_13; |
| HS_SLAB_LOCAL_L(416) = r0_14; |
| HS_SLAB_LOCAL_L(448) = r0_15; |
| HS_SLAB_LOCAL_L(480) = r0_16; |
| } |
| } |
| HS_BLOCK_BARRIER(); /* sits between the HS_SLAB_LOCAL_L writes above and the HS_BX_LOCAL_V reads below; presumably a block-wide barrier -- TODO confirm */ |
| HS_KEY_TYPE r1 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 0); /* register rN is read with offset argument 16 * HS_SLAB_THREADS * (N-1) */ |
| HS_KEY_TYPE r2 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 1); |
| HS_KEY_TYPE r3 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 2); |
| HS_KEY_TYPE r4 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 3); |
| HS_KEY_TYPE r5 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 4); |
| HS_KEY_TYPE r6 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 5); |
| HS_KEY_TYPE r7 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 6); |
| HS_KEY_TYPE r8 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 7); |
| HS_KEY_TYPE r9 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 8); |
| HS_KEY_TYPE r10 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 9); |
| HS_KEY_TYPE r11 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 10); |
| HS_KEY_TYPE r12 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 11); |
| HS_KEY_TYPE r13 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 12); |
| HS_KEY_TYPE r14 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 13); |
| HS_KEY_TYPE r15 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 14); |
| HS_KEY_TYPE r16 = HS_BX_LOCAL_V(16 * HS_SLAB_THREADS * 15); |
| { |
| { |
| HS_SLAB_HALF_PREAMBLE(16); /* pass 1 of 5: HS_CMP_HALF is applied to all sixteen registers; the argument halves each pass (16, 8, 4, 2, 1) -- presumably the cross-lane partner distance, TODO confirm */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(8); /* pass 2 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(4); /* pass 3 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(2); /* pass 4 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| { |
| HS_SLAB_HALF_PREAMBLE(1); /* pass 5 of 5 */ |
| HS_CMP_HALF(0, r1); |
| HS_CMP_HALF(1, r2); |
| HS_CMP_HALF(2, r3); |
| HS_CMP_HALF(3, r4); |
| HS_CMP_HALF(4, r5); |
| HS_CMP_HALF(5, r6); |
| HS_CMP_HALF(6, r7); |
| HS_CMP_HALF(7, r8); |
| HS_CMP_HALF(8, r9); |
| HS_CMP_HALF(9, r10); |
| HS_CMP_HALF(10, r11); |
| HS_CMP_HALF(11, r12); |
| HS_CMP_HALF(12, r13); |
| HS_CMP_HALF(13, r14); |
| HS_CMP_HALF(14, r15); |
| HS_CMP_HALF(15, r16); |
| } |
| HS_CMP_XCHG(r1, r9); /* register-to-register network, odd-numbered registers first: pairs 8 apart and 4 apart (r1/r5/r9/r13, then r3/r7/r11/r15), followed by pairs 2 apart */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); /* the same 8 / 4 / 2 pattern over the even-numbered registers */ |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); /* last step: adjacent pairs (r1,r2) .. (r15,r16) */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| } |
| HS_SLAB_GLOBAL_STORE(0, r1); /* register rN is stored with index argument N-1 */ |
| HS_SLAB_GLOBAL_STORE(1, r2); |
| HS_SLAB_GLOBAL_STORE(2, r3); |
| HS_SLAB_GLOBAL_STORE(3, r4); |
| HS_SLAB_GLOBAL_STORE(4, r5); |
| HS_SLAB_GLOBAL_STORE(5, r6); |
| HS_SLAB_GLOBAL_STORE(6, r7); |
| HS_SLAB_GLOBAL_STORE(7, r8); |
| HS_SLAB_GLOBAL_STORE(8, r9); |
| HS_SLAB_GLOBAL_STORE(9, r10); |
| HS_SLAB_GLOBAL_STORE(10, r11); |
| HS_SLAB_GLOBAL_STORE(11, r12); |
| HS_SLAB_GLOBAL_STORE(12, r13); |
| HS_SLAB_GLOBAL_STORE(13, r14); |
| HS_SLAB_GLOBAL_STORE(14, r15); |
| HS_SLAB_GLOBAL_STORE(15, r16); |
| } |
| |
| HS_OFFSET_FM_KERNEL_PROTO(0, 0) /* Variant (0, 0): loads 8 L-side keys and 1 R-side key, applies a fixed compare-exchange sequence, then stores all 9 back. */ |
| { /* NOTE(review): every HS_* macro is defined in headers outside this chunk; the comments below describe only the register pairing visible here. Presumably HS_CMP_XCHG conditionally swaps its two arguments and "FM" stands for a flip-merge step -- confirm. */ |
| HS_OFFSET_FM_PREAMBLE(8); /* opaque setup macro; 8 equals the number of L-side loads below -- TODO confirm that this is its meaning */ |
| HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); /* r1..r8 <- L-side slots 0..7 */ |
| HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); |
| HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); |
| HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); |
| HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); |
| HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); |
| HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); |
| HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); /* r9 <- R-side slot 0 */ |
| HS_CMP_XCHG(r8, r9); /* the only pairing that crosses from the L-side registers to the R-side register */ |
| HS_CMP_XCHG(r1, r5); /* from here on only r1..r8 are paired: odd registers at stride 4 ... */ |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r1, r3); /* ... then odd registers at stride 2 */ |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r2, r6); /* even registers at stride 4 ... */ |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r2, r4); /* ... then even registers at stride 2 */ |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r1, r2); /* last pass: adjacent pairs (stride 1) */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_XM_GLOBAL_STORE_L(0, r1); /* stores mirror the loads: each register goes back to the slot index it was loaded from */ |
| HS_XM_GLOBAL_STORE_L(1, r2); |
| HS_XM_GLOBAL_STORE_L(2, r3); |
| HS_XM_GLOBAL_STORE_L(3, r4); |
| HS_XM_GLOBAL_STORE_L(4, r5); |
| HS_XM_GLOBAL_STORE_L(5, r6); |
| HS_XM_GLOBAL_STORE_L(6, r7); |
| HS_XM_GLOBAL_STORE_L(7, r8); |
| HS_FM_GLOBAL_STORE_R(0, r9); /* r9 appears in no compare-exchange other than the first one */ |
| } |
| |
| HS_OFFSET_FM_KERNEL_PROTO(0, 1) /* Variant (0, 1): loads 8 L-side keys and 2 R-side keys, applies a fixed compare-exchange sequence, then stores all 10 back. */ |
| { /* NOTE(review): every HS_* macro is defined in headers outside this chunk; the comments below describe only the register pairing visible here. Presumably HS_CMP_XCHG conditionally swaps its two arguments and "FM" stands for a flip-merge step -- confirm. */ |
| HS_OFFSET_FM_PREAMBLE(8); /* opaque setup macro; 8 equals the number of L-side loads below -- TODO confirm that this is its meaning */ |
| HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); /* r1..r8 <- L-side slots 0..7 */ |
| HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); |
| HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); |
| HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); |
| HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); |
| HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); |
| HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); |
| HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); /* r9..r10 <- R-side slots 0..1 */ |
| HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); |
| HS_CMP_XCHG(r8, r9); /* cross pairings: L-side registers taken downwards from r8 against R-side registers taken upwards from r9 */ |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r1, r5); /* r1..r8 only: odd registers at stride 4 ... */ |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r1, r3); /* ... then odd registers at stride 2 */ |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r2, r6); /* even registers at stride 4 ... */ |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r2, r4); /* ... then even registers at stride 2 */ |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r1, r2); /* adjacent pairs (stride 1) within r1..r8 */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); /* the two R-side registers are paired with each other once */ |
| HS_XM_GLOBAL_STORE_L(0, r1); /* stores mirror the loads: each register goes back to the slot index it was loaded from */ |
| HS_XM_GLOBAL_STORE_L(1, r2); |
| HS_XM_GLOBAL_STORE_L(2, r3); |
| HS_XM_GLOBAL_STORE_L(3, r4); |
| HS_XM_GLOBAL_STORE_L(4, r5); |
| HS_XM_GLOBAL_STORE_L(5, r6); |
| HS_XM_GLOBAL_STORE_L(6, r7); |
| HS_XM_GLOBAL_STORE_L(7, r8); |
| HS_FM_GLOBAL_STORE_R(0, r9); |
| HS_FM_GLOBAL_STORE_R(1, r10); |
| } |
| |
| HS_OFFSET_FM_KERNEL_PROTO(0, 2) /* Variant (0, 2): loads 8 L-side keys and 4 R-side keys, applies a fixed compare-exchange sequence, then stores all 12 back. */ |
| { /* NOTE(review): every HS_* macro is defined in headers outside this chunk; the comments below describe only the register pairing visible here. Presumably HS_CMP_XCHG conditionally swaps its two arguments and "FM" stands for a flip-merge step -- confirm. */ |
| HS_OFFSET_FM_PREAMBLE(8); /* opaque setup macro; 8 equals the number of L-side loads below -- TODO confirm that this is its meaning */ |
| HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); /* r1..r8 <- L-side slots 0..7 */ |
| HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); |
| HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); |
| HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); |
| HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); |
| HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); |
| HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); |
| HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); /* r9..r12 <- R-side slots 0..3 */ |
| HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); |
| HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2); |
| HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3); |
| HS_CMP_XCHG(r8, r9); /* cross pairings: L-side registers taken downwards from r8 against R-side registers taken upwards from r9 */ |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r5, r12); |
| HS_CMP_XCHG(r1, r5); /* r1..r8 only: odd registers at stride 4 ... */ |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r1, r3); /* ... then odd registers at stride 2 */ |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r2, r6); /* even registers at stride 4 ... */ |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r2, r4); /* ... then even registers at stride 2 */ |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r1, r2); /* adjacent pairs (stride 1) within r1..r8 */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r11); /* r9..r12 only: stride 2 ... */ |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r9, r10); /* ... then adjacent pairs (stride 1) */ |
| HS_CMP_XCHG(r11, r12); |
| HS_XM_GLOBAL_STORE_L(0, r1); /* stores mirror the loads: each register goes back to the slot index it was loaded from */ |
| HS_XM_GLOBAL_STORE_L(1, r2); |
| HS_XM_GLOBAL_STORE_L(2, r3); |
| HS_XM_GLOBAL_STORE_L(3, r4); |
| HS_XM_GLOBAL_STORE_L(4, r5); |
| HS_XM_GLOBAL_STORE_L(5, r6); |
| HS_XM_GLOBAL_STORE_L(6, r7); |
| HS_XM_GLOBAL_STORE_L(7, r8); |
| HS_FM_GLOBAL_STORE_R(0, r9); |
| HS_FM_GLOBAL_STORE_R(1, r10); |
| HS_FM_GLOBAL_STORE_R(2, r11); |
| HS_FM_GLOBAL_STORE_R(3, r12); |
| } |
| |
| HS_FM_KERNEL_PROTO(0, 3) /* Variant (0, 3): loads 8 L-side keys and 8 R-side keys, applies a fixed compare-exchange sequence, then stores all 16 back. */ |
| { /* NOTE(review): every HS_* macro is defined in headers outside this chunk; the comments below describe only the register pairing visible here. Presumably HS_CMP_XCHG conditionally swaps its two arguments and "FM" stands for a flip-merge step -- confirm. */ |
| HS_FM_PREAMBLE(8); /* opaque setup macro; unlike variants (0, 0)..(0, 2) this one uses the non-OFFSET prototype and preamble -- NOTE(review): looks intentional, confirm against the launcher */ |
| HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); /* r1..r8 <- L-side slots 0..7 */ |
| HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); |
| HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); |
| HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); |
| HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); |
| HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); |
| HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); |
| HS_KEY_TYPE r9 = HS_FM_GLOBAL_LOAD_R(0); /* r9..r16 <- R-side slots 0..7 */ |
| HS_KEY_TYPE r10 = HS_FM_GLOBAL_LOAD_R(1); |
| HS_KEY_TYPE r11 = HS_FM_GLOBAL_LOAD_R(2); |
| HS_KEY_TYPE r12 = HS_FM_GLOBAL_LOAD_R(3); |
| HS_KEY_TYPE r13 = HS_FM_GLOBAL_LOAD_R(4); |
| HS_KEY_TYPE r14 = HS_FM_GLOBAL_LOAD_R(5); |
| HS_KEY_TYPE r15 = HS_FM_GLOBAL_LOAD_R(6); |
| HS_KEY_TYPE r16 = HS_FM_GLOBAL_LOAD_R(7); |
| HS_CMP_XCHG(r8, r9); /* cross pairings: every L-side register (r8 down to r1) against an R-side register (r9 up to r16) */ |
| HS_CMP_XCHG(r7, r10); |
| HS_CMP_XCHG(r6, r11); |
| HS_CMP_XCHG(r5, r12); |
| HS_CMP_XCHG(r4, r13); |
| HS_CMP_XCHG(r3, r14); |
| HS_CMP_XCHG(r2, r15); |
| HS_CMP_XCHG(r1, r16); |
| HS_CMP_XCHG(r1, r5); /* r1..r8 only: odd registers at stride 4 ... */ |
| HS_CMP_XCHG(r3, r7); |
| HS_CMP_XCHG(r1, r3); /* ... then odd registers at stride 2 */ |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r2, r6); /* even registers at stride 4 ... */ |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r2, r4); /* ... then even registers at stride 2 */ |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r1, r2); /* adjacent pairs (stride 1) within r1..r8 */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r13); /* r9..r16 only: the same stride-4 / stride-2 / stride-1 pattern as applied to r1..r8 above, starting with r9,r11,r13,r15 */ |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r10, r14); /* then r10,r12,r14,r16 at stride 4 and stride 2 */ |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r9, r10); /* adjacent pairs (stride 1) within r9..r16 */ |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_XM_GLOBAL_STORE_L(0, r1); /* stores mirror the loads: each register goes back to the slot index it was loaded from */ |
| HS_XM_GLOBAL_STORE_L(1, r2); |
| HS_XM_GLOBAL_STORE_L(2, r3); |
| HS_XM_GLOBAL_STORE_L(3, r4); |
| HS_XM_GLOBAL_STORE_L(4, r5); |
| HS_XM_GLOBAL_STORE_L(5, r6); |
| HS_XM_GLOBAL_STORE_L(6, r7); |
| HS_XM_GLOBAL_STORE_L(7, r8); |
| HS_FM_GLOBAL_STORE_R(0, r9); |
| HS_FM_GLOBAL_STORE_R(1, r10); |
| HS_FM_GLOBAL_STORE_R(2, r11); |
| HS_FM_GLOBAL_STORE_R(3, r12); |
| HS_FM_GLOBAL_STORE_R(4, r13); |
| HS_FM_GLOBAL_STORE_R(5, r14); |
| HS_FM_GLOBAL_STORE_R(6, r15); |
| HS_FM_GLOBAL_STORE_R(7, r16); |
| } |
| |
| HS_HM_KERNEL_PROTO(0) /* Variant (0): loads 16 keys through the L-side accessor, applies a fixed compare-exchange sequence at strides 8, 4, 2, 1, then stores all 16 back. No R-side accessor is used. */ |
| { /* NOTE(review): every HS_* macro is defined in headers outside this chunk; the comments below describe only the register pairing visible here. Presumably HS_CMP_XCHG conditionally swaps its two arguments and "HM" stands for a half-merge step -- confirm. */ |
| HS_HM_PREAMBLE(8); /* opaque setup macro; NOTE(review): the argument is 8 although 16 slots are loaded below -- its meaning is not visible here */ |
| HS_KEY_TYPE r1 = HS_XM_GLOBAL_LOAD_L(0); /* r1..r16 <- slots 0..15 */ |
| HS_KEY_TYPE r2 = HS_XM_GLOBAL_LOAD_L(1); |
| HS_KEY_TYPE r3 = HS_XM_GLOBAL_LOAD_L(2); |
| HS_KEY_TYPE r4 = HS_XM_GLOBAL_LOAD_L(3); |
| HS_KEY_TYPE r5 = HS_XM_GLOBAL_LOAD_L(4); |
| HS_KEY_TYPE r6 = HS_XM_GLOBAL_LOAD_L(5); |
| HS_KEY_TYPE r7 = HS_XM_GLOBAL_LOAD_L(6); |
| HS_KEY_TYPE r8 = HS_XM_GLOBAL_LOAD_L(7); |
| HS_KEY_TYPE r9 = HS_XM_GLOBAL_LOAD_L(8); |
| HS_KEY_TYPE r10 = HS_XM_GLOBAL_LOAD_L(9); |
| HS_KEY_TYPE r11 = HS_XM_GLOBAL_LOAD_L(10); |
| HS_KEY_TYPE r12 = HS_XM_GLOBAL_LOAD_L(11); |
| HS_KEY_TYPE r13 = HS_XM_GLOBAL_LOAD_L(12); |
| HS_KEY_TYPE r14 = HS_XM_GLOBAL_LOAD_L(13); |
| HS_KEY_TYPE r15 = HS_XM_GLOBAL_LOAD_L(14); |
| HS_KEY_TYPE r16 = HS_XM_GLOBAL_LOAD_L(15); |
| HS_CMP_XCHG(r1, r9); /* odd registers: r1,r5,r9,r13 at stride 8 ... */ |
| HS_CMP_XCHG(r5, r13); |
| HS_CMP_XCHG(r1, r5); /* ... then at stride 4 */ |
| HS_CMP_XCHG(r9, r13); |
| HS_CMP_XCHG(r3, r11); /* r3,r7,r11,r15 at stride 8 ... */ |
| HS_CMP_XCHG(r7, r15); |
| HS_CMP_XCHG(r3, r7); /* ... then at stride 4 */ |
| HS_CMP_XCHG(r11, r15); |
| HS_CMP_XCHG(r1, r3); /* all odd registers at stride 2 */ |
| HS_CMP_XCHG(r5, r7); |
| HS_CMP_XCHG(r9, r11); |
| HS_CMP_XCHG(r13, r15); |
| HS_CMP_XCHG(r2, r10); /* even registers: the same stride-8 / stride-4 / stride-2 pattern, starting with r2,r6,r10,r14 */ |
| HS_CMP_XCHG(r6, r14); |
| HS_CMP_XCHG(r2, r6); |
| HS_CMP_XCHG(r10, r14); |
| HS_CMP_XCHG(r4, r12); /* r4,r8,r12,r16 at stride 8, then stride 4 */ |
| HS_CMP_XCHG(r8, r16); |
| HS_CMP_XCHG(r4, r8); |
| HS_CMP_XCHG(r12, r16); |
| HS_CMP_XCHG(r2, r4); /* all even registers at stride 2 */ |
| HS_CMP_XCHG(r6, r8); |
| HS_CMP_XCHG(r10, r12); |
| HS_CMP_XCHG(r14, r16); |
| HS_CMP_XCHG(r1, r2); /* last pass: adjacent pairs (stride 1) across all 16 registers */ |
| HS_CMP_XCHG(r3, r4); |
| HS_CMP_XCHG(r5, r6); |
| HS_CMP_XCHG(r7, r8); |
| HS_CMP_XCHG(r9, r10); |
| HS_CMP_XCHG(r11, r12); |
| HS_CMP_XCHG(r13, r14); |
| HS_CMP_XCHG(r15, r16); |
| HS_XM_GLOBAL_STORE_L(0, r1); /* stores mirror the loads: each register goes back to the slot index it was loaded from */ |
| HS_XM_GLOBAL_STORE_L(1, r2); |
| HS_XM_GLOBAL_STORE_L(2, r3); |
| HS_XM_GLOBAL_STORE_L(3, r4); |
| HS_XM_GLOBAL_STORE_L(4, r5); |
| HS_XM_GLOBAL_STORE_L(5, r6); |
| HS_XM_GLOBAL_STORE_L(6, r7); |
| HS_XM_GLOBAL_STORE_L(7, r8); |
| HS_XM_GLOBAL_STORE_L(8, r9); |
| HS_XM_GLOBAL_STORE_L(9, r10); |
| HS_XM_GLOBAL_STORE_L(10, r11); |
| HS_XM_GLOBAL_STORE_L(11, r12); |
| HS_XM_GLOBAL_STORE_L(12, r13); |
| HS_XM_GLOBAL_STORE_L(13, r14); |
| HS_XM_GLOBAL_STORE_L(14, r15); |
| HS_XM_GLOBAL_STORE_L(15, r16); |
| } |
| |
| HS_TRANSPOSE_KERNEL_PROTO() /* Loads 16 keys from `vout` into r1..r16 and then invokes HS_TRANSPOSE_SLAB(). */ |
| { /* NOTE(review): every HS_* macro is defined in headers outside this chunk; what the transpose writes, and where, is not visible here. */ |
| HS_SLAB_GLOBAL_PREAMBLE(); /* opaque setup macro; presumably establishes the indexing used by HS_SLAB_GLOBAL_LOAD -- confirm */ |
| HS_KEY_TYPE r1 = HS_SLAB_GLOBAL_LOAD(vout, 0); /* r1..r16 <- slots 0..15, all read from `vout` */ |
| HS_KEY_TYPE r2 = HS_SLAB_GLOBAL_LOAD(vout, 1); |
| HS_KEY_TYPE r3 = HS_SLAB_GLOBAL_LOAD(vout, 2); |
| HS_KEY_TYPE r4 = HS_SLAB_GLOBAL_LOAD(vout, 3); |
| HS_KEY_TYPE r5 = HS_SLAB_GLOBAL_LOAD(vout, 4); |
| HS_KEY_TYPE r6 = HS_SLAB_GLOBAL_LOAD(vout, 5); |
| HS_KEY_TYPE r7 = HS_SLAB_GLOBAL_LOAD(vout, 6); |
| HS_KEY_TYPE r8 = HS_SLAB_GLOBAL_LOAD(vout, 7); |
| HS_KEY_TYPE r9 = HS_SLAB_GLOBAL_LOAD(vout, 8); |
| HS_KEY_TYPE r10 = HS_SLAB_GLOBAL_LOAD(vout, 9); |
| HS_KEY_TYPE r11 = HS_SLAB_GLOBAL_LOAD(vout, 10); |
| HS_KEY_TYPE r12 = HS_SLAB_GLOBAL_LOAD(vout, 11); |
| HS_KEY_TYPE r13 = HS_SLAB_GLOBAL_LOAD(vout, 12); |
| HS_KEY_TYPE r14 = HS_SLAB_GLOBAL_LOAD(vout, 13); |
| HS_KEY_TYPE r15 = HS_SLAB_GLOBAL_LOAD(vout, 14); |
| HS_KEY_TYPE r16 = HS_SLAB_GLOBAL_LOAD(vout, 15); |
| HS_TRANSPOSE_SLAB(); /* r1..r16 are not named again in this body; presumably this macro consumes them by name in its expansion, so the register names must not change -- confirm in the macro headers */ |
| } |
| |
| // |
| // |
| // |
| |
| #include "../../hs_cuda.inl" |
| |
| // |
| // |
| // |