/**
 * This code is released under a BSD License.
 */
#ifdef __SSE4_1__

#include "simdintegratedbitpacking.h"
/*
 * NOTE(review): the operand of this #include was missing from the source as
 * received. <smmintrin.h> is assumed because the file is guarded by
 * __SSE4_1__ and needs _mm_shuffle_epi8 plus the SSE2 integer intrinsics,
 * all of which this header provides. Confirm against the upstream file.
 */
#include <smmintrin.h>

/*
 * Four 16-byte shuffle masks, one per 32-bit lane. Mask k starts with the
 * byte indices 4k..4k+3, so shuffling with it moves lane k into lane 0.
 * The remaining bytes of each mask are 0; they only affect lanes 1..3 of the
 * shuffled result, which branchlessextract never reads. Only the first 64 of
 * the 256 bytes are explicitly initialized; the rest are zero.
 */
SIMDCOMP_ALIGNED(16) int8_t shuffle_mask_bytes[256] = {
    0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0,
    4,5,6,7,0,0,0,0,0,0,0,0,0,0,0,0,
    8,9,10,11,0,0,0,0,0,0,0,0,0,0,0,0,
    12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0,
};

/* The byte table viewed as an array of 128-bit masks. */
static const __m128i *shuffle_mask = (const __m128i *) shuffle_mask_bytes;

/*
 * Returns 32-bit lane `i` of `out` without branching on `i`.
 *
 * `i` indexes the mask table above; only 0..3 select a meaningful lane. A
 * negative `i` would read before the table, so callers must not pass one.
 */
uint32_t branchlessextract (__m128i out, int i) {
    return _mm_cvtsi128_si32(_mm_shuffle_epi8(out, shuffle_mask[i]));
}

/*
 * ret = inclusive prefix sum of the four lanes of `curr`, with the last lane
 * (lane 3) of `prev` added to every lane. `ret` may be the same variable as
 * `curr`.
 */
#define PrefixSum(ret, curr, prev) do {                                       \
    const __m128i _tmp1 = _mm_add_epi32(_mm_slli_si128((curr), 8), (curr));   \
    const __m128i _tmp2 = _mm_add_epi32(_mm_slli_si128(_tmp1, 4), _tmp1);     \
    (ret) = _mm_add_epi32(_tmp2, _mm_shuffle_epi32((prev), 0xff));            \
} while (0)

/*
 * Advances the element counter `i` by one group of four and, if `slot` lies
 * in the group just decoded, RETURNS from the enclosing function with the
 * matching lane of `out`. Wrapped in do/while so it behaves as a single
 * statement.
 */
#define CHECK_AND_INCREMENT(i, out, slot) do {                                \
    (i) += 4;                                                                 \
    if ((i) > (slot)) {                                                       \
        return branchlessextract ((out), (slot) - ((i) - 4));                 \
    }                                                                         \
} while (0)

/*
 * Shared implementation of iunpackselect1 .. iunpackselect7.
 *
 * Decodes up to 32 groups of four `bit`-wide values from `in` (one value per
 * 32-bit lane per group), turns each group into a running prefix sum seeded
 * from lane 3 of *initOffset, and stops at the group that contains `slot`.
 *
 * initOffset  in/out: the previous prefix-summed group. It is overwritten
 *             with every group decoded, so on return it holds the last group
 *             that was processed.
 * in          packed input, read with unaligned loads. A further 128-bit word
 *             is loaded only once the current one is exhausted and more
 *             groups remain, so at most `bit` words are read.
 * slot        element index to return; expected to be in 0..127. A negative
 *             value is not checked and would index before the shuffle table.
 * bit         width of each packed value; must satisfy 1 <= bit < 32.
 *
 * Returns the prefix-summed value at `slot`, or 0 if `slot` >= 128.
 *
 * The shifts use the register-count forms (_mm_srl_epi32 / _mm_sll_epi32)
 * because the shift amount is a loop variable here; for counts 0..31 they
 * give the same result as the immediate forms.
 */
static uint32_t iunpackselect_nbits(__m128i *initOffset, const __m128i *in,
                                    int slot, int bit)
{
    const __m128i mask = _mm_set1_epi32((int) ((1U << bit) - 1));
    __m128i word = _mm_loadu_si128(in);
    __m128i out;
    int shift = 0;   /* bit offset of the current value inside `word` */
    int base;        /* index of the first element of the current group */

    /* 32 groups of 4 lanes = 128 elements. */
    for (base = 0; base < 128; base += 4) {
        out = _mm_srl_epi32(word, _mm_cvtsi32_si128(shift));
        shift += bit;

        if (shift < 32) {
            /* Value lies entirely inside the current word. */
            out = _mm_and_si128(out, mask);
        } else {
            /*
             * Value reaches the top of the word: the right shift already
             * cleared everything above it, so no mask is needed for the
             * low part.
             */
            shift -= 32;
            if (base + 4 < 128) {
                /* More groups follow: move on to the next packed word. */
                ++in;
                word = _mm_loadu_si128(in);
                if (shift > 0) {
                    /* Value straddles two words: splice in its high bits. */
                    out = _mm_or_si128(out, _mm_and_si128(
                              _mm_sll_epi32(word,
                                            _mm_cvtsi32_si128(bit - shift)),
                              mask));
                }
            }
        }

        PrefixSum(out, out, *initOffset);
        *initOffset = out;

        if (base + 4 > slot) {
            return branchlessextract(out, slot - base);
        }
    }

    return (0);
}

/* Select for 1-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect1(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 1);
}

/* Select for 2-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect2(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 2);
}

/* Select for 3-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect3(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 3);
}

/* Select for 4-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect4(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 4);
}

/* Select for 5-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect5(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 5);
}

/* Select for 6-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect6(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 6);
}

/* Select for 7-bit packed values; see iunpackselect_nbits. */
static uint32_t iunpackselect7(__m128i * initOffset, const __m128i *in, int slot) {
    return iunpackselect_nbits(initOffset, in, slot, 7);
}

static uint32_t iunpackselect8(__m128i * initOffset, const __m128i *in, int slot) {
    int i = 0;
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<8)-1);

    tmp = InReg;
    out = _mm_and_si128(tmp, mask);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;
    CHECK_AND_INCREMENT(i, out, slot);

    tmp = _mm_srli_epi32(InReg,8);
    out = _mm_and_si128(tmp, mask);
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect9(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<9)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; 
InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect10(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<10)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,22); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect11(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<11)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; PrefixSum(out, out, 
*initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect12(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<12)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; 
++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * Select one value from a block of 128 fields packed at 13 bits each, where
 * every field is added onto a running total.
 *
 * The block is walked in 32 steps. Each step takes one 13-bit field from every
 * 32-bit lane of the current input word (four fields per step); a field that
 * straddles two words is stitched from the high bits of the old word and the
 * low bits of the next one. The four fields then go through PrefixSum: each
 * lane becomes the sum of the fields in its own and all lower lanes, plus the
 * top lane of *initOffset. The result is stored back into *initOffset so the
 * next step continues the sum.
 *
 * @param initOffset in/out running total; only its top lane feeds each step.
 *                   On return it holds the last group that was computed.
 * @param in         packed input; up to 13 consecutive 128-bit words are read
 *                   with unaligned loads (reading stops at the early return).
 * @param slot       index of the wanted value. i advances by 4 after each step
 *                   and the function returns as soon as i > slot.
 * @return the lane (slot minus the first index of the group) of the group that
 *         contains slot; 0 if all 32 steps complete first (slot >= 128).
 *
 * NOTE(review): a negative slot makes the first check fire with a negative
 * index into shuffle_mask; callers presumably pass 0..127 - confirm.
 */
static uint32_t iunpackselect13(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = 
_mm_set1_epi32((1U<<13)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * Select one value from a block of 128 fields packed at 14 bits each, where
 * every field is added onto a running total.
 *
 * The block is walked in 32 steps of four fields (one per 32-bit lane). A
 * field that straddles two input words is stitched from the high bits of the
 * old word and the low bits of the next one. After 16 steps (7 words) a field
 * ends exactly on a word boundary, so the second half of the body repeats the
 * shift sequence of the first half. Each group of four fields goes through
 * PrefixSum (per-lane running sum plus the top lane of *initOffset) and is
 * stored back into *initOffset.
 *
 * @param initOffset in/out running total; only its top lane feeds each step.
 *                   On return it holds the last group that was computed.
 * @param in         packed input; up to 14 consecutive 128-bit words are read
 *                   with unaligned loads (reading stops at the early return).
 * @param slot       index of the wanted value. i advances by 4 after each step
 *                   and the function returns as soon as i > slot.
 * @return the lane (slot minus the first index of the group) of the group that
 *         contains slot; 0 if all 32 steps complete first (slot >= 128).
 *
 * NOTE(review): a negative slot makes the first check fire with a negative
 * index into shuffle_mask; callers presumably pass 0..127 - confirm.
 */
static uint32_t iunpackselect14(__m128i * initOffset, const 
__m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<14)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); 
tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = 
out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * Select one value from a block of 128 fields packed at 15 bits each, where
 * every field is added onto a running total.
 *
 * The block is walked in 32 steps of four fields (one per 32-bit lane). A
 * field that straddles two input words is stitched from the high bits of the
 * old word and the low bits of the next one. Each group of four fields goes
 * through PrefixSum (per-lane running sum plus the top lane of *initOffset)
 * and is stored back into *initOffset so the next step continues the sum.
 *
 * @param initOffset in/out running total; only its top lane feeds each step.
 *                   On return it holds the last group that was computed.
 * @param in         packed input; up to 15 consecutive 128-bit words are read
 *                   with unaligned loads (reading stops at the early return).
 * @param slot       index of the wanted value. i advances by 4 after each step
 *                   and the function returns as soon as i > slot.
 * @return the lane (slot minus the first index of the group) of the group that
 *         contains slot; 0 if all 32 steps complete first (slot >= 128).
 *
 * NOTE(review): a negative slot makes the first check fire with a negative
 * index into shuffle_mask; callers presumably pass 0..127 - confirm.
 */
static uint32_t iunpackselect15(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<15)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * Select one value from a block of 128 fields packed at 16 bits each, where
 * every field is added onto a running total.
 *
 * Two 16-bit fields fit each 32-bit lane exactly, so no field straddles two
 * input words: every word yields two steps, first the low halves of its four
 * lanes and then the high halves, after which the next word is loaded. Each
 * group of four fields goes through PrefixSum (per-lane running sum plus the
 * top lane of *initOffset) and is stored back into *initOffset.
 *
 * @param initOffset in/out running total; only its top lane feeds each step.
 *                   On return it holds the last group that was computed.
 * @param in         packed input; up to 16 consecutive 128-bit words are read
 *                   with unaligned loads (reading stops at the early return).
 * @param slot       index of the wanted value. i advances by 4 after each step
 *                   and the function returns as soon as i > slot.
 * @return the lane (slot minus the first index of the group) of the group that
 *         contains slot; 0 if all 32 steps complete first (slot >= 128).
 *
 * NOTE(review): a negative slot makes the first check fire with a negative
 * index into shuffle_mask; callers presumably pass 0..127 - confirm.
 */
static uint32_t iunpackselect16(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<16)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * Select one value from a block of 128 fields packed at 17 bits each, where
 * every field is added onto a running total.
 *
 * The block is walked in 32 steps of four fields (one per 32-bit lane). A
 * field that straddles two input words is stitched from the high bits of the
 * old word and the low bits of the next one; because a field is wider than
 * half a lane, most steps load a new word. Each group of four fields goes
 * through PrefixSum (per-lane running sum plus the top lane of *initOffset)
 * and is stored back into *initOffset so the next step continues the sum.
 *
 * @param initOffset in/out running total; only its top lane feeds each step.
 *                   On return it holds the last group that was computed.
 * @param in         packed input; up to 17 consecutive 128-bit words are read
 *                   with unaligned loads (reading stops at the early return).
 * @param slot       index of the wanted value. i advances by 4 after each step
 *                   and the function returns as soon as i > slot.
 * @return the lane (slot minus the first index of the group) of the group that
 *         contains slot; 0 if all 32 steps complete first (slot >= 128).
 *
 * NOTE(review): a negative slot makes the first check fire with a negative
 * index into shuffle_mask; callers presumably pass 0..127 - confirm.
 */
static uint32_t iunpackselect17(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<17)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * Select one value from a block of 128 fields packed at 18 bits each, where
 * every field is added onto a running total.
 *
 * The block is walked in 32 steps of four fields (one per 32-bit lane). A
 * field that straddles two input words is stitched from the high bits of the
 * old word and the low bits of the next one. After 16 steps (9 words) a field
 * ends exactly on a word boundary, so the second half of the body repeats the
 * shift sequence of the first half. Each group of four fields goes through
 * PrefixSum (per-lane running sum plus the top lane of *initOffset) and is
 * stored back into *initOffset.
 *
 * @param initOffset in/out running total; only its top lane feeds each step.
 *                   On return it holds the last group that was computed.
 * @param in         packed input; up to 18 consecutive 128-bit words are read
 *                   with unaligned loads (reading stops at the early return).
 * @param slot       index of the wanted value. i advances by 4 after each step
 *                   and the function returns as soon as i > slot.
 * @return the lane (slot minus the first index of the group) of the group that
 *         contains slot; 0 if all 32 steps complete first (slot >= 128).
 *
 * NOTE(review): a negative slot makes the first check fire with a negative
 * index into shuffle_mask; callers presumably pass 0..127 - confirm.
 */
static uint32_t iunpackselect18(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = 
_mm_set1_epi32((1U<<18)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = 
_mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * iunpackselect19 - fetch one value from a block packed at 19 bits per value.
 *
 * Every 32-bit lane of the 128-bit input words is treated as its own stream
 * of 19-bit fields. Each step extracts one field per lane (4 values): either
 * shift-and-mask inside the current word, or, when a field straddles two
 * words, the high bits of the current word OR'd with the masked, left-shifted
 * low bits of the next word (loaded via ++in).
 * PrefixSum turns the 4 fields into running sums seeded with the last lane
 * of *initOffset, and the result is stored back into *initOffset.
 * CHECK_AND_INCREMENT advances i by 4 and returns lane (slot - (i - 4)) as
 * soon as i > slot, so input words needed only by later steps are not loaded.
 *
 * @param initOffset in/out running-sum state; on return it holds the last
 *                   group of 4 prefix-summed values that was produced.
 * @param in         packed input, read with unaligned loads; in[0]..in[18]
 *                   are read when all 32 steps execute.
 * @param slot       zero-based index of the wanted value; the 32 steps
 *                   cover slots 0..127.
 * @return the prefix-summed value at `slot`, or 0 when no step matched
 *         (slot >= 128).
 *
 * NOTE(review): a negative slot satisfies i > slot at the first step and
 * would index shuffle_mask with a negative value in branchlessextract;
 * presumably callers never pass one - confirm.
 */
static uint32_t iunpackselect19(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<19)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * iunpackselect20 - fetch one value from a block packed at 20 bits per value.
 *
 * Every 32-bit lane of the 128-bit input words is treated as its own stream
 * of 20-bit fields. Each step extracts one field per lane (4 values): either
 * shift-and-mask inside the current word, or, when a field straddles two
 * words, the high bits of the current word OR'd with the masked, left-shifted
 * low bits of the next word (loaded via ++in).
 * The shift sequence 0,20,8,28,16,4,24,12 repeats four times: after the
 * shift-by-12 step the field ends exactly on a word boundary, so the next
 * word is loaded without any OR and the following step starts at bit 0.
 * PrefixSum turns the 4 fields into running sums seeded with the last lane
 * of *initOffset, and the result is stored back into *initOffset.
 * CHECK_AND_INCREMENT advances i by 4 and returns lane (slot - (i - 4)) as
 * soon as i > slot, so input words needed only by later steps are not loaded.
 *
 * @param initOffset in/out running-sum state; on return it holds the last
 *                   group of 4 prefix-summed values that was produced.
 * @param in         packed input, read with unaligned loads; in[0]..in[19]
 *                   are read when all 32 steps execute.
 * @param slot       zero-based index of the wanted value; the 32 steps
 *                   cover slots 0..127.
 * @return the prefix-summed value at `slot`, or 0 when no step matched
 *         (slot >= 128).
 *
 * NOTE(review): a negative slot satisfies i > slot at the first step and
 * would index shuffle_mask with a negative value in branchlessextract;
 * presumably callers never pass one - confirm.
 */
static uint32_t iunpackselect20(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<20)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; 
++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * iunpackselect21 - fetch one value from a block packed at 21 bits per value.
 *
 * Every 32-bit lane of the 128-bit input words is treated as its own stream
 * of 21-bit fields. Each step extracts one field per lane (4 values): either
 * shift-and-mask inside the current word, or, when a field straddles two
 * words, the high bits of the current word OR'd with the masked, left-shifted
 * low bits of the next word (loaded via ++in).
 * PrefixSum turns the 4 fields into running sums seeded with the last lane
 * of *initOffset, and the result is stored back into *initOffset.
 * CHECK_AND_INCREMENT advances i by 4 and returns lane (slot - (i - 4)) as
 * soon as i > slot, so input words needed only by later steps are not loaded.
 *
 * @param initOffset in/out running-sum state; on return it holds the last
 *                   group of 4 prefix-summed values that was produced.
 * @param in         packed input, read with unaligned loads; in[0]..in[20]
 *                   are read when all 32 steps execute.
 * @param slot       zero-based index of the wanted value; the 32 steps
 *                   cover slots 0..127.
 * @return the prefix-summed value at `slot`, or 0 when no step matched
 *         (slot >= 128).
 *
 * NOTE(review): a negative slot satisfies i > slot at the first step and
 * would index shuffle_mask with a negative value in branchlessextract;
 * presumably callers never pass one - confirm.
 */
static uint32_t iunpackselect21(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<21)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, 
mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); 
tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * iunpackselect22 - fetch one value from a block packed at 22 bits per value.
 *
 * Every 32-bit lane of the 128-bit input words is treated as its own stream
 * of 22-bit fields. Each step extracts one field per lane (4 values): either
 * shift-and-mask inside the current word, or, when a field straddles two
 * words, the high bits of the current word OR'd with the masked, left-shifted
 * low bits of the next word (loaded via ++in).
 * The 16-step shift sequence runs twice: after the shift-by-10 step the
 * field ends exactly on a word boundary, so the next word is loaded without
 * any OR and the following step starts again at bit 0.
 * PrefixSum turns the 4 fields into running sums seeded with the last lane
 * of *initOffset, and the result is stored back into *initOffset.
 * CHECK_AND_INCREMENT advances i by 4 and returns lane (slot - (i - 4)) as
 * soon as i > slot, so input words needed only by later steps are not loaded.
 *
 * @param initOffset in/out running-sum state; on return it holds the last
 *                   group of 4 prefix-summed values that was produced.
 * @param in         packed input, read with unaligned loads; in[0]..in[21]
 *                   are read when all 32 steps execute.
 * @param slot       zero-based index of the wanted value; the 32 steps
 *                   cover slots 0..127.
 * @return the prefix-summed value at `slot`, or 0 when no step matched
 *         (slot >= 128).
 *
 * NOTE(review): a negative slot satisfies i > slot at the first step and
 * would index shuffle_mask with a negative value in branchlessextract;
 * presumably callers never pass one - confirm.
 */
static uint32_t iunpackselect22(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<22)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); 
out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); }
/**
 * iunpackselect23 - fetch one value from a block packed at 23 bits per value.
 *
 * Every 32-bit lane of the 128-bit input words is treated as its own stream
 * of 23-bit fields. Each step extracts one field per lane (4 values): either
 * shift-and-mask inside the current word, or, when a field straddles two
 * words, the high bits of the current word OR'd with the masked, left-shifted
 * low bits of the next word (loaded via ++in).
 * PrefixSum turns the 4 fields into running sums seeded with the last lane
 * of *initOffset, and the result is stored back into *initOffset.
 * CHECK_AND_INCREMENT advances i by 4 and returns lane (slot - (i - 4)) as
 * soon as i > slot, so input words needed only by later steps are not loaded.
 *
 * @param initOffset in/out running-sum state; on return it holds the last
 *                   group of 4 prefix-summed values that was produced.
 * @param in         packed input, read with unaligned loads; in[0]..in[22]
 *                   are read when all 32 steps execute.
 * @param slot       zero-based index of the wanted value; the 32 steps
 *                   cover slots 0..127.
 * @return the prefix-summed value at `slot`, or 0 when no step matched
 *         (slot >= 128).
 *
 * NOTE(review): a negative slot satisfies i > slot at the first step and
 * would index shuffle_mask with a negative value in branchlessextract;
 * presumably callers never pass one - confirm.
 */
static uint32_t iunpackselect23(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<23)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset 
= out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect24(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<24)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); 
tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect25(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<25)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect26(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i 
mask = _mm_set1_epi32((1U<<26)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, 
out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect27(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<27)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect28(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<28)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out 
= tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; 
InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); 
tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect29(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<29)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect30(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<30)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = 
_mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect31(__m128i * initOffset, const __m128i *in, int slot) { int i = 0; __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<31)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-30), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-29), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 31-27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-19), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,7); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,5); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,3); out = tmp; 
++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,2); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); tmp = _mm_srli_epi32(InReg,1); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; CHECK_AND_INCREMENT(i, out, slot); return (0); } static uint32_t iunpackselect32(__m128i * initOffset , const __m128i *in, int slot) { uint32_t *begin = (uint32_t *)in; *initOffset = _mm_load_si128(in + 31); return begin[slot]; } uint32_t simdselectd1(uint32_t init, const __m128i *in, uint32_t bit, int slot) { __m128i vecinitOffset = _mm_set1_epi32(init); __m128i * initOffset = &vecinitOffset; slot &= 127; /* to avoid problems */ switch (bit) { case 0: return _mm_extract_epi32(*initOffset,3); break; case 1: return iunpackselect1(initOffset, in, slot); break; case 2: return iunpackselect2(initOffset, in, slot); break; case 3: return iunpackselect3(initOffset, in, slot); break; case 4: return iunpackselect4(initOffset, in, slot); break; case 5: return iunpackselect5(initOffset, in, slot); break; case 6: return iunpackselect6(initOffset, in, slot); break; case 7: return iunpackselect7(initOffset, in, slot); break; case 8: return iunpackselect8(initOffset, in, slot); break; case 9: return iunpackselect9(initOffset, in, slot); break; case 10: return iunpackselect10(initOffset, in, slot); break; case 11: return iunpackselect11(initOffset, in, slot); break; case 12: return iunpackselect12(initOffset, in, slot); break; case 13: return iunpackselect13(initOffset, in, slot); break; case 14: return iunpackselect14(initOffset, in, slot); break; case 15: return iunpackselect15(initOffset, in, slot); break; case 16: return iunpackselect16(initOffset, in, 
slot); break; case 17: return iunpackselect17(initOffset, in, slot); break; case 18: return iunpackselect18(initOffset, in, slot); break; case 19: return iunpackselect19(initOffset, in, slot); break; case 20: return iunpackselect20(initOffset, in, slot); break; case 21: return iunpackselect21(initOffset, in, slot); break; case 22: return iunpackselect22(initOffset, in, slot); break; case 23: return iunpackselect23(initOffset, in, slot); break; case 24: return iunpackselect24(initOffset, in, slot); break; case 25: return iunpackselect25(initOffset, in, slot); break; case 26: return iunpackselect26(initOffset, in, slot); break; case 27: return iunpackselect27(initOffset, in, slot); break; case 28: return iunpackselect28(initOffset, in, slot); break; case 29: return iunpackselect29(initOffset, in, slot); break; case 30: return iunpackselect30(initOffset, in, slot); break; case 31: return iunpackselect31(initOffset, in, slot); break; case 32: return iunpackselect32(initOffset, in, slot); break; default: break; } return (-1); } static void iunpackscan1(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<1)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset 
= out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,29); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,30); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,31); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 2-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Two 128-bit vectors are read from `in` (unaligned loads). Each 32-bit lane
 * is unpacked on its own, low bits first, sixteen fields per lane and vector,
 * which yields 32 four-lane vectors in total. Every one of them is passed to
 * PrefixSum() (defined earlier in this file) together with *initOffset, and
 * the result is written back to *initOffset.
 *
 * The topmost field of a lane is used unmasked: the logical right shift has
 * already cleared every bit above it.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; two consecutive vectors are read.
 */
static void iunpackscan2(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<2)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* last field of vector 0; vector 1 is loaded before that field is summed */
    tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,30); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 3-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Three 128-bit vectors are read from `in` (unaligned loads). Each 32-bit
 * lane is unpacked on its own, low bits first, which yields 32 four-lane
 * vectors in total. Every one of them is passed to PrefixSum() (defined
 * earlier in this file) together with *initOffset, and the result is written
 * back to *initOffset.
 *
 * A field that does not fit in the remaining bits of a lane continues in the
 * same lane of the next input vector: its low bits are the top bits of the
 * current vector and its high bits are OR-ed in from the bottom of the next.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; three consecutive vectors are read.
 */
static void iunpackscan3(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<3)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
tmp = _mm_srli_epi32(InReg,27); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 0 and 1 (counted in load order): low 2 bits here, high 1 bit from the next load */
    tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,28); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 1 and 2: low 1 bit here, high 2 bits from the next load */
    tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 3-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* topmost field: the shift alone isolates it, so no mask is applied */
    tmp = _mm_srli_epi32(InReg,29); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 4-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Four 128-bit vectors are read from `in` (unaligned loads). Each 32-bit lane
 * is unpacked on its own, low bits first, eight fields per lane and vector,
 * which yields 32 four-lane vectors in total. Every one of them is passed to
 * PrefixSum() (defined earlier in this file) together with *initOffset, and
 * the result is written back to *initOffset.
 *
 * The topmost field of a lane is used unmasked: the logical right shift has
 * already cleared every bit above it.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; four consecutive vectors are read.
 */
static void iunpackscan4(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<4)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* last field of vector 0; vector 1 is loaded before that field is summed */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* last field of vector 1; vector 2 is loaded before that field is summed */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* last field of vector 2; vector 3 is loaded before that field is summed */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,28); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 5-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Five 128-bit vectors are read from `in` (unaligned loads). Each 32-bit lane
 * is unpacked on its own, low bits first, which yields 32 four-lane vectors
 * in total. Every one of them is passed to PrefixSum() (defined earlier in
 * this file) together with *initOffset, and the result is written back to
 * *initOffset.
 *
 * A field that does not fit in the remaining bits of a lane continues in the
 * same lane of the next input vector: its low bits are the top bits of the
 * current vector and its high bits are OR-ed in from the bottom of the next.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; five consecutive vectors are read.
 */
static void iunpackscan5(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<5)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,25); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 0 and 1: low 2 bits here, high 3 bits from the next load */
    tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset
= out;
    tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 1 and 2 (counted in load order): low 4 bits here, high 1 bit from the next load */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,26); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 2 and 3: low 1 bit here, high 4 bits from the next load */
    tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 3 and 4: low 3 bits here, high 2 bits from the next load */
    tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 5-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* topmost field: the shift alone isolates it, so no mask is applied */
    tmp = _mm_srli_epi32(InReg,27); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 6-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Six 128-bit vectors are read from `in` (unaligned loads). Each 32-bit lane
 * is unpacked on its own, low bits first, which yields 32 four-lane vectors
 * in total. Every one of them is passed to PrefixSum() (defined earlier in
 * this file) together with *initOffset, and the result is written back to
 * *initOffset. The shift pattern repeats after three input vectors.
 *
 * A field that does not fit in the remaining bits of a lane continues in the
 * same lane of the next input vector: its low bits are the top bits of the
 * current vector and its high bits are OR-ed in from the bottom of the next.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; six consecutive vectors are read.
 */
static void iunpackscan6(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<6)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 0 and 1: low 2 bits here, high 4 bits from the next load */
    tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 1 and 2: low 4 bits here, high 2 bits from the next load */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* last field of vector 2 ends on the lane boundary; vector 3 is loaded before that field is summed */
    tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 3 and 4: low 2 bits here, high 4 bits from the next load */
    tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 4 and 5: low 4 bits here, high 2 bits from the next load */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 6-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,26); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 7-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Seven 128-bit vectors are read from `in` (unaligned loads). Each 32-bit
 * lane is unpacked on its own, low bits first, which yields 32 four-lane
 * vectors in total. Every one of them is passed to PrefixSum() (defined
 * earlier in this file) together with *initOffset, and the result is written
 * back to *initOffset.
 *
 * A field that does not fit in the remaining bits of a lane continues in the
 * same lane of the next input vector: its low bits are the top bits of the
 * current vector and its high bits are OR-ed in from the bottom of the next.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; seven consecutive vectors are read.
 */
static void iunpackscan7(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<7)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 0 and 1: low 4 bits here, high 3 bits from the next load */
    tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 1 and 2 (counted in load order): low 1 bit here, high 6 bits from the next load */
    tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 2 and 3: low 5 bits here, high 2 bits from the next load */
    tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,23); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 3 and 4: low 2 bits here, high 5 bits from the next load */
    tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 4 and 5: low 6 bits here, high 1 bit from the next load */
    tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 5 and 6: low 3 bits here, high 4 bits from the next load */
    tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 7-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* topmost field: the shift alone isolates it, so no mask is applied */
    tmp = _mm_srli_epi32(InReg,25); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 8-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Eight 128-bit vectors are read from `in` (unaligned loads). Each 32-bit
 * lane is unpacked on its own, low bits first, four fields per lane and
 * vector, which yields 32 four-lane vectors in total. Every one of them is
 * passed to PrefixSum() (defined earlier in this file) together with
 * *initOffset, and the result is written back to *initOffset.
 *
 * The topmost field of a lane is used unmasked: the logical right shift has
 * already cleared every bit above it. After each such field except the last,
 * the next input vector is loaded before the field is summed.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; eight consecutive vectors are read.
 */
static void iunpackscan8(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<8)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 1 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 2 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 3 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 4 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 5 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 6 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* input vector 7 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,24); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out;
}

/**
 * Folds one block of 9-bit packed integers into the running prefix sum
 * without storing any decoded value.
 *
 * Nine 128-bit vectors are read from `in` (unaligned loads). Each 32-bit lane
 * is unpacked on its own, low bits first, which yields 32 four-lane vectors
 * in total. Every one of them is passed to PrefixSum() (defined earlier in
 * this file) together with *initOffset, and the result is written back to
 * *initOffset.
 *
 * A field that does not fit in the remaining bits of a lane continues in the
 * same lane of the next input vector: its low bits are the top bits of the
 * current vector and its high bits are OR-ed in from the bottom of the next.
 *
 * @param initOffset in/out running sum; on return it holds the PrefixSum()
 *                   result for the last unpacked vector.
 * @param in         packed input; nine consecutive vectors are read.
 */
static void iunpackscan9(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    __m128i tmp;
    __m128i mask = _mm_set1_epi32((1U<<9)-1);
    /* input vector 0 */
    tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out;
    /* field spans input vectors 0 and 1: low 5 bits here, high 4 bits from the next load */
    tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg =
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 9-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; PrefixSum(out, out, 
*initOffset); *initOffset = out; } static void iunpackscan10(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<10)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
10-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out 
= _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 10-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan11: scans a block packed at 11 bits per value and keeps only the running sum. Reads 11 consecutive 128-bit vectors from `in` (unaligned loads) and isolates 32 vectors of 11-bit fields, one field per 32-bit lane, by shift and mask. When a field runs past bit 31, its low part is taken from the top bits of the current vector and its remaining high bits from the bottom of the next vector (shifted left, masked, OR-ed in). Each extracted vector is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. NOTE(review): presumably a skip-ahead helper for delta-coded data - confirm with callers. */static void iunpackscan11(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<11)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 11-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 11-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan12: scans a block packed at 12 bits per value and keeps only the running sum. Reads 12 consecutive 128-bit vectors from `in` (unaligned loads) and isolates 32 vectors of 12-bit fields, one field per 32-bit lane. The shift schedule 0,12,24 / 4,16,28 / 8,20 covers three input vectors and is repeated four times; a field that starts at bit 24 or 28 is completed with the low bits of the next vector (shifted left, masked, OR-ed in), while the field at bit 20 ends exactly at bit 31 and needs no carry. Each extracted vector is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. */static void iunpackscan12(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<12)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset 
= out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 12-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan13: scans a block packed at 13 bits per value and keeps only the running sum. Reads 13 consecutive 128-bit vectors from `in` (unaligned loads) and isolates 32 vectors of 13-bit fields, one field per 32-bit lane, by shift and mask. When a field runs past bit 31, its low part is taken from the top bits of the current vector and its remaining high bits from the bottom of the next vector (shifted left, masked, OR-ed in). Each extracted vector is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. */static void 
iunpackscan13(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<13)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 13-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan14: scans a block packed at 14 bits per value and keeps only the running sum. Reads 14 consecutive 128-bit vectors from `in` (unaligned loads) and isolates 32 vectors of 14-bit fields, one field per 32-bit lane. The shift schedule spans seven input vectors and is executed twice; a field that runs past bit 31 takes its low part from the top bits of the current vector and its remaining high bits from the bottom of the next vector (shifted left, masked, OR-ed in), while the field at bit 18 ends exactly at bit 31 and needs no carry. Each extracted vector is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. */static void iunpackscan14(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<14)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset 
= out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); 
out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 14-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan15: scans a block packed at 15 bits per value and keeps only the running sum. Reads 15 consecutive 128-bit vectors from `in` (unaligned loads) and isolates 32 vectors of 15-bit fields, one field per 32-bit lane, by shift and mask. When a field runs past bit 31, its low part is taken from the top bits of the current vector and its remaining high bits from the bottom of the next vector (shifted left, masked, OR-ed in). Each extracted vector is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. */static void iunpackscan15(__m128i * initOffset, const __m128i *in) { __m128i InReg = 
_mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<15)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); 
out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 15-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan16: scans a block packed at 16 bits per value and keeps only the running sum. Every 32-bit lane holds exactly two 16-bit fields, so no field crosses an input vector: for each vector loaded from `in` (unaligned loads, advancing one vector at a time) the low halves are isolated with the mask and the high halves with a right shift by 16. Each of the two resulting vectors is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. */static void iunpackscan16(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<16)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); 
*initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out 
= _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } /* iunpackscan17: scans a block packed at 17 bits per value and keeps only the running sum. Reads 17 consecutive 128-bit vectors from `in` (unaligned loads) and isolates 32 vectors of 17-bit fields, one field per 32-bit lane, by shift and mask. Only one whole field fits in a 32-bit lane, so most input vectors contribute one complete field plus the pieces of its neighbours: the low part of a split field is taken from the top bits of the current vector and its remaining high bits from the bottom of the next vector (shifted left, masked, OR-ed in). Each extracted vector is passed through PrefixSum (defined earlier in this file) with the current value of initOffset as the previous state, and the result is written back through initOffset; on return it holds the prefix-summed last vector. No decoded values are stored anywhere else. */static void iunpackscan17(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<17)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 17-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 17-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan18(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<18)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); 
PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 18-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan19(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<19)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-6), mask)); PrefixSum(out, out, *initOffset); 
*initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-10), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 19-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 19-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan20(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<20)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 
20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, 
mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 20-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan21(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<21)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, 
*initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-4), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 21-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = tmp; 
PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan22(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<22)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-4), mask)); PrefixSum(out, out, 
*initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 22-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan23(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<23)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp 
= _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; 
tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 23-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan24(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<24)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); 
PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out 
= _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 24-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan25(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<25)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 25-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 25-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan26(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<26)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); PrefixSum(out, 
out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, 
mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 26-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan27(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<27)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 27-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = 
_mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); 
out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 27-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan28(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<28)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = 
out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = 
_mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 28-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; } static void iunpackscan29(__m128i * initOffset, const __m128i *in) { __m128i InReg = _mm_loadu_si128(in); __m128i out; __m128i tmp; __m128i mask = _mm_set1_epi32((1U<<29)-1); tmp = InReg; out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,29); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-26), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,26); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-23), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,23); out = tmp; ++in; InReg = 
_mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-20), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,20); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-17), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,17); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-14), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,14); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-11), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,11); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-8), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,8); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-5), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,5); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-2), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,2); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,31); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-28), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,28); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-25), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,25); out = tmp; ++in; InReg = _mm_loadu_si128(in); out 
= _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-22), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,22); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-19), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,19); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-16), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,16); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-13), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,13); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-10), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,10); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-7), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,7); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-4), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,4); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-1), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,1); out = _mm_and_si128(tmp, mask); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,30); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-27), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,27); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, 
_mm_and_si128(_mm_slli_epi32(InReg, 29-24), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,24); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-21), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,21); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-18), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,18); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-15), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,15); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-12), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,12); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-9), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,9); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-6), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,6); out = tmp; ++in; InReg = _mm_loadu_si128(in); out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 29-3), mask)); PrefixSum(out, out, *initOffset); *initOffset = out; tmp = _mm_srli_epi32(InReg,3); out = tmp; PrefixSum(out, out, *initOffset); *initOffset = out; }

/**
 * Scan a block packed with 30 bits per value, updating only the running
 * prefix-sum state.
 *
 * Reads 30 consecutive 128-bit words starting at `in` (unaligned loads) and
 * rebuilds 32 vectors of 30-bit fields, one field per 32-bit lane.  Each
 * rebuilt vector is fed to PrefixSum together with the previous result, and
 * the result is written back to *initOffset; nothing else is stored.
 *
 * A field that straddles two input words is assembled as
 *   (current >> s) | ((next << (30 - k)) & mask)
 * where k is the number of bits the field takes from the next word.
 *
 * @param initOffset  in: vector whose highest lane seeds the prefix sum;
 *                    out: the last (32nd) prefix-summed vector.
 * @param in          packed input, at least 30 __m128i words.
 */
static void iunpackscan30(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    const __m128i mask = _mm_set1_epi32((1U << 30) - 1);

    /* output 0 */
    out = _mm_and_si128(InReg, mask);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 1 */
    out = _mm_srli_epi32(InReg, 30);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 2 */
    out = _mm_srli_epi32(InReg, 28);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 3 */
    out = _mm_srli_epi32(InReg, 26);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 4 */
    out = _mm_srli_epi32(InReg, 24);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 5 */
    out = _mm_srli_epi32(InReg, 22);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 6 */
    out = _mm_srli_epi32(InReg, 20);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 7 */
    out = _mm_srli_epi32(InReg, 18);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 8 */
    out = _mm_srli_epi32(InReg, 16);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 9 */
    out = _mm_srli_epi32(InReg, 14);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 10 */
    out = _mm_srli_epi32(InReg, 12);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 11 */
    out = _mm_srli_epi32(InReg, 10);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 12 */
    out = _mm_srli_epi32(InReg, 8);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 13 */
    out = _mm_srli_epi32(InReg, 6);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 14 */
    out = _mm_srli_epi32(InReg, 4);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 15: the remaining 30 bits of the current word; the next word is
     * fetched here because the second half of the block starts on it. */
    out = _mm_srli_epi32(InReg, 2);
    ++in;
    InReg = _mm_loadu_si128(in);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 16 */
    out = _mm_and_si128(InReg, mask);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 17 */
    out = _mm_srli_epi32(InReg, 30);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 28), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 18 */
    out = _mm_srli_epi32(InReg, 28);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 26), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 19 */
    out = _mm_srli_epi32(InReg, 26);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 24), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 20 */
    out = _mm_srli_epi32(InReg, 24);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 22), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 21 */
    out = _mm_srli_epi32(InReg, 22);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 20), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 22 */
    out = _mm_srli_epi32(InReg, 20);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 18), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 23 */
    out = _mm_srli_epi32(InReg, 18);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 16), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 24 */
    out = _mm_srli_epi32(InReg, 16);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 14), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 25 */
    out = _mm_srli_epi32(InReg, 14);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 12), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 26 */
    out = _mm_srli_epi32(InReg, 12);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 10), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 27 */
    out = _mm_srli_epi32(InReg, 10);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 8), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 28 */
    out = _mm_srli_epi32(InReg, 8);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 6), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 29 */
    out = _mm_srli_epi32(InReg, 6);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 4), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 30 */
    out = _mm_srli_epi32(InReg, 4);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 30 - 2), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 31: last field, entirely inside the final word */
    out = _mm_srli_epi32(InReg, 2);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;
}

/**
 * Scan a block packed with 31 bits per value, updating only the running
 * prefix-sum state.
 *
 * Reads 31 consecutive 128-bit words starting at `in` (unaligned loads) and
 * rebuilds 32 vectors of 31-bit fields, one field per 32-bit lane.  Each
 * rebuilt vector is fed to PrefixSum together with the previous result, and
 * the result is written back to *initOffset; nothing else is stored.
 *
 * A field that straddles two input words is assembled as
 *   (current >> s) | ((next << (31 - k)) & mask)
 * where k is the number of bits the field takes from the next word.
 *
 * @param initOffset  in: vector whose highest lane seeds the prefix sum;
 *                    out: the last (32nd) prefix-summed vector.
 * @param in          packed input, at least 31 __m128i words.
 */
static void iunpackscan31(__m128i * initOffset, const __m128i *in) {
    __m128i InReg = _mm_loadu_si128(in);
    __m128i out;
    const __m128i mask = _mm_set1_epi32((1U << 31) - 1);

    /* output 0 */
    out = _mm_and_si128(InReg, mask);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 1 */
    out = _mm_srli_epi32(InReg, 31);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 30), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 2 */
    out = _mm_srli_epi32(InReg, 30);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 29), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 3 */
    out = _mm_srli_epi32(InReg, 29);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 28), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 4 */
    out = _mm_srli_epi32(InReg, 28);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 27), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 5 */
    out = _mm_srli_epi32(InReg, 27);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 26), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 6 */
    out = _mm_srli_epi32(InReg, 26);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 25), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 7 */
    out = _mm_srli_epi32(InReg, 25);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 24), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 8 */
    out = _mm_srli_epi32(InReg, 24);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 23), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 9 */
    out = _mm_srli_epi32(InReg, 23);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 22), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 10 */
    out = _mm_srli_epi32(InReg, 22);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 21), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 11 */
    out = _mm_srli_epi32(InReg, 21);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 20), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 12 */
    out = _mm_srli_epi32(InReg, 20);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 19), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 13 */
    out = _mm_srli_epi32(InReg, 19);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 18), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 14 */
    out = _mm_srli_epi32(InReg, 18);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 17), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 15 */
    out = _mm_srli_epi32(InReg, 17);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 16), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 16 */
    out = _mm_srli_epi32(InReg, 16);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 15), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 17 */
    out = _mm_srli_epi32(InReg, 15);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 14), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 18 */
    out = _mm_srli_epi32(InReg, 14);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 13), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 19 */
    out = _mm_srli_epi32(InReg, 13);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 12), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 20 */
    out = _mm_srli_epi32(InReg, 12);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 11), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 21 */
    out = _mm_srli_epi32(InReg, 11);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 10), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 22 */
    out = _mm_srli_epi32(InReg, 10);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 9), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 23 */
    out = _mm_srli_epi32(InReg, 9);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 8), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 24 */
    out = _mm_srli_epi32(InReg, 8);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 7), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 25 */
    out = _mm_srli_epi32(InReg, 7);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 6), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 26 */
    out = _mm_srli_epi32(InReg, 6);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 5), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 27 */
    out = _mm_srli_epi32(InReg, 5);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 4), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 28 */
    out = _mm_srli_epi32(InReg, 4);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 3), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 29 */
    out = _mm_srli_epi32(InReg, 3);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 2), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 30 */
    out = _mm_srli_epi32(InReg, 2);
    ++in;
    InReg = _mm_loadu_si128(in);
    out = _mm_or_si128(out, _mm_and_si128(_mm_slli_epi32(InReg, 31 - 1), mask));
    PrefixSum(out, out, *initOffset);
    *initOffset = out;

    /* output 31: last field, entirely inside the final word */
    out = _mm_srli_epi32(InReg, 1);
    PrefixSum(out, out, *initOffset);
    *initOffset = out;
}

/**
 * Scan a block packed with 32 bits per value.
 *
 * No prefix sum is computed here: the 32nd input word (in[31]) is copied
 * straight into *initOffset.
 * NOTE(review): this presumably relies on 32-bit blocks storing the values
 * themselves rather than deltas -- confirm against the matching packer.
 *
 * @param initOffset  out: receives in[31]; its previous content is ignored.
 * @param in          packed input, at least 32 __m128i words.
 */
static void iunpackscan32(__m128i * initOffset, const __m128i *in) {
    /* Unaligned load, matching the other scan routines visible here, so that
     * an `in` pointer that is not 16-byte aligned does not fault. */
    *initOffset = _mm_loadu_si128(in + 31);
}

/**
 * Advance the running prefix-sum state over one packed block without
 * storing the decoded values, dispatching on the packing bit width.
 *
 * @param initOffset  in/out running state, updated by the selected
 *                    iunpackscanN routine.
 * @param in          packed input block.
 * @param bit         bit width, 0..32.  0 leaves *initOffset untouched;
 *                    any value above 32 is silently ignored.
 */
void simdscand1(__m128i * initOffset, const __m128i *in, uint32_t bit) {
    switch (bit) {
    case 0:
        return;
    case 1:  iunpackscan1(initOffset, in);  break;
    case 2:  iunpackscan2(initOffset, in);  break;
    case 3:  iunpackscan3(initOffset, in);  break;
    case 4:  iunpackscan4(initOffset, in);  break;
    case 5:  iunpackscan5(initOffset, in);  break;
    case 6:  iunpackscan6(initOffset, in);  break;
    case 7:  iunpackscan7(initOffset, in);  break;
    case 8:  iunpackscan8(initOffset, in);  break;
    case 9:  iunpackscan9(initOffset, in);  break;
    case 10: iunpackscan10(initOffset, in); break;
    case 11: iunpackscan11(initOffset, in); break;
    case 12: iunpackscan12(initOffset, in); break;
    case 13: iunpackscan13(initOffset, in); break;
    case 14: iunpackscan14(initOffset, in); break;
    case 15: iunpackscan15(initOffset, in); break;
    case 16: iunpackscan16(initOffset, in); break;
    case 17: iunpackscan17(initOffset, in); break;
    case 18: iunpackscan18(initOffset, in); break;
    case 19: iunpackscan19(initOffset, in); break;
    case 20: iunpackscan20(initOffset, in); break;
    case 21: iunpackscan21(initOffset, in); break;
    case 22: iunpackscan22(initOffset, in); break;
    case 23: iunpackscan23(initOffset, in); break;
    case 24: iunpackscan24(initOffset, in); break;
    case 25: iunpackscan25(initOffset, in); break;
    case 26: iunpackscan26(initOffset, in); break;
    case 27: iunpackscan27(initOffset, in); break;
    case 28: iunpackscan28(initOffset, in); break;
    case 29: iunpackscan29(initOffset, in); break;
    case 30: iunpackscan30(initOffset, in); break;
    case 31: iunpackscan31(initOffset, in); break;
    case 32: iunpackscan32(initOffset, in); break;
    default:
        break;
    }
    return;
}

#endif